class AgentDDPG:
    """Actor-critic (DDPG-style) agent with target networks and replay memory.

    The agent owns a local/target pair for both the actor (policy) and the
    critic (Q-function), an ``OUNoise`` exploration process and a
    ``ReplayBuffer``. Transitions are recorded through :meth:`step`, which
    also triggers :meth:`learn` every ``update_every`` calls once the buffer
    holds more than ``batch_size`` entries.

    The class relies on the module-level names ``device``, ``Actor``,
    ``Critic``, ``OUNoise`` and ``ReplayBuffer``.
    """

    def __init__(self, state_size, action_size, seed):
        """
        :state_size: size of the state vector
        :action_size: size of the action vector
        :seed: seed forwarded to every Actor/Critic network
        """
        self.state_size = state_size
        self.action_size = action_size
        self.t_step = 0
        self.score = 0.0
        self.best = 0.0
        self.seed = seed
        self.total_reward = 0.0
        self.count = 0
        self.learning_rate_actor = 0.0001
        self.learning_rate_critic = 0.001
        self.batch_size = 128
        self.update_every = 1

        # Policy (actor): local network is trained, target network is only
        # moved by soft updates.
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.seed).to(device)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.seed).to(device)

        # Value function (critic): same local/target arrangement.
        self.critic_local = Critic(self.state_size, self.action_size,
                                   self.seed).to(device)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    self.seed).to(device)

        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.learning_rate_actor)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.learning_rate_critic)

        # Make sure local and target start with the same weights.
        self.actor_target.load_state_dict(self.actor_local.state_dict())
        self.critic_target.load_state_dict(self.critic_local.state_dict())

        # Exploration noise process added to the actions in act().
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory.
        self.buffer_size = 1000000
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters.
        self.gamma = 0.99  # Discount factor
        self.tau = 0.001  # Soft-update rate for the target parameters

    def step(self, state, action, reward, next_state, done):
        """Record one environment transition and learn when it is time to.

        Adds ``reward`` to the running episode total, stores the transition
        in the replay memory and, every ``update_every`` calls, trains on a
        sampled batch provided the memory holds more than ``batch_size``
        entries.
        """
        self.total_reward += reward
        self.count += 1
        self.memory.add(state, action, reward, next_state, done)

        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0 and len(self.memory) > self.batch_size:
            self.learn(self.memory.sample())

    def act(self, state):
        """Return the noisy action recommended by the local actor.

        :state: NumPy array holding a single state
        :return: list with ``action_size`` entries (policy output plus one
                 sample of the exploration noise)
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # Inference only: evaluation mode and no gradient tracking.
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(state)
        self.actor_local.train()

        # .cpu() is required before .numpy() when ``device`` is a GPU; the
        # output is flattened to action_size instead of a fixed length.
        greedy = actions.detach().cpu().numpy().reshape(self.action_size)
        return list(greedy + self.noise.sample())

    def learn(self, experiences):
        """Update the local networks from a batch of replayed experiences.

        :experiences: iterable of records exposing ``state``, ``action``,
                      ``reward``, ``next_state`` and ``done``; ``None``
                      entries are ignored

        Steps: build one-step TD targets with the target networks, fit the
        local critic to them, improve the local actor through the critic,
        then soft-update both target networks.
        """
        batch = [e for e in experiences if e is not None]

        # Column-wise views of the batch.
        states = np.vstack([e.state for e in batch])
        actions = np.array([e.action for e in batch]).astype(
            np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in batch]).astype(
            np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in batch]).astype(
            np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in batch])

        # NOTE(review): unsqueeze(0) puts a leading axis of size 1 in front
        # of the batch axis. It is kept because the Actor/Critic definitions
        # are not visible here - confirm the networks expect this layout.
        states = torch.from_numpy(states).float().unsqueeze(0).to(device)
        actions = torch.from_numpy(actions).float().unsqueeze(0).to(device)
        next_states = torch.from_numpy(next_states).float().unsqueeze(0).to(
            device)
        # Rewards and termination flags live on the same device as the
        # network outputs so the targets never round-trip through NumPy.
        rewards = torch.from_numpy(rewards).float().to(device)
        not_done = 1.0 - torch.from_numpy(dones).float().to(device)

        # --- TD targets (no gradients; target nets are only soft-updated) ---
        self.actor_target.eval()
        self.critic_target.eval()
        with torch.no_grad():
            next_state_actions = self.actor_target(next_states)
            q_next = self.critic_target(next_states, next_state_actions)
            # Terminal transitions do not bootstrap from the next state.
            q_targets = rewards + self.gamma * q_next * not_done
        self.actor_target.train()
        # Both target networks are returned to train mode.
        self.critic_target.train()

        # --- Optimize the local critic ---
        q_expected = self.critic_local(states, actions)
        self.critic_optimizer.zero_grad()
        # Smooth L1 (Huber-like) loss between predicted and target Q-values.
        critic_loss = F.smooth_l1_loss(q_expected, q_targets)
        # The critic graph is not reused afterwards, so it is not retained.
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # --- Optimize the local actor ---
        actor_actions = self.actor_local(states)
        # Maximise the critic's value of the actor's own actions.
        loss_actor = -1 * torch.sum(self.critic_local(states, actor_actions))
        self.actor_optimizer.zero_grad()
        loss_actor.backward()
        self.actor_optimizer.step()

        # --- Soft-update the target networks ---
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

    def soft_update(self, local_model, target_model):
        """Blend local weights into the target: t = tau*l + (1 - tau)*t."""
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)

    def get_episode_score(self):
        """
        Calculate the episode score (mean reward per recorded step) and
        update the best score seen so far.
        :return: None
        """
        self.score = self.total_reward / float(
            self.count) if self.count else 0.0
        if self.score > self.best:
            self.best = self.score

    def save_model_weights(self):
        """Save the local actor's state dict to './checkpoints.pkl'."""
        torch.save(self.actor_local.state_dict(), './checkpoints.pkl')
class ActorCritic(Model):
    """Actor-critic model with a policy network and a state-value critic.

    The critic is trained towards one-step TD targets; the actor is trained
    by weighting its loss against the taken action with the TD advantage.
    Construction arguments are forwarded unchanged to the ``Model`` base
    class (defined elsewhere).
    """

    def __init__(self,
                 observation_space_size,
                 action_space_size,
                 name=None,
                 env_name=None,
                 model_config=None,
                 play_mode=False):
        if name is None:
            name = "Unnamed-ActorCritic"
        super(ActorCritic, self).__init__(observation_space_size,
                                          action_space_size, name, env_name,
                                          model_config, play_mode)

    def build_model(self):
        """Create the networks and, for the default config, their optimizers."""
        self.policy_net = Actor(self.observation_space_size,
                                self.action_space_size)
        self.critic_net = Critic(self.observation_space_size)

        if self.model_config is None:
            self.gamma = 0.99
            self.actor_optimizer = optim.Adam(self.policy_net.parameters())
            self.actor_loss = nn.MSELoss()
            self.critic_optimizer = optim.Adam(self.critic_net.parameters())
            self.critic_loss = nn.MSELoss()
            self.get_epsilon = self.get_epsilon_default
        else:
            # NOTE(review): a non-None model_config is not handled yet, so
            # gamma, the optimizers and the losses are not created here.
            pass

    def save_checkpoint(self, n=0, filepath=None):
        """
        Save network and optimizer state.

        n - number of epoch / episode or whatever is used for enumeration
        filepath - forwarded to the base class ``save_checkpoint``
        """
        # TO DO: ADD OTHER RELEVANT PARAMETERS
        checkpoint = {
            'policy': self.policy_net.state_dict(),
            'critic': self.critic_net.state_dict(),
            'actor_optimizer': self.actor_optimizer.state_dict(),
            'critic_optimizer': self.critic_optimizer.state_dict(),
        }
        # build_model() only creates actor_optimizer / critic_optimizer; a
        # single ``optimizer`` is stored as well only if one exists (e.g.
        # provided by the base class).
        legacy_optimizer = getattr(self, 'optimizer', None)
        if legacy_optimizer is not None:
            checkpoint['optimizer'] = legacy_optimizer.state_dict()
        super(ActorCritic, self).save_checkpoint(n, filepath, checkpoint)

    def load_checkpoint(self, filepath):
        """Restore network weights and whichever optimizer states are stored."""
        # TO DO: ADD OTHER RELEVANT parameters
        checkpoint = torch.load(filepath)
        self.policy_net.load_state_dict(checkpoint['policy'])
        self.critic_net.load_state_dict(checkpoint['critic'])
        if 'actor_optimizer' in checkpoint:
            self.actor_optimizer.load_state_dict(
                checkpoint['actor_optimizer'])
        if 'critic_optimizer' in checkpoint:
            self.critic_optimizer.load_state_dict(
                checkpoint['critic_optimizer'])
        legacy_optimizer = getattr(self, 'optimizer', None)
        if legacy_optimizer is not None and 'optimizer' in checkpoint:
            legacy_optimizer.load_state_dict(checkpoint['optimizer'])

    def prepare_sample(self, sample):
        """Split a batch of transitions into per-field tensors.

        :param sample: sequence of (state, action, reward, next_state, done)
        :return: (states, actions, rewards, next_states, dones); ``dones`` is
                 int32, everything else float32
        """
        # Convert column by column: stacking whole transitions into a single
        # array yields a ragged/object array whenever states are vectors.
        states, actions, rewards, next_states, dones = zip(*sample)

        def column(values, dtype):
            return torch.tensor(np.array(values), dtype=dtype)

        return (column(states, torch.float32), column(actions, torch.float32),
                column(rewards, torch.float32),
                column(next_states, torch.float32),
                column(dones, torch.int32))

    def critic_update(self, V, V_target):
        """Take one optimizer step on the critic; return the loss as a float."""
        self.critic_optimizer.zero_grad()
        critic_loss = self.critic_loss(V, V_target)
        critic_loss.backward()
        self.critic_optimizer.step()
        return critic_loss.item()

    def actor_update(self, advantages, actions, mus):
        """Take one optimizer step on the actor; return the unweighted loss.

        :param advantages: TD advantage(s), detached from the graph
        :param actions: action(s) that were actually taken
        :param mus: policy output(s) for the same state(s)
        """
        self.actor_optimizer.zero_grad()
        # Prediction first, target second (MSE itself is symmetric).
        actor_loss = self.actor_loss(mus, actions)
        # Reduce to a scalar so backward() also works for a batch of
        # advantages; for a single advantage the mean changes nothing.
        gradient_term = (advantages * actor_loss).mean()
        gradient_term.backward()
        self.actor_optimizer.step()
        return actor_loss.item()

    def update(self, sample, prepare_state=None):
        """Update critic and actor once per transition in ``sample``.

        :param sample: iterable of (state, action, reward, next_state, done)
        :param prepare_state: optional callable applied to both states
        :return: (actor losses, critic losses), one entry per transition
        """
        actor_running_loss = []
        critic_running_loss = []

        for state, action, reward, next_state, done in sample:
            if prepare_state is not None:
                state = prepare_state(state)
                next_state = prepare_state(next_state)

            state = torch.tensor(state, dtype=torch.float32)
            next_state = torch.tensor(next_state, dtype=torch.float32)
            action = torch.tensor(action, dtype=torch.float32)

            # Update Critic
            V = self.critic_net(state)
            # The TD target is a constant for the critic step, so it is
            # built without gradient tracking.
            with torch.no_grad():
                V_target = torch.tensor([reward], dtype=torch.float32)
                # Truthiness instead of ``is False`` so numpy booleans work.
                if not done:
                    V_target = V_target + self.gamma * self.critic_net(
                        next_state)

            critic_loss = self.critic_update(V, V_target)
            critic_running_loss.append(critic_loss)

            # Update Actor
            advantage = (V_target - V).detach()
            mu = self.policy_net(state)
            actor_loss = self.actor_update(advantage, action, mu)
            actor_running_loss.append(actor_loss)

        return actor_running_loss, critic_running_loss

    def batch_update(self, sample, prepare_state=None):
        """Update critic and actor once on the whole batch.

        :param sample: sequence of (state, action, reward, next_state, done)
        :param prepare_state: optional callable applied to both states of
                              every transition, as in :meth:`update`
        :return: (actor losses, critic losses), one entry each
        """
        actor_running_loss = []
        critic_running_loss = []

        if prepare_state is not None:
            sample = [(prepare_state(s), a, r, prepare_state(s_next), d)
                      for s, a, r, s_next, d in sample]

        states, actions, rewards, next_states, dones = self.prepare_sample(
            sample)

        # Update Critic
        V = self.critic_net(states)
        with torch.no_grad():
            V_next = self.critic_net(next_states)
            # Align (B,) rewards/dones with a (B, 1) critic output so the
            # sum below cannot silently broadcast to (B, B).
            if V_next.numel() == rewards.numel():
                rewards = rewards.reshape(V_next.shape)
                dones = dones.reshape(V_next.shape)
            V_target = rewards + self.gamma * V_next * (1 - dones)

        critic_loss = self.critic_update(V, V_target)
        critic_running_loss.append(critic_loss)

        # Update Actor
        advantage = (V_target - V).detach()
        mu = self.policy_net(states)
        actor_loss = self.actor_update(advantage, actions, mu)
        actor_running_loss.append(actor_loss)

        return actor_running_loss, critic_running_loss
# Training setup for a generator / critic pair on MNIST.
# NOTE(review): batch_size, z_dim, mnist_shape, n_classes, lr, device and
# n_epochs are defined outside this excerpt.

# Image pipeline: ToTensor followed by Normalize((0.5,), (0.5,)).
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, ), (0.5, )),
])

# MNIST is read from the current directory; download=False means the data
# must already be present there.
dataloader = DataLoader(MNIST('.', download=False, transform=transform),
                        batch_size=batch_size,
                        shuffle=True)

# Input sizes for both networks are derived from the noise size, the image
# shape and the number of classes.
generator_input_dim, discriminator_im_chan = get_input_dimensions(
    z_dim, mnist_shape, n_classes)

# Each network gets its own Adam optimizer with the same learning rate.
gen = Generator(z_dim=generator_input_dim).to(device)
gen_opt = torch.optim.Adam(gen.parameters(), lr=lr)
disc = Critic(im_chan=discriminator_im_chan).to(device)
disc_opt = torch.optim.Adam(disc.parameters(), lr=lr)

# Step counter and loss histories.
cur_step = 0
generator_losses = []
discriminator_losses = []

# Placeholders initialised to False; presumably overwritten inside the
# training loop - TODO confirm against the loop body.
noise_and_labels = False
fake = False
fake_image_and_labels = False
real_image_and_labels = False
disc_fake_pred = False
disc_real_pred = False

for epoch in range(n_epochs):
    # Dataloader returns the batches and the labels
    # (the loop body lies outside this excerpt)
class DDPG():
    """DDPG agent"""

    def __init__(self, state_size, action_size, params, seed):
        """Initialize a DDPG agent

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            params (Params): hyperparameters (gamma, tau, units_actor,
                units_critic, lr_actor, lr_critic, mu, theta, sigma)
            seed (int): random seed
        """
        self.gamma = params.gamma
        self.tau = params.tau
        # np.random.seed() returns None, so seed NumPy and keep the value
        # itself on the instance.
        np.random.seed(seed)
        self.seed = seed

        # actor networks
        self.actor_local = Actor(state_size, action_size, params.units_actor,
                                 seed).to(device)
        self.actor_target = Actor(state_size, action_size, params.units_actor,
                                  seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          params.lr_actor)

        # critic networks
        self.critic_local = Critic(state_size, action_size,
                                   params.units_critic, seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    params.units_critic, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           params.lr_critic)

        # Start every target network as an exact copy of its local network,
        # independent of how the networks initialise their weights.
        self.actor_target.load_state_dict(self.actor_local.state_dict())
        self.critic_target.load_state_dict(self.critic_local.state_dict())

        # Noise process
        self.noise = OUNoise(action_size, seed, params.mu, params.theta,
                             params.sigma)

    def noise_reset(self):
        """Reset the exploration noise process."""
        self.noise.reset()

    def act(self, state):
        """Returns actions for given state(s) as per current policy.

        One sample of the exploration noise is added to the policy output
        and the result is clipped to [-1, 1].
        """
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).data.cpu().numpy()
        self.actor_local.train()
        action += self.noise.sample()
        return np.clip(action, -1, 1)

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) * (1 - done)
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tensors
        """
        states, actions, rewards, next_states, dones = experiences

        #### Update critic
        # The targets are constants for the critic step: computing them
        # without autograd avoids back-propagating into the target networks.
        with torch.no_grad():
            # Predicted next-state actions from the actor_target model
            next_actions = self.actor_target(next_states)
            # Predicted next-state Q-values from the critic_target model
            next_q_targets = self.critic_target(next_states, next_actions)
            # Q targets for current states (no bootstrap on terminal steps)
            Q_targets = rewards + self.gamma * next_q_targets * (1.0 - dones)

        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize critic loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        ### Update actor
        # Compute actor loss
        predicted_actions = self.actor_local(states)
        actor_loss = -self.critic_local(states, predicted_actions).mean()
        # Minimize actor loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        ### Update target networks
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)
class TD3(object):
    """Agent class that handles the training of the networks and provides outputs as actions.

    Holds an actor and a critic, each with a slowly-updated target copy.
    The critic returns two Q estimates per call; the smaller one is used
    for the TD target.
    """

    def __init__(self):
        state_dim = cons.STATE_DIM.flatten().shape[0]
        action_dim = cons.ACTION_DIM

        self.actor = Actor(state_dim, action_dim,
                           cons.MAX_ACTION).to(cons.DEVICE)
        self.actor_target = Actor(state_dim, action_dim,
                                  cons.MAX_ACTION).to(cons.DEVICE)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=3e-4)  # or 1e-3

        self.critic = Critic(state_dim, action_dim).to(cons.DEVICE)
        self.critic_target = Critic(state_dim, action_dim).to(cons.DEVICE)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=3e-4)  # or 1e-3

        self.total_it = 0
        self.critic_loss_plot = []
        self.actor_loss_plot = []

    @staticmethod
    def _soft_update(source, target):
        """Blend ``source`` weights into ``target`` with rate ``cons.TAU``."""
        for param, target_param in zip(source.parameters(),
                                       target.parameters()):
            target_param.data.copy_(cons.TAU * param.data +
                                    (1 - cons.TAU) * target_param.data)

    def select_action(self, state, noise=cons.POLICY_NOISE):
        """Select an appropriate action from the agent policy

            Args:
                state (array): current state of environment

                noise (float): how much noise to add to actions

            Returns:
                action (torch.Tensor): actor output, optionally perturbed,
                    clamped to [cons.MIN_ACTION, cons.MAX_ACTION]
        """
        state = torch.FloatTensor(state).to(cons.DEVICE)

        # Action selection is inference only; no graph is needed.
        with torch.no_grad():
            action = self.actor(state)

            # action space noise introduces noise to change the likelihoods
            # of each action the agent might take
            if noise != 0:
                # Clipped Gaussian noise with the shape, dtype and device of
                # the action itself (no fixed size, no fixed device).
                perturbation = torch.clamp(torch.randn_like(action) * noise,
                                           min=-cons.NOISE_CLIP,
                                           max=cons.NOISE_CLIP)
                action = action + perturbation

            # torch.clamp is not in-place: keep its result.
            action = torch.clamp(action,
                                 min=cons.MIN_ACTION,
                                 max=cons.MAX_ACTION)

        return action

    def train(self, replay_buffer, iterations):
        """Train and update actor and critic networks

            Args:
                replay_buffer (ReplayBuffer): buffer for experience replay

                iterations (int): how many times to run training

            Return:
                None. Losses are reported through ``cons.TD3_REPORT`` and
                actor losses are appended to ``self.actor_loss_plot``.
        """
        for it in range(iterations):
            self.total_it += 1  # keep track of the total training iterations

            # Sample replay buffer (prioritised or uniform)
            if cons.PRIORITY:
                # NOTE(review): the importance weights are sampled but not
                # applied to the loss.
                state, action, reward, next_state, done, weights, indexes = replay_buffer.sample(
                    cons.BATCH_SIZE, beta=cons.BETA_SCHED.value(it))
            else:
                state, action, reward, next_state, done = replay_buffer.sample(
                    cons.BATCH_SIZE)

            state = torch.from_numpy(state).float().to(cons.DEVICE)
            next_state = torch.from_numpy(next_state).float().to(cons.DEVICE)
            action = torch.from_numpy(action).float().to(cons.DEVICE)
            reward = torch.as_tensor(reward,
                                     dtype=torch.float32).to(cons.DEVICE)
            done = torch.as_tensor(done, dtype=torch.float32).to(cons.DEVICE)

            with torch.no_grad():
                # select an action according to the target policy and add
                # clipped noise (target policy smoothing)
                next_action = self.actor_target(next_state)
                noise = torch.clamp(torch.randn_like(next_action) *
                                    cons.POLICY_NOISE,
                                    min=-cons.NOISE_CLIP,
                                    max=cons.NOISE_CLIP)
                next_action = torch.clamp(next_action + noise,
                                          min=cons.MIN_ACTION,
                                          max=cons.MAX_ACTION)

                # Compute the target Q value from the *target* critic at the
                # *next* state; the smaller estimate limits overestimation.
                target_q1, target_q2 = self.critic_target(
                    next_state, next_action)
                target_q = torch.min(target_q1, target_q2)
                # NOTE(review): the bootstrap term is multiplied by ``done``
                # as sampled. That is only correct if the buffer stores a
                # "not done" mask (1 = episode continues) - confirm.
                target_q = reward.unsqueeze(1) + (done.unsqueeze(1) *
                                                  cons.GAMMA * target_q)

            # get current Q estimates
            current_q1, current_q2 = self.critic(state, action)

            # compute critic loss
            critic_loss = F.mse_loss(current_q1, target_q) + F.mse_loss(
                current_q2, target_q)
            cons.TD3_REPORT.write_critic_loss(self.total_it, it, critic_loss)

            # optimize the critic
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # using the minimum of the q values as the weight, use min to
            # prevent overestimation
            if cons.PRIORITY:
                new_priorities = torch.flatten(
                    torch.min(current_q1, current_q2)).detach()
                # convert any negative priorities to a minimum value, can't
                # have a negative priority; convert to a list for storage
                new_priorities = torch.clamp(new_priorities,
                                             min=0.0000001).tolist()
                replay_buffer.update_priorities(indexes, new_priorities)

            # delayed policy updates
            if it % cons.POLICY_FREQ == 0:  # update the actor policy less frequently

                # compute the actor loss; the action must stay attached to
                # the graph, otherwise no gradient reaches the actor
                q_action = self.actor(state).float()
                actor_loss = -self.critic.get_q(state, q_action).mean()
                cons.TD3_REPORT.write_actor_loss(self.total_it, it,
                                                 actor_loss, 1)

                # optimize the actor
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                self.actor_loss_plot.append(actor_loss.item())

                # Update the frozen target models
                self._soft_update(self.critic, self.critic_target)
                self._soft_update(self.actor, self.actor_target)

    def save(self, filename, directory):
        """Save actor and critic weights as '<directory>/<filename>_*.pth'."""
        torch.save(self.actor.state_dict(),
                   '%s/%s_actor.pth' % (directory, filename))
        torch.save(self.critic.state_dict(),
                   '%s/%s_critic.pth' % (directory, filename))

    def load(self, filename="best_avg", directory="td3/saves/shared_agent"):
        """Load actor and critic weights and re-sync the target networks."""
        # map_location lets checkpoints written on another device load here.
        self.actor.load_state_dict(
            torch.load('%s/%s_actor.pth' % (directory, filename),
                       map_location=cons.DEVICE))
        self.critic.load_state_dict(
            torch.load('%s/%s_critic.pth' % (directory, filename),
                       map_location=cons.DEVICE))
        # The targets feed the TD target, so they must not keep stale weights.
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())
# training config MAX_EPISODE = 450 Actor_lr = 1e-3 Critic_lr = 1e-3 # problem setting grid = Grid() grid.draw_board() state_dim = 2 action_dim = 4 # init models actor = Actor(input_dim=state_dim, output_dim=action_dim) critic = Critic(input_dim=state_dim) actor_opt = AdamW(actor.parameters(), lr=Actor_lr) critic_opt = AdamW(critic.parameters(), lr=Critic_lr) # init loss a_loss = Actor_Loss() c_loss = Critic_Loss() for i_episode in range(MAX_EPISODE): s = grid.reset() t = 0 total_action = [] done = False while(not done and t < 200): # step 1 s = torch.Tensor(s) pai = actor(s[None, :]) # step 2
class AgentDDPG:
    """Holder for the actor/critic networks of a DDPG-style agent.

    Owns a local/target pair for both the actor and the critic plus their
    optimizers, and offers inference helpers for the local and the target
    policy. Relies on the module-level names ``device``, ``Actor`` and
    ``Critic``.
    """

    def __init__(self, state_size, action_size, seed):
        """
        :state_size: size of the state vector
        :action_size: size of the action vector
        :seed: seed forwarded to every Actor/Critic network
        """
        self.state_size = state_size
        self.action_size = action_size
        self.t_step = 0
        self.score = 0.0
        self.best = 0.0
        self.seed = seed
        # Running totals read by get_episode_score(); they have to exist
        # before that method is first called.
        self.total_reward = 0.0
        self.count = 0
        self.learning_rate_actor = 0.0001
        self.learning_rate_critic = 0.001

        # Policy (actor): local and target networks.
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.seed).to(device)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.seed).to(device)

        # Value function (critic): local and target networks.
        self.critic_local = Critic(self.state_size, self.action_size,
                                   self.seed).to(device)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    self.seed).to(device)

        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.learning_rate_actor)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.learning_rate_critic)

        # Make sure local and target start with the same weights.
        self.actor_target.load_state_dict(self.actor_local.state_dict())
        self.critic_target.load_state_dict(self.critic_local.state_dict())

        # Algorithm parameters.
        self.gamma = 0.99  # Discount factor
        self.tau = 0.001  # Soft-update rate for the target parameters

    def act_local(self, state):
        """Return the local actor's action for a single state.

        :state: NumPy array holding one state; a leading batch axis is added
        :return: detached action tensor
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # Inference only: evaluation mode and no gradient tracking.
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(state)
        self.actor_local.train()

        return actions.detach()

    def act_target(self, states):
        """Return the target actor's actions for ``states``.

        :states: tensor passed to the target actor unchanged
        :return: detached action tensor
        """
        self.actor_target.eval()
        with torch.no_grad():
            actions = self.actor_target(states)
        self.actor_target.train()

        return actions.detach()

    def get_episode_score(self):
        """
        Calculate the episode score (mean reward per counted step) and
        update the best score seen so far.
        :return: None
        """
        self.score = self.total_reward / float(
            self.count) if self.count else 0.0
        if self.score > self.best:
            self.best = self.score

    def save_model_weights(self):
        """Save the local actor's state dict to './checkpoints.pkl'."""
        torch.save(self.actor_local.state_dict(), './checkpoints.pkl')
# NOTE(review): this excerpt starts inside an if/elif/else chain that picks
# the torch device from ``args.cuda``; the opening ``if`` is not visible.
    # First branch: use CUDA when available, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
elif args.cuda == 'cpu':
    device = torch.device('cpu')
else:
    # Any other value is formatted into a device string as a CUDA index.
    device = torch.device('cuda:{}'.format(args.cuda))
print('Use {}'.format(device))
time.sleep(0.5)

# Networks and their optimizers; each learning rate is read from the
# network's own ``params`` mapping.
actor = Actor(env, actor_params)
critic = Critic(critic_params, q=True)  # Q-function
actor.to(device)
critic.to(device)
actor_optim = torch.optim.Adam(actor.parameters(),
                               lr=actor.params['learning_rate'])
critic_optim = torch.optim.Adam(critic.parameters(),
                                lr=critic.params['learning_rate'])

# create the actor-critic agent
agent_params.update(params)
memory = ReplayBuffer(CAPACITY)
agent = Agent(env, actor, critic, memory, actor_optim, critic_optim,
              agent_params, device)

# Initialize or load the model
# A falsy LOAD means fresh initialisation; otherwise LOAD is handed to
# agent.load().
if not LOAD:
    agent.actor._initialize()
    agent.critic._initialize()
else:
    agent.load(LOAD)
class MAMLFewShotClassifier(nn.Module):
    """MAML/MAML++ few-shot classifier with an optional critic-driven
    refinement of the adapted weights."""

    def __init__(self, im_shape, device, args):
        """
        Initializes a MAML few shot learning system
        :param im_shape: The images input size, in batch, c, h, w shape
        :param device: The device to use to use the model on.
        :param args: A namedtuple of arguments specifying various hyperparameters.
        """
        super(MAMLFewShotClassifier, self).__init__()
        self.args = args
        self.device = device
        self.batch_size = args.batch_size
        self.use_cuda = args.use_cuda
        self.im_shape = im_shape
        self.current_epoch = 0

        self.rng = set_torch_seed(seed=args.seed)
        if args.high_end:
            self.embedding = HighEndEmbedding(device, args, 3).to(device)
            self.classifier = HighEndClassifier(
                device, args, self.embedding.n_out_channels).to(device)
        else:
            self.classifier = VGGReLUNormNetwork(
                im_shape=self.im_shape,
                num_output_classes=self.args.num_classes_per_set,
                args=args,
                device=device,
                meta_classifier=True).to(device=self.device)
        self.task_learning_rate = args.task_learning_rate

        # The learning-rate rule is sized for the support-set steps plus the
        # critic-driven steps.
        self.inner_loop_optimizer = LSLRGradientDescentLearningRule(
            device=device,
            init_learning_rate=self.task_learning_rate,
            total_num_inner_loop_steps=(
                self.args.number_of_training_steps_per_iter +
                self.args.num_critic_updates),
            use_learnable_learning_rates=self.args.
            learnable_per_layer_per_step_inner_loop_learning_rate)
        self.inner_loop_optimizer.initialise(
            names_weights_dict=self.get_inner_loop_parameter_dict(
                params=self.classifier.named_parameters()))

        print("Inner Loop parameters")
        for key, value in self.inner_loop_optimizer.named_parameters():
            print(key, value.shape)

        if args.use_critic:
            # Total number of scalars adapted in the inner loop; computed
            # once and used both for the log line and the critic input size.
            n_theta = sum(
                p.numel() for p in self.get_inner_loop_parameter_dict(
                    self.classifier.named_parameters()).values())
            print(n_theta)
            self.critic = Critic(n_theta=n_theta)

        self.to(device)
        print("Outer Loop parameters")
        for name, param in self.named_parameters():
            if param.requires_grad:
                print(name, param.shape, param.device, param.requires_grad)

        if args.high_end:
            self.optimizer = optim.SGD(self.trainable_parameters(), lr=1e-4)
        else:
            self.optimizer = optim.Adam(self.trainable_parameters(),
                                        lr=args.meta_learning_rate,
                                        amsgrad=False)
        if args.use_critic:
            self.critic_optimizer = optim.SGD(self.critic.parameters(),
                                              lr=1e-6)
        self.scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer=self.optimizer,
            T_max=self.args.total_epochs,
            eta_min=self.args.min_learning_rate)

    def get_per_step_loss_importance_vector(self):
        """
        Generates a tensor of dimensionality (num_inner_loop_steps) indicating the importance of each step's target
        loss towards the optimization loss.
        :return: A tensor to be used to compute the weighted average of the loss, useful for
        the MSL (Multi Step Loss) mechanism.
        """
        num_steps = self.args.number_of_training_steps_per_iter
        loss_weights = np.ones(shape=(num_steps)) * (1.0 / num_steps)
        decay_rate = 1.0 / num_steps / self.args.multi_step_loss_num_epochs
        min_value_for_non_final_losses = 0.03 / num_steps
        # Non-final steps decay with the epoch down to a floor ...
        for i in range(len(loss_weights) - 1):
            curr_value = np.maximum(
                loss_weights[i] - (self.current_epoch * decay_rate),
                min_value_for_non_final_losses)
            loss_weights[i] = curr_value

        # ... while the final step grows by what the others lost, capped so
        # that the floors of the other steps still fit.
        curr_value = np.minimum(
            loss_weights[-1] +
            (self.current_epoch * (num_steps - 1) * decay_rate),
            1.0 - ((num_steps - 1) * min_value_for_non_final_losses))
        loss_weights[-1] = curr_value
        loss_weights = torch.Tensor(loss_weights).to(device=self.device)
        return loss_weights

    def get_inner_loop_parameter_dict(self, params):
        """
        Returns a dictionary with the parameters to use for inner loop updates.
        :param params: An iterable of (name, parameter) pairs of the network.
        :return: A dictionary of the parameters to use for the inner loop optimization process.
        """
        param_dict = dict()
        for name, param in params:
            if param.requires_grad:
                if self.args.enable_inner_loop_optimizable_bn_params:
                    param_dict[name] = param.to(device=self.device)
                else:
                    # Parameters whose name contains "norm_layer" are left
                    # out of the inner loop unless explicitly enabled.
                    if "norm_layer" not in name:
                        param_dict[name] = param.to(device=self.device)

        return param_dict

    def apply_inner_loop_update(self, loss, names_weights_copy,
                                use_second_order, current_step_idx):
        """
        Applies an inner loop update given current step's loss, the weights to update, a flag indicating whether to use
        second order derivatives and the current step's index.
        :param loss: Current step's loss with respect to the support set.
        :param names_weights_copy: A dictionary with names to parameters to update.
        :param use_second_order: A boolean flag of whether to use second order derivatives.
        :param current_step_idx: Current step's index.
        :return: A dictionary with the updated weights (name, param)
        """
        self.classifier.zero_grad(names_weights_copy)

        grads = torch.autograd.grad(loss,
                                    names_weights_copy.values(),
                                    create_graph=use_second_order)
        names_grads_wrt_params = dict(zip(names_weights_copy.keys(), grads))

        names_weights_copy = self.inner_loop_optimizer.update_params(
            names_weights_dict=names_weights_copy,
            names_grads_wrt_params_dict=names_grads_wrt_params,
            num_step=current_step_idx)

        return names_weights_copy

    def get_across_task_loss_metrics(self, total_losses, total_accuracies):
        """
        Averages the per-task losses and accuracies.
        :param total_losses: A list with one scalar loss tensor per task.
        :param total_accuracies: A flat collection of per-sample correctness values.
        :return: A dictionary with the keys 'loss' and 'accuracy'.
        """
        losses = dict()

        losses['loss'] = torch.mean(torch.stack(total_losses))
        losses['accuracy'] = np.mean(total_accuracies)

        return losses

    def forward(self, data_batch, epoch, use_second_order,
                use_multi_step_loss_optimization, num_steps, training_phase):
        """
        Runs a forward outer loop pass on the batch of tasks using the MAML/++ framework.
        :param data_batch: A data batch containing the support and target sets.
        :param epoch: Current epoch's index
        :param use_second_order: A boolean saying whether to use second order derivatives.
        :param use_multi_step_loss_optimization: Whether to optimize on the outer loop using the multi step loss,
        i.e. a weighted sum of every step's target loss (True), or just the last step's target loss (False).
        The multi step loss is only used while training and while epoch < multi_step_loss_num_epochs.
        :param num_steps: Number of inner loop steps.
        :param training_phase: Whether this is a training phase (True) or an evaluation phase (False)
        :return: A dictionary with the collected losses of the current outer forward propagation, and the
        per-task target predictions.
        """
        x_support_set, x_target_set, y_support_set, y_target_set = data_batch

        [b, ncs, spc] = y_support_set.shape

        self.num_classes_per_set = ncs

        total_losses = []
        total_accuracies = []
        per_task_target_preds = [[] for i in range(len(x_target_set))]
        if self.args.high_end:
            self.embedding.zero_grad()
        self.classifier.zero_grad()

        # The importance vector only depends on the current epoch and the
        # hyperparameters, so it is computed once for all tasks.
        per_step_loss_importance_vectors = self.get_per_step_loss_importance_vector(
        )

        for task_id, (x_support_set_task, y_support_set_task, x_target_set_task, y_target_set_task) in \
                enumerate(zip(x_support_set, y_support_set, x_target_set, y_target_set)):
            task_losses = []
            task_accuracies = []

            # this is theta_0
            names_weights_copy = self.get_inner_loop_parameter_dict(
                self.classifier.named_parameters())

            n, s, c, h, w = x_target_set_task.shape

            x_support_set_task = x_support_set_task.view(-1, c, h, w)
            y_support_set_task = y_support_set_task.view(-1)
            x_target_set_task = x_target_set_task.view(-1, c, h, w)
            y_target_set_task = y_target_set_task.view(-1)

            # Inner loop starts
            if self.args.high_end:
                x_support_set_task = self.embedding(x_support_set_task,
                                                    0,
                                                    training=training_phase)
                x_target_set_task = self.embedding(x_target_set_task,
                                                   0,
                                                   training=training_phase)
            for num_step in range(num_steps):
                # operates on the support set
                support_loss, support_preds = self.net_forward(
                    x=x_support_set_task,
                    y=y_support_set_task,
                    weights=names_weights_copy,
                    backup_running_statistics=True if
                    (num_step == 0) else False,
                    training=True,
                    num_step=num_step)

                # this is update of theta from the copy of current theta_0 and onward
                # i.e. inner loop optimization wrt support set
                names_weights_copy = self.apply_inner_loop_update(
                    loss=support_loss,
                    names_weights_copy=names_weights_copy,
                    use_second_order=use_second_order,
                    current_step_idx=num_step)

                # TODO: inner loop OPTIMIZATION wrt target set???
                if use_multi_step_loss_optimization and training_phase and epoch < self.args.multi_step_loss_num_epochs:
                    # this is MAML++ way
                    target_loss, target_preds = self.net_forward(
                        x=x_target_set_task,
                        y=y_target_set_task,
                        weights=names_weights_copy,
                        backup_running_statistics=False,
                        training=True,
                        num_step=num_step)

                    task_losses.append(
                        per_step_loss_importance_vectors[num_step] *
                        target_loss)
                else:
                    if num_step == (
                            self.args.number_of_training_steps_per_iter - 1):
                        target_loss, target_preds = self.net_forward(
                            x=x_target_set_task,
                            y=y_target_set_task,
                            weights=names_weights_copy,
                            backup_running_statistics=False,
                            training=True,
                            num_step=num_step)
                        task_losses.append(target_loss)

                        # NOTE(review): the critic refinement is assumed to
                        # belong to this final-step branch; the nesting
                        # should be confirmed against the intended algorithm.
                        if self.args.use_critic:
                            for i in range(self.args.num_critic_updates):
                                # TODO: here must be an update using the Critic (start without g)
                                # F = {f(x^b_T, θ_{N+j}), θ_{N+j}, g(xS, xn)}
                                # θ_{N+j+1} = θ_{N+j} − \gamma * \nabla_{θ_{N+j}} C(F,W)
                                critic_loss, target_preds = self.net_forward_critic(
                                    x=x_target_set_task,
                                    y=y_target_set_task,
                                    weights=names_weights_copy,
                                    backup_running_statistics=False,
                                    training=True,
                                    num_step=num_step + i)
                                names_weights_copy = self.apply_inner_loop_update(
                                    loss=critic_loss,
                                    names_weights_copy=names_weights_copy,
                                    use_second_order=use_second_order,
                                    current_step_idx=num_step + i)
                            # Target loss of the critic-refined weights is
                            # added on top of the pre-refinement loss.
                            target_loss, target_preds = self.net_forward(
                                x=x_target_set_task,
                                y=y_target_set_task,
                                weights=names_weights_copy,
                                backup_running_statistics=False,
                                training=True,
                                num_step=num_step)
                            task_losses.append(target_loss)

            per_task_target_preds[task_id] = target_preds.detach().cpu().numpy(
            )
            _, predicted = torch.max(target_preds.data, 1)

            accuracy = predicted.float().eq(
                y_target_set_task.data.float()).cpu().float()
            task_losses = torch.sum(torch.stack(task_losses))
            total_losses.append(task_losses)
            total_accuracies.extend(accuracy)

            if not training_phase:
                self.classifier.restore_backup_stats()

        losses = self.get_across_task_loss_metrics(
            total_losses=total_losses, total_accuracies=total_accuracies)

        for idx, item in enumerate(per_step_loss_importance_vectors):
            losses['loss_importance_vector_{}'.format(
                idx)] = item.detach().cpu().numpy()

        return losses, per_task_target_preds

    def net_forward(self, x, y, weights, backup_running_statistics, training,
                    num_step):
        """
        A base model forward pass on some data points x. Using the parameters in the weights dictionary. Also requires
        boolean flags indicating whether to reset the running statistics at the end of the run (if at evaluation phase).
        A flag indicating whether this is the training session and an int indicating the current step's number in the
        inner loop.
        :param x: A data batch of shape b, c, h, w
        :param y: A data targets batch of class indices (passed as the target of the cross entropy)
        :param weights: A dictionary containing the weights to pass to the network.
        :param backup_running_statistics: A flag indicating whether to reset the batch norm running statistics to their
         previous values after the run (only for evaluation)
        :param training: A flag indicating whether the current process phase is a training or evaluation.
        :param num_step: An integer indicating the number of the step in the inner loop.
        :return: the crossentropy losses with respect to the given y, the predictions of the base model.
        """
        preds = self.classifier.forward(
            x=x,
            params=weights,
            training=training,
            backup_running_statistics=backup_running_statistics,
            num_step=num_step)

        loss = F.cross_entropy(input=preds, target=y)

        return loss, preds

    def net_forward_critic(self, x, y, weights, backup_running_statistics,
                           training, num_step):
        """
        A base model forward pass on some data points x whose loss comes from the critic instead of the labels.
        The critic receives the predictions together with all weights flattened into a single row vector.
        :param x: A data batch of shape b, c, h, w
        :param y: A data targets batch; accepted for signature parity with net_forward but not used here.
        :param weights: A dictionary containing the weights to pass to the network.
        :param backup_running_statistics: A flag indicating whether to reset the batch norm running statistics to their
         previous values after the run (only for evaluation)
        :param training: A flag indicating whether the current process phase is a training or evaluation.
        :param num_step: An integer indicating the number of the step in the inner loop.
        :return: the critic's loss, the predictions of the base model.
        """
        preds = self.classifier.forward(
            x=x,
            params=weights,
            training=training,
            backup_running_statistics=backup_running_statistics,
            num_step=num_step)
        # Flatten every adapted weight into one (1, n_theta) vector.
        params1d = torch.cat(
            [torch.reshape(p, (1, -1)) for p in weights.values()], dim=1)
        loss = self.critic(preds, params1d)

        return loss, preds

    def trainable_parameters(self):
        """
        Returns an iterator over the trainable parameters of the model.
        """
        for param in self.parameters():
            if param.requires_grad:
                yield param

    def train_forward_prop(self, data_batch, epoch):
        """
        Runs an outer loop forward prop using the meta-model and base-model.
        :param data_batch: A data batch containing the support set and the target set input, output pairs.
        :param epoch: The index of the currrent epoch.
        :return: A dictionary of losses for the current step, and the per-task target predictions.
        """
        losses, per_task_target_preds = self.forward(
            data_batch=data_batch,
            epoch=epoch,
            use_second_order=self.args.second_order
            and epoch > self.args.first_order_to_second_order_epoch,
            use_multi_step_loss_optimization=self.args.
            use_multi_step_loss_optimization,
            num_steps=self.args.number_of_training_steps_per_iter,
            training_phase=True)
        return losses, per_task_target_preds

    def evaluation_forward_prop(self, data_batch, epoch):
        """
        Runs an outer loop evaluation forward prop using the meta-model and base-model.
        :param data_batch: A data batch containing the support set and the target set input, output pairs.
        :param epoch: The index of the currrent epoch.
        :return: A dictionary of losses for the current step, and the per-task target predictions.
        """
        losses, per_task_target_preds = self.forward(
            data_batch=data_batch,
            epoch=epoch,
            use_second_order=False,
            use_multi_step_loss_optimization=True,
            num_steps=self.args.number_of_evaluation_steps_per_iter,
            training_phase=False)

        return losses, per_task_target_preds

    @staticmethod
    def _clamp_gradients(module):
        """Clamp the gradients of a module's trainable parameters to [-10, 10]."""
        for name, param in module.named_parameters():
            # A trainable parameter that did not take part in the loss has no
            # gradient; there is nothing to clamp for it.
            if param.requires_grad and param.grad is not None:
                param.grad.data.clamp_(
                    -10, 10
                )  # not sure if this is necessary, more experiments are needed

    def meta_update(self, loss):
        """
        Applies an outer loop update on the meta-parameters of the model.
        :param loss: The current crossentropy loss.
        """
        self.optimizer.zero_grad()
        # The graph is retained because critic_meta_update may backpropagate
        # the same loss again.
        loss.backward(retain_graph=True)
        if 'imagenet' in self.args.dataset_name:
            self._clamp_gradients(self.classifier)
            # NOTE(review): the embedding clamp is assumed to be nested under
            # the imagenet check; confirm the intended nesting.
            if self.args.high_end:
                self._clamp_gradients(self.embedding)
        self.optimizer.step()

    def critic_meta_update(self, loss):
        """
        Applies an outer loop update on the critic's parameters.
        :param loss: The loss to backpropagate into the critic.
        """
        self.critic_optimizer.zero_grad()
        loss.backward()
        self.critic_optimizer.step()

    def _batch_to_device(self, data_batch):
        """Convert a (x_support, x_target, y_support, y_target) batch to
        float inputs / long targets on ``self.device``."""
        x_support_set, x_target_set, y_support_set, y_target_set = data_batch

        x_support_set = torch.Tensor(x_support_set).float().to(
            device=self.device)
        x_target_set = torch.Tensor(x_target_set).float().to(
            device=self.device)
        y_support_set = torch.Tensor(y_support_set).long().to(
            device=self.device)
        y_target_set = torch.Tensor(y_target_set).long().to(device=self.device)

        return (x_support_set, x_target_set, y_support_set, y_target_set)

    def run_train_iter(self, data_batch, epoch):
        """
        Runs an outer loop update step on the meta-model's parameters.
        :param data_batch: input data batch containing the support set and target set input, output pairs
        :param epoch: the index of the current epoch
        :return: The losses of the ran iteration, and the per-task target predictions.
        """
        epoch = int(epoch)
        self.scheduler.step(epoch=epoch)
        if self.current_epoch != epoch:
            self.current_epoch = epoch

        if not self.training:
            self.train()

        data_batch = self._batch_to_device(data_batch)

        losses, per_task_target_preds = self.train_forward_prop(
            data_batch=data_batch, epoch=epoch)

        self.meta_update(loss=losses['loss'])
        if self.args.use_critic:
            self.critic_meta_update(loss=losses['loss'])
        # Read the rate the optimizer is actually using; it is what the
        # scheduler step above wrote into the first parameter group.
        losses['learning_rate'] = self.optimizer.param_groups[0]['lr']
        self.optimizer.zero_grad()
        self.zero_grad()

        return losses, per_task_target_preds

    def run_validation_iter(self, data_batch):
        """
        Runs an outer loop evaluation step on the meta-model's parameters.
        The epoch passed to the forward pass is ``self.current_epoch``.
        :param data_batch: input data batch containing the support set and target set input, output pairs
        :return: The losses of the ran iteration, and the per-task target predictions.
        """
        if self.training:
            self.eval()

        data_batch = self._batch_to_device(data_batch)

        losses, per_task_target_preds = self.evaluation_forward_prop(
            data_batch=data_batch, epoch=self.current_epoch)

        # losses['loss'].backward() # uncomment if you get the weird memory error
        # self.zero_grad()
        # self.optimizer.zero_grad()

        return losses, per_task_target_preds

    def save_model(self, model_save_dir, state):
        """
        Save the network parameter state and experiment state dictionary.
        :param model_save_dir: The path the state is written to (passed directly to torch.save).
        :param state: The state containing the experiment state and the network. It's in the form of a dictionary
        object. The network weights are stored in it under the key 'network'.
        """
        state['network'] = self.state_dict()
        torch.save(state, f=model_save_dir)

    def load_model(self, model_save_dir, model_name, model_idx):
        """
        Load checkpoint and return the state dictionary containing the network state params and experiment state.
        :param model_save_dir: The directory from which to load the files.
        :param model_name: The model_name to be loaded from the direcotry.
        :param model_idx: The index of the model (i.e. epoch number or 'latest' for the latest saved model of the current
        experiment)
        :return: A dictionary containing the experiment state and the saved model parameters.
        """
        filepath = os.path.join(model_save_dir,
                                "{}_{}".format(model_name, model_idx))
        state = torch.load(filepath)
        state_dict_loaded = state['network']
        self.load_state_dict(state_dict=state_dict_loaded)
        return state
class TD3(object):
    """Agent class that handles the training of the networks and provides outputs as actions.

    Args:
        state_dim (array): state size
        action_dim (array): action size
        max_action (sequence): action bounds; index 0 is used as the lower
            and index 2 as the upper clamp value
        discount (float): discount factor
        tau (float): soft update for main networks to target networks
        policy_noise (float): noise factor
        noise_clip (float): clip factor
        policy_freq (int): frequency of policy updates
        device (device): cuda or cpu to process the tensors

    """

    def __init__(self, state_dim, action_dim, max_action, discount, tau,
                 policy_noise, noise_clip, policy_freq, device):

        self.state_dim = len(state_dim[0])
        self.action_dim = len(action_dim)
        # The full bounds are kept on the agent; the actor only receives the
        # value at index 2.
        self.max_action = max_action

        self.actor = Actor(self.state_dim, self.action_dim,
                           max_action[2]).to(device)
        self.actor_target = copy.deepcopy(self.actor).float()
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=3e-4)  # or 1e-3

        self.critic = Critic(self.state_dim, self.action_dim).to(device)
        self.critic_target = copy.deepcopy(self.critic).float()
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=3e-4)  # or 1e-2

        self.device = device
        self.discount = discount
        self.tau = tau
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq

        self.total_it = 0

    def select_action(self, state):
        """Select an appropriate action from the agent policy

        Args:
            state (array): current state of environment

        Returns:
            action (array): flattened numpy output of the actor network

        """

        state = torch.FloatTensor(state.reshape(1, -1)).to(self.device)
        return self.actor(state).cpu().data.numpy().flatten()

    def _states_to_tensor(self, states):
        """Turn a batch of sampled states into a float32 tensor on the agent's device.

        Each entry is unwrapped with ``.item()`` and its ``.values()`` are
        used as the feature row, exactly as the sampling code expects.
        # NOTE(review): presumably each entry is a 0-d object array holding a
        # dict of features -- verify against the replay buffer.
        """
        # list(...) is required: on Python 3 a dict view is not converted to
        # a numeric array by numpy.
        rows = [list(entry.item().values()) for entry in states]
        return torch.as_tensor(np.asarray(rows),
                               dtype=torch.float32,
                               device=self.device)

    def train(self, replay_buffer, batch_size=100):
        """Train and update actor and critic networks

        Args:
            replay_buffer (ReplayBuffer): buffer for experience replay
            batch_size(int): batch size to sample from replay buffer

        Return:
            actor_loss (float or None): loss from actor network, None on
                iterations without a policy update
            critic_loss (float): loss from critic network

        """

        self.total_it += 1

        # Sample replay buffer
        state, next_state, action, reward, done = replay_buffer.sample(
            batch_size)

        state = self._states_to_tensor(state)
        next_state = self._states_to_tensor(next_state)
        action = torch.as_tensor(action,
                                 dtype=torch.float32,
                                 device=self.device)
        # Column vectors, so they combine element-wise with the (batch, 1)
        # critic outputs instead of broadcasting to (batch, batch).
        reward = torch.as_tensor(reward,
                                 dtype=torch.float32,
                                 device=self.device).reshape(-1, 1)
        # NOTE(review): `done` multiplies the bootstrap term, so the buffer
        # is presumably storing a "not done" mask -- confirm.
        done = torch.as_tensor(done, dtype=torch.float32,
                               device=self.device).reshape(-1, 1)

        with torch.no_grad():
            # Target policy smoothing: zero-mean Gaussian noise, clipped.
            noise = (torch.randn_like(action) * self.policy_noise).clamp(
                -self.noise_clip, self.noise_clip)

            next_action = (self.actor_target(next_state) + noise).clamp(
                self.max_action[0], self.max_action[2])

            # Clipped double-Q target from the *target* critic evaluated at
            # the *next* state.
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + done * self.discount * target_Q

        # get current Q estimates
        current_Q1, current_Q2 = self.critic(state, action)

        # compute critic loss
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(
            current_Q2, target_Q)

        # optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        actor_loss_value = None

        # delayed policy updates
        if self.total_it % self.policy_freq == 0:
            # compute the actor loss
            actor_loss = -self.critic.get_q(state, self.actor(state)).mean()

            # optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()
            actor_loss_value = actor_loss.item()

            # Update the frozen target models
            for param, target_param in zip(self.critic.parameters(),
                                           self.critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

            for param, target_param in zip(self.actor.parameters(),
                                           self.actor_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

        return actor_loss_value, critic_loss.item()

    def save(self, filename, directory):
        """Save the actor and critic state dicts as
        ``<directory>/<filename>_actor.pth`` and ``..._critic.pth``."""
        torch.save(self.actor.state_dict(),
                   '%s/%s_actor.pth' % (directory, filename))
        torch.save(self.critic.state_dict(),
                   '%s/%s_critic.pth' % (directory, filename))

    def load(self, filename="best_avg", directory="./saves"):
        """Load the actor and critic saved by ``save`` and re-sync the targets.

        Only the online networks are stored on disk, so the target networks
        are reset to the freshly loaded weights.
        """
        self.actor.load_state_dict(
            torch.load('%s/%s_actor.pth' % (directory, filename),
                       map_location=self.device))
        self.critic.load_state_dict(
            torch.load('%s/%s_critic.pth' % (directory, filename),
                       map_location=self.device))
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        # random.seed() returns None, so seed the generator first and keep
        # the actual seed value on the instance.
        random.seed(random_seed)
        self.seed = random_seed
        self.eps = EPS_START
        # set decay rate based on epsilon end target
        self.eps_decay = 1 / (EPS_EP_END * LEARN_NUM)
        self.timestep = 0

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, state, action, reward, next_state, done, agent_number):
        """Save experience in replay memory, and use random sample from buffer to learn.

        Learning runs LEARN_NUM times every LEARN_EVERY calls, once the
        memory holds more than BATCH_SIZE experiences.
        """
        self.timestep += 1
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)
        # Learn, if enough samples are available in memory and at learning interval settings
        if len(self.memory) > BATCH_SIZE and self.timestep % LEARN_EVERY == 0:
            for _ in range(LEARN_NUM):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA, agent_number)

    def act(self, states, add_noise):
        """Returns actions for both agents as per current policy, given their respective states.

        The result is a (num_agents, action_size) numpy array clipped to
        [-1, 1]; when ``add_noise`` is truthy, noise scaled by the current
        epsilon is added before clipping.
        """
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            # get action for each agent and concatenate them
            for agent_num, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[agent_num, :] = action
        self.actor_local.train()
        # add noise to actions
        if add_noise:
            actions += self.eps * self.noise.sample()
        actions = np.clip(actions, -1, 1)
        return actions

    def reset(self):
        """Reset the noise process."""
        self.noise.reset()

    def _joint_actions(self, own_actions, actions, agent_number):
        """Place this agent's actions in its slot of the joint action vector.

        The sampled ``actions`` hold agent 0's actions in the first
        ``action_size`` columns and the other agent's actions after them;
        only this agent's slot is replaced by ``own_actions``.
        """
        if agent_number == 0:
            return torch.cat((own_actions, actions[:, self.action_size:]),
                             dim=1)
        return torch.cat((actions[:, :self.action_size], own_actions), dim=1)

    def learn(self, experiences, gamma, agent_number):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
            agent_number (int): 0 for the agent whose actions come first in
                the joint action vector, anything else for the other agent
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        # Construct next actions vector relative to the agent
        actions_next = self._joint_actions(actions_next, actions,
                                           agent_number)
        # Compute Q targets for current states (y_i)
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        # Construct action prediction vector relative to each agent
        actions_pred = self._joint_actions(actions_pred, actions,
                                           agent_number)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # update noise decay parameter
        self.eps -= self.eps_decay
        self.eps = max(self.eps, EPS_FINAL)
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the generator first and keep
        # the actual seed value on the instance.
        random.seed(random_seed)
        self.seed = random_seed

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy.

        The action is clipped to [-1, 1]; when ``add_noise`` is truthy a
        noise sample is added before clipping.
        """
        state = torch.from_numpy(state).float().to(device)
        # Query the policy in inference mode without building a graph, then
        # restore training mode.
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        """Reset the noise process."""
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i); the (1 - dones) factor
        # removes the bootstrap term for finished transitions.
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class TD3:
    """TD3 learner holding an actor, twin critics, their targets and a rollout actor.

    The rollout actor is a separate copy whose parameters are refreshed
    from the training actor by ``sync_rollout_actor``.
    """

    def __init__(self, device, state_dim, action_dim, action_max, gamma=0.99,
                 tau=0.005, lr=3e-4, policy_noise=0.2, noise_clip=0.5,
                 exploration_noise=0.1, policy_freq=2):
        """Build networks, optimizers and the rollout actor.

        Args:
            device: torch device the training networks are moved to.
            state_dim: size of the state vector.
            action_dim: size of the action vector.
            action_max: action bound forwarded to the actor classes.
            gamma: discount factor.
            tau: interpolation factor of the target soft update.
            lr: learning rate shared by both Adam optimizers.
            policy_noise: std of the target-policy smoothing noise.
            noise_clip: absolute clip applied to the smoothing noise.
            exploration_noise: forwarded to ``TD3RolloutActor``.
            policy_freq: the actor and the targets are updated every
                ``policy_freq`` calls to ``train``.
        """
        self.actor = Actor(state_dim, 256, action_dim, action_max).to(device)
        self.target_actor = copy.deepcopy(self.actor)
        self.actor_optimizer = optim.Adam(params=self.actor.parameters(), lr=lr)

        self.critic = Critic(state_dim, 256, action_dim).to(device)
        self.target_critic = copy.deepcopy(self.critic)
        self.critic_optimizer = optim.Adam(params=self.critic.parameters(), lr=lr)

        self.device = device
        self.gamma = gamma
        self.tau = tau
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq

        self.rollout_actor = TD3RolloutActor(state_dim, action_dim, action_max,
                                             exploration_noise)
        self.sync_rollout_actor()

        self.iteration_num = 0

    def train(self, replay_buffer, batch_size=256):
        """Run one TD3 update from a batch sampled out of ``replay_buffer``.

        ``replay_buffer.sample(batch_size)`` must return
        ``(state, next_state, action, reward, mask)``; ``mask`` multiplies the
        bootstrapped value in the TD target.
        """
        self.iteration_num += 1

        st, nx_st, ac, rw, mask = replay_buffer.sample(batch_size)

        with torch.no_grad():
            # Target-policy smoothing: clipped Gaussian noise on the next action.
            noise = (torch.randn_like(ac) * self.policy_noise).clamp(
                -self.noise_clip, self.noise_clip)
            nx_ac = self.target_actor.forward(nx_st, noise)
            target_q1, target_q2 = self.target_critic.forward(nx_st, nx_ac)
            min_q = torch.min(target_q1, target_q2)
            target_q = rw + mask * self.gamma * min_q

        q1, q2 = self.critic.forward(st, ac)
        critic_loss = F.mse_loss(q1, target_q) + F.mse_loss(q2, target_q)
        self.critic.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        if self.iteration_num % self.policy_freq == 0:
            # Delayed policy update followed by the target soft updates.
            actor_loss = -self.critic.q1(st, self.actor.forward(st)).mean()
            self.actor.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            self._soft_update(self.critic, self.target_critic)
            self._soft_update(self.actor, self.target_actor)

            self.sync_rollout_actor()

    def _soft_update(self, source, target):
        """Move ``target`` towards ``source`` by a factor of ``self.tau``."""
        for param, target_param in zip(source.parameters(), target.parameters()):
            target_param.data.copy_(self.tau * param.data
                                    + (1 - self.tau) * target_param.data)

    def sync_rollout_actor(self):
        """Copy the training actor's parameters (moved to CPU) into the rollout actor."""
        for param, target_param in zip(self.actor.parameters(),
                                       self.rollout_actor.parameters()):
            target_param.data.copy_(param.data.cpu())

    def save(self, path):
        """Write network and optimizer state dicts into directory ``path``.

        The directory is created when it does not exist yet.
        """
        os.makedirs(path, exist_ok=True)
        torch.save(self.critic.state_dict(), os.path.join(path, 'critic.pth'))
        torch.save(self.target_critic.state_dict(),
                   os.path.join(path, 'target_critic.pth'))
        torch.save(self.critic_optimizer.state_dict(),
                   os.path.join(path, 'critic_optimizer.pth'))

        torch.save(self.actor.state_dict(), os.path.join(path, 'actor.pth'))
        torch.save(self.target_actor.state_dict(),
                   os.path.join(path, 'target_actor.pth'))
        torch.save(self.actor_optimizer.state_dict(),
                   os.path.join(path, 'actor_optimizer.pth'))

    def _load_into(self, obj, path, filename):
        """Load ``filename`` from ``path`` into ``obj``, remapped onto ``self.device``."""
        # map_location lets a checkpoint written on one device be restored
        # on another (e.g. trained on CUDA, evaluated on a CPU-only host).
        state = torch.load(os.path.join(path, filename), map_location=self.device)
        obj.load_state_dict(state)

    def load(self, path):
        """Restore everything written by ``save`` and refresh the rollout actor."""
        self._load_into(self.critic, path, 'critic.pth')
        self._load_into(self.target_critic, path, 'target_critic.pth')
        self._load_into(self.critic_optimizer, path, 'critic_optimizer.pth')

        self._load_into(self.actor, path, 'actor.pth')
        self._load_into(self.target_actor, path, 'target_actor.pth')
        self._load_into(self.actor_optimizer, path, 'actor_optimizer.pth')

        self.sync_rollout_actor()
# Data loader plus the two sampling distributions used by the training code.
trainloader = data.DataLoader(train_data, shuffle=True)
latent_distr = torch.distributions.normal.Normal(0, 1)
uniform_distr = torch.distributions.uniform.Uniform(0, 1)

# Networks
crit, gen = Critic(), Generator()
rep, rep_gen = Representator(), Representator()
for _net in (crit, gen, rep, rep_gen):
    _net.cuda()
del _net

# One Adam optimizer per network; all share betas=(0.0, 0.9).
optimizer = torch.optim.Adam(
    crit.parameters(), lr=0.0004, betas=(0.0, 0.9))  # TTUR
optimizer_gen = torch.optim.Adam(
    gen.parameters(), lr=0.001, betas=(0.0, 0.9))
optimizer_rep = torch.optim.Adam(
    rep.parameters(), lr=0.0001, betas=(0.0, 0.9))
optimizer_rep_gen = torch.optim.Adam(
    rep_gen.parameters(), lr=0.001, betas=(0.0, 0.9))

# exponentially decaying learning rate
scheduler_d, scheduler_g, scheduler_r, scheduler_r_gen = (
    torch.optim.lr_scheduler.ExponentialLR(opt, gamma=0.999)
    for opt in (optimizer, optimizer_gen, optimizer_rep, optimizer_rep_gen)
)
class TD3(object):  # Twin Delayed DDPG
    """Twin Delayed DDPG learner: one actor, a twin critic, and their targets."""

    def __init__(self, state_dim, action_dim, max_action):
        """Create the networks, copy them into the targets and build optimizers.

        Args:
            state_dim: size of the state vector.
            action_dim: size of the action vector.
            max_action: bound used to clamp target actions.
        """
        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters())

        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = Critic(state_dim, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters())

        self.max_action = max_action

    def select_action(self, state):
        """Return the actor's action for a single ``state`` as a flat numpy array."""
        state = torch.Tensor(state.reshape(1, -1)).to(device)
        # Pure inference: no autograd graph is needed here.
        with torch.no_grad():
            return self.actor(state).cpu().data.numpy().flatten()

    def train(self, replay_buffer, iterations, batch_size=100, discount=0.99,
              tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
        """Run ``iterations`` TD3 updates on batches drawn from ``replay_buffer``.

        Args:
            replay_buffer: object whose ``sample(batch_size)`` returns
                (states, next_states, actions, rewards, dones).
            iterations: number of update steps to perform.
            batch_size: transitions per update.
            discount: discount factor.
            tau: polyak averaging factor for the target networks.
            policy_noise: std of the Gaussian noise added to target actions.
            noise_clip: absolute clip applied to that noise.
            policy_freq: actor/targets are updated every ``policy_freq`` steps.
        """
        for it in range(iterations):
            # step 4: we sample a batch of transitions (s, s', a, r) from the memory
            (batch_state, batch_next_state, batch_actions,
             batch_rewards, batch_dones) = replay_buffer.sample(batch_size)
            state = torch.Tensor(batch_state).to(device)
            next_state = torch.Tensor(batch_next_state).to(device)
            action = torch.Tensor(batch_actions).to(device)
            reward = torch.Tensor(batch_rewards).to(device)
            done = torch.Tensor(batch_dones).to(device)

            # The TD target is a constant for the critic update, so it is
            # built without autograd (no gradients reach the target networks).
            with torch.no_grad():
                # step 5: from the next state s', the actor target plays the next action a'
                next_action = self.actor_target(next_state)

                # step 6: we add Gaussian noise to the next action a' and we clamp it
                # in a range of values supported by the environment
                noise = (torch.randn_like(action) * policy_noise).clamp(
                    -noise_clip, noise_clip)
                next_action = (next_action + noise).clamp(-self.max_action,
                                                          self.max_action)

                # step 7: the two critic targets take each the couple (s', a') as input
                # and return 2 Q-values Qt1(s', a') and Qt2(s', a') as output
                target_Q1, target_Q2 = self.critic_target(next_state, next_action)

                # step 8: we keep the minimum of these Q-values min(Qt1, Qt2)
                target_Q = torch.min(target_Q1, target_Q2)

                # step 9: we get the final target of the 2 critic models, which is:
                # Qt = r + (1 - done) * gamma * min(Qt1, Qt2)
                target_Q = reward + (1 - done) * discount * target_Q

            # step 10: the 2 critic models take each the couple (s, a) as input and
            # return 2 Q-values Q1(s, a) and Q2(s, a) as outputs
            current_Q1, current_Q2 = self.critic(state, action)

            # step 11: we compute the loss coming from the 2 critic models:
            # critic loss = mse_loss(Q1(s,a), Qt) + mse_loss(Q2(s,a), Qt)
            critic_loss = (F.mse_loss(current_Q1, target_Q)
                           + F.mse_loss(current_Q2, target_Q))

            # step 12: we backpropagate the critic loss and update the parameters
            # of the 2 critic models with the critic optimizer
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # step 13: once every policy_freq iterations, we update our actor by
            # performing gradient ascent on the output of the first critic model
            if it % policy_freq == 0:
                # deterministic policy gradient DPG
                actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
                self.actor.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # Delay
                # step 14: on the same schedule, we update the weights of the
                # actor target by polyak averaging
                self._polyak(self.actor, self.actor_target, tau)

                # step 15: on the same schedule, we update the weights of the
                # critic target by polyak averaging
                self._polyak(self.critic, self.critic_target, tau)

    @staticmethod
    def _polyak(source, target, tau):
        """Blend ``source`` parameters into ``target`` with weight ``tau``."""
        for param, target_param in zip(source.parameters(), target.parameters()):
            target_param.data.copy_(tau * param.data
                                    + (1 - tau) * target_param.data)

    def save(self, filename, directory):
        """Write the actor and critic state dicts under ``directory``."""
        torch.save(self.actor.state_dict(),
                   '%s/%s_actor.pth' % (directory, filename))
        torch.save(self.critic.state_dict(),
                   '%s/%s_critic.pth' % (directory, filename))

    def load(self, filename, directory):
        """Restore actor and critic from ``save`` output and re-sync the targets.

        Only the online networks are stored on disk, so the targets are reset
        to copies of the loaded weights; otherwise they would keep whatever
        weights they had before the call.
        """
        # Remap tensors onto the device the networks currently live on.
        location = next(self.actor.parameters()).device
        self.actor.load_state_dict(
            torch.load('%s/%s_actor.pth' % (directory, filename),
                       map_location=location))
        self.critic.load_state_dict(
            torch.load('%s/%s_critic.pth' % (directory, filename),
                       map_location=location))
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())
# Networks
crit = Critic()
gen = Generator(latent_size)
classifier = models.vgg13(pretrained=False)
# adjust final layer to handle 10 classes
classifier.classifier._modules['6'] = torch.nn.Linear(in_features=4096,
                                                      out_features=10)
classifier.train()

for _net in (crit, gen, classifier):
    _net.cuda()
del _net

# Loss functions
adversarial_loss = torch.nn.BCELoss()
neg_logl = torch.nn.NLLLoss()

# One Adam optimizer per network, all with identical hyper-parameters.
optimizer, optimizer_gen, optimizer_classifier = (
    torch.optim.Adam(net.parameters(), lr=0.0001, betas=(0.5, 0.999))
    for net in (crit, gen, classifier)
)

# Matching exponentially decaying learning-rate schedules.
scheduler_d, scheduler_g, scheduler_c = (
    torch.optim.lr_scheduler.ExponentialLR(opt, gamma=0.999)
    for opt in (optimizer, optimizer_gen, optimizer_classifier)
)

t = 0
device = 'cuda'

# Images become tensors and are normalised with mean 0.5 / std 0.5.
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))]
)
dataloader = DataLoader(
    dataset=MNIST('.', download=True, transform=transform),
    batch_size=batch_size,
    shuffle=True,
)

# Generator and critic, each with its own Adam optimizer.
gen = Generator(z_dim).to(device)
gen_opt = torch.optim.Adam(params=gen.parameters(), lr=lr,
                           betas=(beta_1, beta_2))
crit = Critic().to(device)
crit_opt = torch.optim.Adam(params=crit.parameters(), lr=lr,
                            betas=(beta_1, beta_2))

# Training bookkeeping
cur_step = 0
generator_losses, critic_losses = [], []

for epoch in range(n_epochs):
    # Dataloader returns the batches
    for real, _ in tqdm(dataloader):
        cur_batch_size = real.shape[0]
        real = real.to(device)

        mean_iteration_critic_loss = 0
        for _ in range(crit_repeats):
            ### Update critic ###
            crit_opt.zero_grad()
            fake_noise = get_noise(cur_batch_size, z_dim, device=device)
class Agent:
    """DDPG-style agent that owns its environment and its training loop.

    The actor takes a 2-dimensional state and the critic a 3-dimensional
    (state, action) input; both sizes are fixed in ``__init__``.
    """

    def __init__(self,
                 env,
                 alpha: float = 1e-3,
                 gamma: float = 0.99,
                 hidden_size: int = 32,
                 tau: float = 1e-3):
        """Create actor/critic networks, their target copies and optimizers.

        Args:
            env: environment exposing ``reset``, ``step``, ``render``, ``close``.
            alpha: learning rate of both Adam optimizers.
            gamma: discount factor.
            hidden_size: hidden layer width passed to ``Actor`` and ``Critic``.
            tau: interpolation factor of the target soft update.
        """
        self.env = env
        self.gamma = gamma
        self.alpha = alpha
        self.tau = tau
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        self.actor = Actor(2, hidden_size, 1)
        self.actor_target = deepcopy(self.actor)
        self.critic = Critic(3, hidden_size, 1)
        self.critic_target = deepcopy(self.critic)

        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=alpha)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=alpha)

        self.critic.to(self.device)
        self.critic_target.to(self.device)
        self.actor.to(self.device)
        self.actor_target.to(self.device)

    def _td_target(self, reward, next_state, done):
        """Return ``r + gamma * Q_target(s', actor_target(s'))``, with the
        bootstrapped term zeroed where ``done`` is true."""
        with torch.no_grad():
            q_next = self.critic_target(next_state, self.actor_target(next_state))
            q_next[done] = 0
            # reshape so a flat reward vector lines up with the critic output
            return reward.reshape(q_next.shape) + self.gamma * q_next

    def update_critic(self, batch):
        """Take one gradient step on the critic and soft-update its target.

        ``batch`` is ``(state, action, next_state, reward, done)`` as stored by
        ``train``. The regression target is the one-step TD target; the
        earlier version dropped the reward and the discount and regressed
        onto Q(s', a') alone.
        """
        state, action, next_state, reward, done = batch
        state, next_state, action, reward = map(
            lambda item: torch.tensor(item).to(self.device).float(),
            (state, next_state, action, reward))
        done = torch.tensor(done).to(self.device)

        q_target = self._td_target(reward, next_state, done)

        loss = F.mse_loss(self.critic(state, action), q_target)

        self.critic_optimizer.zero_grad()
        loss.backward()
        grad_clamp(self.critic)
        self.critic_optimizer.step()

        self.soft_update(self.critic, self.critic_target)

    def update_actor(self, batch):
        """Take one gradient step on the actor and soft-update its target.

        Only the states of ``batch`` are used; the loss is the negated mean
        critic value of the actor's actions.
        """
        state, *_ = batch
        state = torch.tensor(state).to(self.device).float()

        loss = -torch.mean(self.critic(state, self.actor(state)))

        self.actor_optimizer.zero_grad()
        loss.backward()
        grad_clamp(self.actor)
        self.actor_optimizer.step()

        self.soft_update(self.actor, self.actor_target)

    def act(self, state):
        """Return the actor's action for ``state`` as a numpy array (no noise)."""
        with torch.no_grad():
            state_ = torch.tensor(state).to(self.device).float()
            return self.actor(state_).cpu().numpy()

    def reset(self):
        """Reset the environment and return its initial state."""
        return self.env.reset()

    def train(self,
              transitions: int,
              sigma_max: float = 1.,
              sigma_min: float = 0.,
              buffer_size: int = 10000,
              batch_size: int = 128,
              progress_upd_step: int = None,
              start_training: int = 1000,
              shaping_coef: float = 300.):
        """Interact with the environment for ``transitions`` steps while learning.

        Args:
            transitions: number of environment steps to take.
            sigma_max: exploration-noise std at the first step.
            sigma_min: std the noise is linearly annealed towards.
            buffer_size: capacity of the replay buffer.
            batch_size: batch size used for every update.
            progress_upd_step: evaluate the policy every this many steps;
                defaults to ``transitions // 100`` (at least 1).
            start_training: updates start once the step index exceeds this.
            shaping_coef: weight of the shaping term added to the reward.

        Returns:
            dict with the hyper-parameters and the lists ``step``,
            ``reward_mean`` and ``reward_std`` collected at each evaluation.
        """
        history = ReplayBuffer(buffer_size)
        if not progress_upd_step:
            # max(1, ...) keeps the modulo below valid when transitions < 100
            progress_upd_step = max(1, transitions // 100)

        log = {
            "alpha": self.alpha,
            "gamma": self.gamma,
            "sigma_max": sigma_max,
            "sigma_min": sigma_min,
            "buffer_size": buffer_size,
            "batch_size": batch_size,
            "tau": self.tau,
            "shaping_coef": shaping_coef,
            "step": [],
            "reward_mean": [],
            "reward_std": []
        }

        state = self.reset()
        t = tqdm(range(transitions))
        for i in t:
            # linearly anneal the exploration noise from sigma_max to sigma_min
            sigma = sigma_max - (sigma_max - sigma_min) * i / transitions
            action = self.act(state)
            noise = np.random.normal(scale=sigma, size=action.shape)
            action = np.clip(action + noise, -1, 1)
            next_state, reward, done, _ = self.env.step(action)
            # shaping term built from the second state component
            reward += shaping_coef * (self.gamma * np.abs(next_state[1])
                                      - np.abs(state[1]))
            # the stored terminal flag is derived from the first state
            # component, not from the environment's ``done``
            done_ = next_state[0] >= 0.5

            history.add((state, action, next_state, reward, done_))

            state = self.reset() if done else next_state

            if i > start_training:
                batch = history.sample(batch_size)
                self.update_critic(batch)
                self.update_actor(batch)

            if (i + 1) % progress_upd_step == 0:
                reward_mean, reward_std = self.evaluate_policy()

                log["step"].append(i)
                log["reward_mean"].append(reward_mean)
                log["reward_std"].append(reward_std)

                t.set_description(
                    f"step: {i + 1} | Rmean = {reward_mean:0.4f} | Rstd = {reward_std:0.4f}"
                )

        return log

    def soft_update(self, model, target):
        """Update ``target`` in place: ``target = (1 - tau) * target + tau * model``."""
        with torch.no_grad():
            for param, param_target in zip(model.parameters(),
                                           target.parameters()):
                param_target.data.mul_(1 - self.tau)
                param_target.data.add_(self.tau * param.data)

    def rollout(self, to_render: bool = False):
        """Play one episode with the noise-free policy and return its total reward."""
        done = False
        state = self.reset()
        total_reward = 0

        while not done:
            state, reward, done, _ = self.env.step(self.act(state))
            total_reward += reward
            if to_render:
                self.env.render()

        self.env.close()
        return total_reward

    def evaluate_policy(self, episodes: int = 5, to_render: bool = False):
        """Return mean and std of the total reward over ``episodes`` rollouts."""
        rewards = [self.rollout(to_render=to_render) for _ in range(episodes)]
        return np.mean(rewards), np.std(rewards)
class DDPGAgent():
    """DDPG agent with local/target actor and critic, replay memory and noise."""

    def __init__(self, seed, n_state, n_action, batch_size=64, buffer=1e5,
                 gamma=0.99, lr_actor=1e-4, lr_critic=1e-3, weight_decay=0,
                 tau=1e-3):
        """Build networks, optimizers, replay memory and the noise process.

        Args:
            seed: seed forwarded to the networks, the memory and the noise.
            n_state: size of the state vector.
            n_action: size of the action vector.
            batch_size: number of events sampled per learning step.
            buffer: replay memory capacity (cast to int).
            gamma: discount factor.
            lr_actor: actor learning rate.
            lr_critic: critic learning rate.
            weight_decay: weight decay of the critic optimizer.
            tau: interpolation factor of the target soft update.
        """
        self.batch_size = batch_size

        # init actor
        self.local_actor = Actor(n_state, n_action, seed).to(device)
        self.target_actor = Actor(n_state, n_action, seed).to(device)
        self.optim_actor = torch.optim.Adam(self.local_actor.parameters(),
                                            lr=lr_actor)
        # init critic
        self.local_critic = Critic(n_state, n_action, seed).to(device)
        self.target_critic = Critic(n_state, n_action, seed).to(device)
        self.optim_critic = torch.optim.Adam(self.local_critic.parameters(),
                                             lr=lr_critic,
                                             weight_decay=weight_decay)

        # Start each target as an exact copy of its local network instead of
        # relying on identical seeding inside the model classes.
        self.target_actor.load_state_dict(self.local_actor.state_dict())
        self.target_critic.load_state_dict(self.local_critic.state_dict())

        # init memory
        self.memory = memory(int(buffer), device, seed)
        self.tau = tau
        self.gamma = gamma
        self.noise = noise(n_action, seed=seed)

    def step(self, state, action, reward, next_state, done):
        """Store one transition in memory and run a learning step."""
        event = Event(state, action, reward, next_state, done)
        self.memory.add(event)
        self.learn()

    def act(self, state, add_noise=True):
        """Return the local actor's action for ``state``, clipped to [-1, 1].

        Args:
            state: numpy observation.
            add_noise: when True (the default, matching the previous
                behaviour) a sample of the noise process is added.
        """
        state = torch.from_numpy(state).float().to(device)
        self.local_actor.eval()
        with torch.no_grad():
            action = self.local_actor(state).cpu().data.numpy()
        self.local_actor.train()
        if add_noise:
            action += self.noise.make()
        return np.clip(action, -1, 1)

    def reset(self):
        """Reset the noise process."""
        self.noise.reset()

    def learn(self):
        """Update both actor and critic networks.

        Does nothing when the memory returns ``None`` for the requested batch.
        """
        event_batch = self.memory.sample(self.batch_size)
        if event_batch is None:
            return
        event_batch = self.memory.deserialize(event_batch)
        self.update_critic(event_batch)
        self.update_actor(event_batch)
        self.update_target(self.local_actor, self.target_actor)
        self.update_target(self.local_critic, self.target_critic)

    def update_critic(self, batch):
        """One TD step on the local critic.

        ``batch`` exposes ``states``, ``actions``, ``rewards``,
        ``states_next`` and ``dones``.
        """
        # t
        expected_Q = self.local_critic(batch.states, batch.actions)
        # t+1: the target is a constant for this update, so it is built
        # without autograd and no gradients reach the target networks.
        with torch.no_grad():
            actions_pred = self.target_actor(batch.states_next)
            target_Q_next = self.target_critic(batch.states_next, actions_pred)
            # (1 - dones) removes the bootstrapped value on terminal steps
            target_Q = batch.rewards + (self.gamma * target_Q_next
                                        * (1 - batch.dones))
        loss = nn.functional.mse_loss(expected_Q, target_Q)
        self.optim_critic.zero_grad()
        loss.backward()
        self.optim_critic.step()

    def update_actor(self, batch):
        """One policy-gradient step: maximise the critic value of the actor's actions."""
        actions_predicted = self.local_actor(batch.states)
        loss = -self.local_critic(batch.states, actions_predicted).mean()
        self.optim_actor.zero_grad()
        loss.backward()
        self.optim_actor.step()

    def update_target(self, local, target):
        """Soft update: ``target = tau * local + (1 - tau) * target``."""
        for target_param, local_param in zip(target.parameters(),
                                             local.parameters()):
            target_param.data.copy_(self.tau * local_param.data
                                    + (1.0 - self.tau) * target_param.data)
class Agent:
    """Actor-critic agent with control/target networks and a prioritised replay buffer.

    Priorities are the absolute Bellman errors (plus 1e-3); they are computed
    when a transition is stored and refreshed after each learning step.
    """

    def __init__(self,
                 device,
                 state_size,
                 action_size,
                 buffer_size=10,
                 batch_size=10,
                 actor_learning_rate=1e-4,
                 critic_learning_rate=1e-3,
                 discount_rate=0.99,
                 tau=0.1,
                 steps_per_update=4,
                 action_range=None,
                 dropout_p=0.0,
                 weight_decay=0.0001,
                 noise_max=0.2,
                 noise_decay=1.0,
                 n_agents=1
                 ):
        """Build networks, optimizers, the replay buffer and the noise process.

        Args:
            device: torch device used for networks and tensors.
            state_size: size of the state vector.
            action_size: size of the action vector.
            buffer_size: replay buffer capacity.
            batch_size: sampled batch size; also the minimum buffer fill
                required before learning.
            actor_learning_rate: actor learning rate.
            critic_learning_rate: critic learning rate.
            discount_rate: discount factor.
            tau: interpolation factor of the target soft update.
            steps_per_update: ``learn`` runs every this many ``step`` calls.
            action_range: forwarded to ``Actor``.
            dropout_p: dropout probability set on the control networks.
            weight_decay: weight decay of both optimizers.
            noise_max: initial sigma of the noise process.
            noise_decay: factor applied to sigma at the end of each episode.
            n_agents: number of parallel agents (first dimension of the noise).
        """
        self.device: torch.device = device
        self.state_size = state_size
        self.action_size = action_size

        self.critic_control = Critic(state_size, action_size).to(device)
        self.critic_control.dropout.p = dropout_p
        self.critic_target = Critic(state_size, action_size).to(device)
        # The target starts as an exact copy of the control network; without
        # this the two are initialised independently.
        self.critic_target.load_state_dict(self.critic_control.state_dict())
        self.critic_target.eval()
        self.critic_optimizer = torch.optim.Adam(
            self.critic_control.parameters(),
            weight_decay=weight_decay,
            lr=critic_learning_rate)

        self.actor_control = Actor(state_size, action_size, action_range).to(
            device)
        self.actor_control.dropout.p = dropout_p
        self.actor_target = Actor(state_size, action_size, action_range).to(
            device)
        self.actor_target.load_state_dict(self.actor_control.state_dict())
        self.actor_target.eval()
        self.actor_optimizer = torch.optim.Adam(
            self.actor_control.parameters(),
            weight_decay=weight_decay,
            lr=actor_learning_rate)

        self.batch_size = batch_size
        self.min_buffer_size = batch_size
        self.replay_buffer = ReplayBuffer(device, state_size, action_size,
                                          buffer_size)

        self.discount_rate = discount_rate
        self.tau = tau

        self.step_count = 0
        self.steps_per_update = steps_per_update

        self.noise_max = noise_max
        self.noise = OUNoise([n_agents, action_size], 15071988,
                             sigma=self.noise_max)
        self.noise_decay = noise_decay
        self.last_score = float('-inf')

    def policy(self, state, add_noise=True):
        """Return the control actor's action for ``state``, optionally with noise."""
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_control.eval()
        with torch.no_grad():
            action = self.actor_control(state).cpu().numpy()
        self.actor_control.train()
        if add_noise:
            noise = self.noise.sample()
            action += noise
        return action

    def step(self, state, action, reward, next_state, done):
        """Store one transition per row of ``state`` and learn on schedule."""
        p = self.calculate_p(state, action, reward, next_state, done)

        for i in range(state.shape[0]):
            self.replay_buffer.add(state[i, :], action[i, :], reward[i],
                                   next_state[i, :], done[i], p[i])
        if self.step_count % self.steps_per_update == 0:
            self.learn()
        self.step_count += 1

    def learn(self):
        """Run one critic step, one actor step, soft-update the targets and
        refresh the priorities of the sampled transitions.

        Returns early while the buffer holds fewer than ``min_buffer_size`` items.
        """
        if len(self.replay_buffer) < self.min_buffer_size:
            return
        indices, (states, actions, rewards, next_states, dones, p) = \
            self.replay_buffer.sample(self.batch_size)

        # The actor is in eval mode while the TD error is computed.
        self.actor_control.eval()
        error = self.bellman_eqn_error(
            states, actions, rewards, next_states, dones)
        self.actor_control.train()

        # Importance-sampling weights, normalised by their maximum.
        importance_scaling = (self.replay_buffer.buffer_size * p) ** -1
        importance_scaling /= importance_scaling.max()

        self.critic_optimizer.zero_grad()
        loss = (importance_scaling * (error ** 2)).sum() / self.batch_size
        loss.backward()
        self.critic_optimizer.step()

        self.actor_optimizer.zero_grad()
        expected_actions = self.actor_control(states)
        critic_score = self.critic_control(states, expected_actions)
        loss = -1 * (importance_scaling * critic_score).sum() / self.batch_size
        loss.backward()
        self.actor_optimizer.step()

        self.update_target(self.critic_control, self.critic_target)
        self.update_target(self.actor_control, self.actor_target)

        self.replay_buffer.update(indices, error.detach().abs().cpu() + 1e-3)

    def bellman_eqn_error(self, states, actions, rewards, next_states, dones):
        """Return ``Q_control(s, a) - (r + discount * (1 - done) * Q_target(s', a'))``.

        Double-DQN style: the control actor picks the next action ``a'`` and
        the target critic evaluates it. The target side is computed without
        autograd, so gradients only flow through ``Q_control(s, a)``.
        """
        with torch.no_grad():
            next_actions = self.actor_control(next_states)
            target_action_values = self.critic_target(next_states, next_actions)
            target_rewards = (
                rewards
                + self.discount_rate * (1 - dones) * target_action_values
            )

        current_rewards = self.critic_control(states, actions)
        error = current_rewards - target_rewards
        return error

    def calculate_p(self, state, action, reward, next_state, done):
        """Return the initial priorities (|Bellman error| + 1e-3) for a batch of
        numpy transitions; ``reward`` and ``done`` get a trailing dimension."""
        next_state = torch.from_numpy(next_state).float().to(
            self.device)
        state = torch.from_numpy(state).float().to(self.device)
        action = torch.from_numpy(action).float().to(self.device)
        reward = torch.from_numpy(reward).float().to(self.device)
        done = torch.from_numpy(done).float().to(
            self.device)

        done = done.unsqueeze(1)
        reward = reward.unsqueeze(1)

        self.actor_control.eval()
        self.critic_control.eval()
        with torch.no_grad():
            retval = abs(
                self.bellman_eqn_error(state, action, reward, next_state,
                                       done)) + 1e-3
        self.critic_control.train()
        self.actor_control.train()
        return retval

    def update_target(self, control, target):
        """Soft update: ``target = tau * control + (1 - tau) * target``."""
        for target_param, control_param in zip(
                target.parameters(),
                control.parameters()):
            target_param.data.copy_(
                self.tau * control_param.data
                + (1.0 - self.tau) * target_param.data)

    def end_of_episode(self, final_score):
        """Reset the step counter and noise, decay sigma and record the score."""
        self.step_count = 0

        self.noise.sigma *= self.noise_decay
        self.last_score = final_score
        self.noise.reset()

    def save(self, path):
        """Save the control networks to ``path + '-critic.p'`` / ``path + '-actor.p'``."""
        torch.save(self.critic_control.state_dict(), path + '-critic.p')
        torch.save(self.actor_control.state_dict(), path + '-actor.p')

    def restore(self, path):
        """Load the control networks written by ``save`` and re-sync the targets.

        Only the control networks are stored, so the targets are reset to
        copies of the restored weights instead of keeping stale ones.
        """
        self.critic_control.load_state_dict(
            torch.load(path + '-critic.p', map_location='cpu'))
        self.actor_control.load_state_dict(
            torch.load(path + '-actor.p', map_location='cpu'))
        self.critic_target.load_state_dict(self.critic_control.state_dict())
        self.actor_target.load_state_dict(self.actor_control.state_dict())
def train():
    """Train the actor and the critic (Q function) on the 'train' data set.

    For every batch returned by the reader, the actor is run on each sentence,
    the resulting transitions are pushed into an experience buffer, the critic
    is fitted on a random sample of that buffer, and the actor is then updated
    through ``shared_loss``. The buffer is cleared after each batch. Models
    are saved every 100 epochs.
    """
    experiences_buffer = deque(maxlen=config.MAX_EXPERIENCES_SIZE)
    word2vec = LightWord2Vec()
    lang = Lang(word2vec.get_vocab())
    actor = ActorCopy(config.EMBEDDING_SIZE, config.STATE_SIZE, lang, word2vec)
    critic = Critic(config.STATE_SIZE, config.EMBEDDING_SIZE,
                    config.CRITIC_HIDDEN_SIZE)
    reader = DataSetReader('train')
    critic_optimizer = torch.optim.Adam(critic.parameters())
    critic_criterion = torch.nn.MSELoss()
    actor_optimizer = torch.optim.Adam(actor.parameters())
    if LOAD_INDEX > -1:
        actor, critic, critic_optimizer, critic_criterion, actor_optimizer, lang = load_model(
            LOAD_INDEX)

    if torch.cuda.is_available():
        actor.cuda()
        critic.cuda()

    # Stays None until the first update so the final print cannot hit an
    # unbound name when the reader yields no data.
    loss = None
    for epoch in range(LOAD_INDEX + 1, config.EPOCHS):
        # NOTE(review): the nesting below (one update per reader batch) was
        # reconstructed from a whitespace-collapsed source; confirm.
        for x, y in reader.read(config.TRAIN_BATCH_SIZE):
            # training actor
            for sentence, target_sentence in zip(x, y):
                states, actions, probs = actor(
                    sentence, get_possible_actions(lang, sentence))
                predicted_sentence = actions[:-1]  # Skip None
                rewards = [
                    sari_reward(sentence[:i + 1], predicted_sentence[:i + 1],
                                target_sentence[:i + 1])
                    for i in range(
                        max(len(target_sentence), len(predicted_sentence)))
                ] + [0]
                for i in range(len(states) - 1):
                    # appendleft instead of insert(0, ...): insert raises
                    # IndexError once a bounded deque is full, appendleft
                    # drops the oldest entry from the other end.
                    experiences_buffer.appendleft(
                        Experience(states[i], actions[i], states[i + 1],
                                   rewards[i], probs[i], sentence))

            exp_length = min(len(experiences_buffer), config.Q_BATCH_SIZE)
            if exp_length == 0:
                # nothing was collected for this batch, so nothing to fit
                continue

            # training q function
            q_estimated = []
            # One target per sampled experience, on the critic's device so
            # the in-place updates below do not mix devices.
            q_s = torch.zeros(exp_length, 1,
                              device=next(critic.parameters()).device)
            for idx in range(exp_length):
                # NOTE(review): only the first exp_length (most recent)
                # entries can be sampled here - confirm this is intended.
                exp = experiences_buffer[random.randint(0, exp_length - 1)]
                action_emb = word2vec[exp.action]
                q_estimated.append(critic(exp.state, action_emb)[0, 0])
                q_s[idx] = exp.reward
                if exp.next_state is not None:
                    with torch.no_grad():
                        q_s[idx] += (config.GAMMA * max([
                            critic(exp.next_state, word2vec[action])
                            for action in get_possible_actions(
                                lang, exp.sentence)
                        ]))[0][0]
            # stack (not cat): the collected estimates are 0-dim tensors
            q_estimated = torch.stack(q_estimated).view(-1, 1)

            critic_optimizer.zero_grad()
            loss = critic_criterion(q_s, q_estimated)
            # The graph is retained because q_estimated is reused by
            # shared_loss below.
            loss.backward(retain_graph=True)
            critic_optimizer.step()

            # updating seq2seq model
            actor_optimizer.zero_grad()
            loss = shared_loss(experiences_buffer, q_estimated)
            loss.backward()
            actor_optimizer.step()

            experiences_buffer.clear()
            actor.zero_grad()
            critic.zero_grad()

        if epoch % 100 == 0:
            save_model(epoch, actor, critic, critic_optimizer,
                       critic_criterion, actor_optimizer, lang)
        if loss is not None:
            print("Finished epoch:", epoch, " loss is ", torch.sum(loss))
class Agent():
    """Main DDPG agent that extracts experiences and learns from them"""

    def __init__(self, state_size, action_size):
        """
        Initializes Agent object.
        @Param:
        1. state_size: dimension of each state.
        2. action_size: dimension of each action.
        """
        self.state_size = state_size
        self.action_size = action_size

        # Actor network
        self.actor_local = Actor(self.state_size, self.action_size).to(device)  # local model
        self.actor_target = Actor(self.state_size, self.action_size).to(device)  # target model, used for the TD target
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)  # Adam optimizer for the actor

        # Critic network
        self.critic_local = Critic(self.state_size, self.action_size).to(device)  # local model
        self.critic_target = Critic(self.state_size, self.action_size).to(device)  # target model, used for the TD target
        # Adam optimizer for the critic; weight_decay adds L2 regularization
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Start each target as an exact copy of its local network.
        self.actor_target.load_state_dict(self.actor_local.state_dict())
        self.critic_target.load_state_dict(self.critic_local.state_dict())

        # Noise process
        self.noise = OUNoise(action_size)  # Ornstein-Uhlenbeck process

        # Replay memory
        self.memory = ReplayBuffer(self.action_size, BUFFER_SIZE, MINI_BATCH)  # experience replay buffer

    def step(self, state, action, reward, next_state, done):
        """
        Saves an experience in the replay memory and learns from a random sample of it.
        @Param:
        1. state: current state, S.
        2. action: action taken based on current state.
        3. reward: immediate reward from state, action.
        4. next_state: next state, S', from action, a.
        5. done: (bool) has the episode terminated?
        """
        self.memory.add(state, action, reward, next_state, done)  # append to memory buffer
        # Learn only once the buffer holds more than MINI_BATCH samples;
        # until then keep collecting.
        if len(self.memory) > MINI_BATCH:
            experience = self.memory.sample()
            self.learn(experience)

    def reset(self):
        """Resets the noise process to mean"""
        self.noise.reset()

    def act(self, state, add_noise=True):
        """
        Returns the actor's action for the current state.
        @Param:
        1. state: current state, S.
        2. add_noise: (bool) add exploration noise to the action, default = True (training mode)
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)  # to a batched torch.Tensor
        self.actor_local.eval()  # set in evaluation mode
        with torch.no_grad():  # disable gradient tracking for inference
            action = self.actor_local(state).cpu().data.numpy()  # deterministic action from the actor's forward pass
        self.actor_local.train()  # set training mode

        # In training mode (add_noise = True) noise is added for exploration.
        if add_noise:
            action += self.noise.sample()
        return action

    def learn(self, experiences, gamma=GAMMA):
        """
        Learn from a batch of experiences sampled from the replay buffer.
        Updates policy and value parameters accordingly.
        @Param:
        1. experiences: (Tuple[torch.Tensor]) tuple of (s, a, r, s', done)
        2. gamma: discount factor, GAMMA by default.
        """
        # Source from: Udacity/DRL
        # Unpack the batch into (state, action, reward, next_state, done)
        states, actions, rewards, next_states, dones = experiences

        # Update Critic network
        # The TD target is a constant for the critic update, so it is built
        # without autograd and no gradients reach the target networks.
        with torch.no_grad():
            actions_next = self.actor_target(next_states)  # next-state actions from the target actor
            Q_targets_next = self.critic_target(next_states, actions_next)
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))  # r + γ * Q_target(s', a')
        # Compute critic loss using MSE
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Update Actor Network
        # Compute actor loss
        actions_pred = self.actor_local(states)  # gets mu(s)
        actor_loss = -self.critic_local(states, actions_pred).mean()  # -mean Q(s, mu(s))
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters. Called once per learning step.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
class DDPG:
    """DDPG agent configured through a ``hyperparams`` object.

    ``hyperparams`` supplies ``alpha_actor``, ``alpha_critic``,
    ``weight_decay``, ``buffer_size``, ``batch_size``, ``mu``, ``theta``,
    ``sigma``, ``gamma`` and ``tau``.
    """

    def __init__(self, state_size, action_size, random_seed, hyperparams):
        """Build networks, optimizers, replay buffer and the noise process.

        Args:
            state_size: size of the state vector.
            action_size: size of the action vector.
            random_seed: seed for Python's RNG, the networks, buffer and noise.
            hyperparams: object carrying the training hyper-parameters.
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed first and keep the value itself.
        random.seed(random_seed)
        self.seed = random_seed
        self.hyperparams = hyperparams

        self.actor = Actor(state_size, action_size, random_seed).to(device)
        # NOTE(review): actor_noise is only read by select_action(nn_noise=True)
        # and is never updated anywhere in this class.
        self.actor_noise = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optim = optim.Adam(self.actor.parameters(),
                                      lr=hyperparams.alpha_actor)

        self.critic = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optim = optim.Adam(
            self.critic.parameters(),
            lr=hyperparams.alpha_critic,
            weight_decay=hyperparams.weight_decay,
        )

        # Start each target as an exact copy of its online network.
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())

        self.replay_buffer = ReplayBuffer(hyperparams.buffer_size,
                                          hyperparams.batch_size, random_seed)
        self.noise = OUNoise(
            action_size,
            random_seed,
            self.hyperparams.mu,
            self.hyperparams.theta,
            self.hyperparams.sigma,
        )

    def step(self, state, action, reward, next_state, done):
        """Store a transition and learn once the buffer exceeds the batch size."""
        self.replay_buffer.add(state, action, reward, next_state, done)
        if len(self.replay_buffer) > self.hyperparams.batch_size:
            observations = self.replay_buffer.sample()
            self.update_params(observations)

    def select_action(self, state, train=True, nn_noise=False):
        """Return an action for ``state``, clipped to [-1, 1].

        Args:
            state: numpy observation.
            train: add a sample of the noise process to the action.
            nn_noise: use ``actor_noise`` instead of ``actor``.
        """
        state = torch.from_numpy(state).to(dtype=torch.float32, device=device)
        self.actor.eval()
        # Inference only: skip building an autograd graph.
        with torch.no_grad():
            if nn_noise:
                action = self.actor_noise(state).cpu().data.numpy()
            else:
                action = self.actor(state).cpu().data.numpy()
        self.actor.train()
        if train:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        """Reset the noise process state."""
        self.noise.reset_state()

    def _soft_update(self, source, target):
        """Soft update: ``target = tau * source + (1 - tau) * target``."""
        tau = self.hyperparams.tau
        for target_param, local_param in zip(target.parameters(),
                                             source.parameters()):
            target_param.data.copy_(tau * local_param.data
                                    + (1.0 - tau) * target_param.data)

    def update_params(self, observations):
        """Run one critic step (L1 loss), one actor step and the soft updates.

        Args:
            observations: tuple ``(states, actions, rewards, next_states, dones)``.
        """
        states, actions, rewards, next_states, dones = observations

        # The TD target is a constant for the critic update, so it is built
        # without autograd and no gradients reach the target networks.
        with torch.no_grad():
            next_actions = self.actor_target(next_states)
            next_Q_values = self.critic_target(next_states, next_actions)
            Q_values = rewards + (self.hyperparams.gamma * next_Q_values
                                  * (1 - dones))

        expected_Q = self.critic(states, actions)
        Q_values_loss = F.l1_loss(expected_Q, Q_values)
        self.critic_optim.zero_grad()
        Q_values_loss.backward()
        self.critic_optim.step()

        policy_loss = -self.critic(states, self.actor(states))
        policy_loss = policy_loss.mean()
        self.actor_optim.zero_grad()
        policy_loss.backward()
        self.actor_optim.step()

        self._soft_update(self.critic, self.critic_target)
        self._soft_update(self.actor, self.actor_target)
class Agent:
    """DDPG-style agent with epsilon-random exploration and a replay memory.

    Relies on the module-level names BUFFER_SIZE, BATCH_SIZE, UPDATES_PER_STEP,
    PROBABILITY_RAND_STEP, GAMMA, TAU, ACTOR_PATH, CRITIC_PATH and ``device``.
    """

    def __init__(self, state_size, action_size):
        """Create local/target actor and critic networks, optimizers and memory.

        Args:
            state_size: Size of the state vector, forwarded to Actor/Critic.
            action_size: Size of the action vector, forwarded to Actor/Critic.
        """
        self._state_size = state_size
        self._action_size = action_size

        # Actor network
        self._actor_local = Actor(state_size, action_size).to(device)
        self._actor_target = Actor(state_size, action_size).to(device)
        self._actor_optimizer = optim.Adam(self._actor_local.parameters())

        # Critic network
        self._critic_local = Critic(state_size, action_size).to(device)
        self._critic_target = Critic(state_size, action_size).to(device)
        self._critic_optimizer = optim.Adam(self._critic_local.parameters())

        # Memory
        self._memory = Memory(BUFFER_SIZE)

        # Start the targets from the same weights as the local networks.
        self.hard_update(self._actor_local, self._actor_target)
        self.hard_update(self._critic_local, self._critic_target)

    def step(self, state, action, reward, next_state, done):
        """Store a transition, then learn UPDATES_PER_STEP times once memory exceeds BATCH_SIZE."""
        self._memory.push((state, action, reward, next_state, done))
        if len(self._memory) > BATCH_SIZE:
            for _ in range(UPDATES_PER_STEP):
                samples = self._memory.sample(BATCH_SIZE)
                self.learn(samples)

    def act(self, state):
        """Return an action in [-1, 1]: random with probability PROBABILITY_RAND_STEP, else from the actor.

        Args:
            state: NumPy array holding the observation.
        """
        if binom.rvs(1, PROBABILITY_RAND_STEP):
            # uniform(loc, scale) samples from [loc, loc + scale], so covering
            # [-1, 1] needs scale=2. One value is drawn per action dimension.
            action = uniform(loc=-1, scale=2).rvs(size=self._action_size)
        else:
            state = torch.from_numpy(state).float().to(device)
            self._actor_local.eval()
            with torch.no_grad():
                action = self._actor_local(state).cpu().data.numpy()
            self._actor_local.train()
        return np.clip(action, -1, 1)

    def hard_update(self, local, target):
        """Copy every parameter of ``local`` into ``target``."""
        for target_param, local_param in zip(target.parameters(), local.parameters()):
            target_param.data.copy_(local_param.data)

    def soft_update(self, local, target, tau):
        """Blend ``local`` into ``target``: target = tau * local + (1 - tau) * target."""
        for target_param, local_param in zip(target.parameters(), local.parameters()):
            target_param.data.copy_(tau * local_param.data + (1 - tau) * target_param.data)

    def learn(self, samples, gamma=None, tau=None):
        """Run one critic update, one actor update and soft-update both targets.

        Args:
            samples: Tuple ``(states, actions, rewards, next_states, dones)``
                of tensors, as unpacked below.
            gamma: Discount factor; defaults to the module constant GAMMA.
            tau: Soft-update rate; defaults to the module constant TAU.
        """
        gamma = GAMMA if gamma is None else gamma
        tau = TAU if tau is None else tau
        states, actions, rewards, next_states, dones = samples

        # The TD target is a constant for the critic loss. Building it under
        # no_grad keeps backward() from propagating into the target networks.
        with torch.no_grad():
            actions_next = self._actor_target(next_states)
            q_targets_next = self._critic_target(next_states, actions_next)
            q_targets = rewards + (gamma * q_targets_next * (1 - dones))

        q_expected = self._critic_local(states, actions)
        critic_loss = F.mse_loss(q_expected, q_targets)
        self._critic_optimizer.zero_grad()
        critic_loss.backward()
        self._critic_optimizer.step()

        # Actor ascends the critic's estimate of its own actions.
        actions_pred = self._actor_local(states)
        actor_loss = -self._critic_local(states, actions_pred).mean()
        self._actor_optimizer.zero_grad()
        actor_loss.backward()
        self._actor_optimizer.step()

        self.soft_update(self._critic_local, self._critic_target, tau)
        self.soft_update(self._actor_local, self._actor_target, tau)

    def save(self):
        """Write the local actor and critic weights to ACTOR_PATH and CRITIC_PATH."""
        torch.save(self._actor_local.state_dict(), ACTOR_PATH)
        torch.save(self._critic_local.state_dict(), CRITIC_PATH)

    def load(self):
        """Load local actor/critic weights from disk, sync the targets and switch to eval mode."""
        # map_location lets a checkpoint saved on another device (e.g. GPU)
        # load onto whatever ``device`` this process uses.
        self._actor_local.load_state_dict(torch.load(ACTOR_PATH, map_location=device))
        self._actor_local.eval()
        self._critic_local.load_state_dict(torch.load(CRITIC_PATH, map_location=device))
        self._critic_local.eval()
        # Only local weights are saved; bring the targets in line so training
        # resumed after load() does not bootstrap from unrelated weights.
        self.hard_update(self._actor_local, self._actor_target)
        self.hard_update(self._critic_local, self._critic_target)