import numpy as np
import torch
import wandb

# CustomNN (the project's neural-network wrapper) is assumed to be importable;
# its import path is not shown in this file.


class REINFORCEAgentWithBaseline:
    def __init__(self, params={}):
        # parameters to be set from params dict
        self.γ = None
        self.policy_estimator = None
        self.function_approximator = None
        self.is_continuous = None

        self.states = []
        self.actions = []
        self.rewards = []

        self.tot_timestep = 0

        self.set_params_from_dict(params)

    # ====== Initialization functions ==================================

    def set_params_from_dict(self, params={}):
        self.γ = params.get("discount_factor", 0.9)
        self.is_continuous = params.get("is_continuous", False)
        self.initialize_policy_estimator(params.get("policy_estimator_info"))
        self.initialize_baseline_network(
            params.get("function_approximator_info"))

    def initialize_policy_estimator(self, params):
        self.policy_estimator = CustomNN(params)

    def initialize_baseline_network(self, params):
        self.function_approximator = CustomNN(params)

    # ====== Control related functions =================================

    def control(self):
        self.function_approximator.compute_weights()

    # ====== Action choice related functions ===========================

    def choose_action(self, state):
        if self.is_continuous:
            action_chosen = self.policy_estimator(state).detach().numpy()
        else:
            action_probs = self.policy_estimator(state).detach().numpy()
            action_chosen = np.random.choice(len(action_probs),
                                             p=action_probs)
        return action_chosen

    def start(self, state):
        # choose the first action of the episode and reset the buffers
        current_action = self.choose_action(state)
        self.states = np.array([state])
        self.actions = np.array([current_action])
        self.rewards = []
        return current_action

    def step(self, state, reward):
        # choose the action and record the transition
        current_action = self.choose_action(state)
        self.save_transition(reward, state, current_action)
        return current_action

    def end(self, state, reward):
        self.save_transition(reward)

    def save_transition(self, reward, state=None, action=None):
        self.rewards.append(reward)
        if state is not None and action is not None:
            self.states = np.vstack((self.states, state))
            self.actions = np.append(self.actions, action)

    def learn_from_experience(self):
        # TODO: problem: since the last state is never appended to the state
        # list, the final transition is not taken into account in the
        # value-network update.
        discounted_reward = 0
        reversed_episode = zip(self.rewards[::-1], self.states[::-1],
                               self.actions[::-1])
        for reward, state, action in reversed_episode:
            state_value = self.function_approximator(state)
            discounted_reward = reward + self.γ * discounted_reward
            # advantage estimate: δ = G_t - V(s_t)
            δ = discounted_reward - state_value.detach()
            value_loss = -state_value * δ
            self.function_approximator.optimizer.zero_grad()
            value_loss.backward()
            self.function_approximator.optimizer.step()

            # policy entropy, logged to monitor exploration
            probs = self.policy_estimator(state).detach().numpy()
            entropy = -np.sum(probs * np.log(probs))

            # negate the expression so that minimizing the loss performs
            # gradient ascent on the REINFORCE objective
            loss = -torch.log(self.policy_estimator(state)[action]) * δ
            self.policy_estimator.optimizer.zero_grad()
            loss.backward()
            self.policy_estimator.optimizer.step()

            wandb.log({
                "Agent info/critic loss": value_loss,
                "Agent info/policy entropy": entropy,
                "Agent info/actor loss": loss
            })

    def get_state_value_eval(self, state):
        state_value = self.function_approximator(state).data
        return state_value
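# ---------------------------------------------------------------------------
# Minimal, self-contained sketch of the update performed in
# learn_from_experience() above, on a toy three-step episode. Plain PyTorch
# only: the two small nets below are illustrative stand-ins for CustomNN and
# are not part of the original agent.
# ---------------------------------------------------------------------------
import torch
import torch.nn as nn

torch.manual_seed(0)
policy = nn.Sequential(nn.Linear(4, 2), nn.Softmax(dim=-1))
value_net = nn.Linear(4, 1)
policy_opt = torch.optim.Adam(policy.parameters(), lr=1e-3)
value_opt = torch.optim.Adam(value_net.parameters(), lr=1e-3)
γ = 0.9

# toy episode: (state, action, reward) triples
episode = [(torch.randn(4), 0, 1.0),
           (torch.randn(4), 1, 0.0),
           (torch.randn(4), 0, 2.0)]

G = 0.0
for state, action, reward in reversed(episode):
    G = reward + γ * G                  # discounted return G_t
    V = value_net(state)
    δ = G - V.detach()                  # advantage estimate G_t - V(s_t)
    value_loss = -V * δ                 # pushes V(s_t) towards G_t
    value_opt.zero_grad(); value_loss.backward(); value_opt.step()
    policy_loss = -torch.log(policy(state)[action]) * δ
    policy_opt.zero_grad(); policy_loss.backward(); policy_opt.step()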
import numpy as np
import torch
from torch.distributions import Categorical

# CustomNeuralNetwork and set_random_seed are project-local helpers whose
# import paths are not shown in this file.


class REINFORCEAgent:
    def __init__(self, params={}):
        # parameters to be set from params dict
        self.γ = None
        self.policy_estimator = None
        self.is_continuous = None

        self.states = []
        self.actions = []
        self.rewards = []

        self.seed = None

        self.set_params_from_dict(params)

    # ====== Initialization functions ==================================

    def set_params_from_dict(self, params={}):
        self.γ = params.get("discount_factor", 0.9)
        self.is_continuous = params.get("is_continuous", False)
        self.init_seed(params.get("seed", None))
        self.initialize_policy_estimator(params.get("policy_estimator_info"))

    def initialize_policy_estimator(self, params):
        self.policy_estimator = CustomNeuralNetwork(params)

    def init_seed(self, seed):
        if seed:
            self.seed = seed
            set_random_seed(self.seed)

    def set_seed(self, seed):
        if seed:
            self.seed = seed
            set_random_seed(self.seed)
            self.policy_estimator.set_seed(seed)

    # ====== Action choice related functions ===========================

    def choose_action(self, state):
        if self.is_continuous:
            # TODO: I don't think this is correct; a Categorical distribution
            # only makes sense for discrete action spaces.
            action_probs = Categorical(self.policy_estimator(state))
        else:
            action_probs = Categorical(self.policy_estimator(state))
        action_chosen = action_probs.sample().numpy()
        return action_chosen

    def start(self, state):
        # choose the first action of the episode and reset the buffers
        current_action = self.choose_action(state)
        self.states = np.array([state])
        self.actions = np.array([current_action])
        self.rewards = []
        return current_action

    def step(self, state, reward):
        # choose the action and record the transition
        current_action = self.choose_action(state)
        self.rewards.append(reward)
        self.states = np.vstack((self.states, state))
        self.actions = np.append(self.actions, current_action)
        return current_action

    def end(self, state, reward):
        self.rewards.append(reward)

    def learn_from_experience(self):
        """Replays the episode backwards and performs gradient ascent
        on the policy."""
        discounted_reward = 0
        reversed_episode = zip(self.rewards[::-1], self.states[::-1],
                               self.actions[::-1])
        for reward, state, action in reversed_episode:
            self.policy_estimator.optimizer.zero_grad()
            # discounted return: G_t = r_t + γ * G_{t+1}
            discounted_reward = reward + self.γ * discounted_reward
            # negate the expression so that minimizing the loss performs
            # gradient ascent on the REINFORCE objective
            loss = -torch.log(
                self.policy_estimator(state)[action]) * discounted_reward
            loss.backward()
            self.policy_estimator.optimizer.step()
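# ---------------------------------------------------------------------------
# Worked example of the backward return recursion used in
# learn_from_experience() above: G_t = r_t + γ * G_{t+1}. With rewards
# [1, 0, 2] and γ = 0.9: G_2 = 2, G_1 = 0 + 0.9 * 2 = 1.8,
# G_0 = 1 + 0.9 * 1.8 = 2.62.
# ---------------------------------------------------------------------------
rewards = [1.0, 0.0, 2.0]
γ = 0.9
G = 0.0
returns = []
for r in reversed(rewards):
    G = r + γ * G
    returns.append(G)
returns.reverse()
print(returns)  # ≈ [2.62, 1.8, 2.0]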
import numpy as np
import torch
from torch.distributions import Categorical
import wandb

# CustomNeuralNetwork is a project-local wrapper whose import path is not
# shown in this file.


class ActorCriticAgent:
    def __init__(self, params={}):
        # parameters to be set from params dict
        self.γ = None
        self.num_actions = None
        self.policy_estimator = None
        self.function_approximator_eval = None
        self.function_approximator_target = None

        self.previous_state = None
        self.previous_action = None
        self.is_continuous = None

        # memory parameters
        self.memory_size = None
        self.memory = []
        self.memory_counter = 0
        self.batch_size = None

        self.update_target_counter = 0
        self.update_target_rate = None
        self.state_dim = None

        self.seed = None
        self.tot_timestep = 0

        self.set_params_from_dict(params)
        self.set_other_params()

    # ====== Initialization functions ==================================

    def set_params_from_dict(self, params={}):
        self.γ = params.get("discount_factor", 0.9)
        self.num_actions = params.get("num_actions", 1)
        self.is_continuous = params.get("is_continuous", False)
        self.initialize_policy_estimator(params.get("policy_estimator_info"))
        self.initialize_function_approximator(params.get(
            "function_approximator_info"))
        self.memory_size = params.get("memory_size", 200)
        self.update_target_rate = params.get("update_target_rate", 50)
        self.state_dim = params.get("state_dim", 4)
        self.batch_size = params.get("batch_size", 32)
        self.seed = params.get("seed", None)

    def set_other_params(self):
        # each memory slot stores state (state_dim), action (1), reward (1),
        # next state (state_dim) and the terminal flag (1), hence
        # 2 * state_dim + 3 entries per row
        self.memory = np.zeros((self.memory_size, 2 * self.state_dim + 3))

    def initialize_policy_estimator(self, params):
        self.policy_estimator = CustomNeuralNetwork(params)

    def initialize_function_approximator(self, params):
        self.function_approximator_eval = CustomNeuralNetwork(params)
        self.function_approximator_target = CustomNeuralNetwork(params)

    # ====== Memory functions ==========================================

    def store_transition(self, state, action, reward, next_state,
                         is_terminal):
        # store a transition (SARS') in the memory
        is_terminal = [is_terminal]
        transition = np.hstack((state, [action, reward], next_state,
                                is_terminal))
        self.memory[self.memory_counter % self.memory_size, :] = transition
        self.incr_mem_cnt()

    def incr_mem_cnt(self):
        # increment the memory counter; the modulo in store_transition keeps
        # the write index within bounds
        self.memory_counter += 1

    def sample_memory(self):
        # sample some indices from memory
        sample_index = np.random.choice(self.memory_size, self.batch_size)
        # get the batch of samples corresponding to those indices and split
        # it into state, action, reward, next state and terminal flag
        batch_memory = self.memory[sample_index, :]
        batch_state = torch.tensor(batch_memory[:, :self.state_dim]).float()
        batch_action = torch.tensor(batch_memory[
            :, self.state_dim:self.state_dim + 1].astype(int)).float()
        batch_reward = torch.tensor(batch_memory[
            :, self.state_dim + 1:self.state_dim + 2]).float()
        batch_next_state = torch.tensor(
            batch_memory[:, -self.state_dim - 1:-1]).float()
        batch_is_terminal = torch.tensor(batch_memory[:, -1:]).bool()
        return (batch_state, batch_action, batch_reward, batch_next_state,
                batch_is_terminal)

    def update_target_net(self):
        # every n learning cycles, the target network is replaced with the
        # eval network
        if self.update_target_counter % self.update_target_rate == 0:
            self.function_approximator_target.load_state_dict(
                self.function_approximator_eval.state_dict())
        self.update_target_counter += 1

    def control(self, state, reward):
        """Batch actor-critic update on transitions sampled from memory."""
        # every n learning cycles, the target network is replaced with the
        # eval network
        self.update_target_net()
        if self.memory_counter > self.memory_size:
            # get batch data
            (batch_state, batch_action, batch_reward, batch_next_state,
             batch_is_terminal) = self.sample_memory()

            prev_state_value = self.function_approximator_eval(batch_state)
            state_value = self.function_approximator_target(batch_next_state)
            # zero the value of terminal next states
            nu_state_value = torch.masked_fill(state_value,
                                               batch_is_terminal, 0.0)

            δ = (batch_reward + self.γ * nu_state_value.detach()
                 - prev_state_value.detach())
            value_loss = (-prev_state_value * δ).mean()
            self.function_approximator_eval.optimizer.zero_grad()
            value_loss.backward()
            self.function_approximator_eval.optimizer.step()

            # policy entropy, logged to monitor exploration
            probs = self.policy_estimator(state).detach().numpy()
            entropy = -np.sum(probs * np.log(probs))

            logprob = -torch.log(self.policy_estimator(
                batch_state).gather(1, batch_action.long()))
            loss = (logprob * δ).mean()
            self.policy_estimator.optimizer.zero_grad()
            loss.backward()
            self.policy_estimator.optimizer.step()

            wandb.log({
                "Agent info/critic loss": value_loss,
                "Agent info/policy entropy": entropy,
                "Agent info/actor loss": loss
            })

    def vanilla_control(self, state, reward, is_terminal_state):
        """One-step (online) actor-critic update on the latest transition."""
        prev_state_value = self.function_approximator_eval(
            self.previous_state)
        if is_terminal_state:
            cur_state_value = torch.tensor([0])
        else:
            cur_state_value = self.function_approximator_eval(state)
        # one-step TD error: δ = r + γ * V(s') - V(s)
        δ = (reward + self.γ * cur_state_value.detach()
             - prev_state_value.detach())
        value_loss = -prev_state_value * δ
        self.function_approximator_eval.optimizer.zero_grad()
        value_loss.backward()
        self.function_approximator_eval.optimizer.step()

        # policy entropy, logged to monitor exploration
        probs = self.policy_estimator(state).detach().numpy()
        entropy = -np.sum(probs * np.log(probs))

        logprob = -torch.log(
            self.policy_estimator(self.previous_state)[self.previous_action])
        loss = logprob * δ
        self.policy_estimator.optimizer.zero_grad()
        loss.backward()
        self.policy_estimator.optimizer.step()

        wandb.log({
            "Agent info/critic loss": value_loss,
            "Agent info/policy entropy": entropy,
            "Agent info/actor loss": loss
        })

    # ====== Action choice related functions ===========================

    def choose_action(self, state):
        # TODO: fix the first branch; returning the raw network output is
        # not a proper stochastic policy for continuous actions
        if self.is_continuous:
            action_chosen = self.policy_estimator(state).detach().numpy()
            return action_chosen
        else:
            action_probs = Categorical(self.policy_estimator(state))
            action_chosen = action_probs.sample()
            return action_chosen.item()

    # ====== Agent core functions ======================================

    def start(self, state):
        # choose the first action of the episode
        current_action = self.choose_action(state)
        self.previous_action = current_action
        self.previous_state = state
        return current_action

    def step(self, state, reward):
        # store the transition in memory for further use
        self.store_transition(self.previous_state, self.previous_action,
                              reward, state, False)
        # choose the action and learn from the latest transition
        current_action = self.choose_action(state)
        self.vanilla_control(state, reward, False)

        self.previous_action = current_action
        self.previous_state = state
        return current_action

    def end(self, state, reward):
        # store the transition in memory for further use
        self.store_transition(self.previous_state, self.previous_action,
                              reward, state, True)
        self.vanilla_control(state, reward, True)

    def get_state_value_eval(self, state):
        if self.num_actions > 1:
            state_value = self.policy_estimator(state).data
        else:
            state_value = self.function_approximator_eval(state).data
        return state_value
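# ---------------------------------------------------------------------------
# Minimal sketch of the one-step TD update performed in vanilla_control()
# above, on a single dummy transition. The two small nets are illustrative
# stand-ins for CustomNeuralNetwork, not part of the original agent.
# ---------------------------------------------------------------------------
import torch
import torch.nn as nn

torch.manual_seed(0)
γ = 0.9
actor = nn.Sequential(nn.Linear(4, 2), nn.Softmax(dim=-1))
critic = nn.Linear(4, 1)
actor_opt = torch.optim.Adam(actor.parameters(), lr=1e-3)
critic_opt = torch.optim.Adam(critic.parameters(), lr=1e-3)

prev_state, action, reward, state = torch.randn(4), 0, 1.0, torch.randn(4)

v_prev = critic(prev_state)
v_next = critic(state).detach()               # would be 0 at a terminal state
δ = reward + γ * v_next - v_prev.detach()     # one-step TD error
critic_loss = -v_prev * δ
critic_opt.zero_grad(); critic_loss.backward(); critic_opt.step()

actor_loss = -torch.log(actor(prev_state)[action]) * δ
actor_opt.zero_grad(); actor_loss.backward(); actor_opt.step()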
from copy import deepcopy
from typing import Any, Dict, Optional, Type

import numpy as np
import torch

# CustomNeuralNetwork, ReplayBuffer, Logger and set_random_seed are
# project-local helpers whose import paths are not shown in this file.


class DDPGAgent:
    def __init__(self,
                 policy_estimator_info: Dict[str, Any],
                 function_approximator_info: Dict[str, Any],
                 memory_info: Dict[str, Any],
                 seed: Optional[int] = 0,
                 num_actions: Optional[int] = 1,
                 state_dim: Optional[int] = 1,
                 update_target_rate: Optional[int] = 50,
                 discount_factor: Optional[float] = 0.995,
                 target_policy_noise: Optional[float] = 0.2,
                 target_noise_clip: Optional[float] = 0.5):
        self.num_actions = num_actions
        self.state_dim = state_dim
        self.seed = self.init_seed(seed)
        self.logger = None

        # neural network parameters
        self.actor = None
        self.actor_target = None
        self.critic = None
        self.critic_target = None
        self.update_target_rate = update_target_rate
        self.update_target_counter = 0
        self.loss_func = torch.nn.MSELoss()

        self.γ = discount_factor
        self.replay_buffer = self.init_memory_buffer(memory_info)
        self.target_policy_noise = target_policy_noise
        self.target_noise_clip = target_noise_clip

        self.tot_timestep = 0

        self.init_actor(policy_estimator_info)
        self.init_critic(function_approximator_info)

        self.previous_action = None
        self.previous_obs = None

    # ====== Initialization functions ==================================

    def init_actor(self, params):
        self.actor = CustomNeuralNetwork(**params)
        self.actor_target = deepcopy(self.actor)

    def init_critic(self, params):
        self.critic = CustomNeuralNetwork(**params)
        self.critic_target = deepcopy(self.critic)

    def init_memory_buffer(self, params) -> ReplayBuffer:
        params["obs_dim"] = self.state_dim
        params["action_dim"] = self.num_actions
        return ReplayBuffer(**params)

    def init_seed(self, seed):
        if seed:
            set_random_seed(seed)
        return seed

    def set_seed(self, seed):
        if seed:
            self.seed = seed
            set_random_seed(self.seed)
            self.actor.set_seed(seed)
            self.critic.set_seed(seed)

    def set_logger(self, logger: Type[Logger]):
        self.logger = logger
        self.logger.wandb_watch([self.actor, self.critic])

    def get_discount(self):
        return self.γ

    # ====== Action choice related functions ===========================

    def choose_action(self, obs: torch.Tensor):
        action = self.actor(obs)
        # Gaussian exploration noise (note: the std used here is
        # target_noise_clip, not target_policy_noise)
        noise = np.random.normal(0, self.target_noise_clip)
        action += noise
        action = torch.clamp(action, -1, 1)
        return action

    # ====== Agent core functions ======================================

    def start(self, obs):
        current_action = self.choose_action(obs)
        self.previous_action = current_action
        self.previous_obs = obs
        return current_action

    def step(self, obs, reward):
        # store the transition in the replay buffer for further use
        self.replay_buffer.store_transition(self.previous_obs,
                                            self.previous_action, reward,
                                            obs, False)
        # choose the action and learn
        current_action = self.choose_action(obs)
        self.control()

        self.previous_action = current_action
        self.previous_obs = obs
        return current_action

    def end(self, obs, reward):
        self.replay_buffer.store_transition(self.previous_obs,
                                            self.previous_action, reward,
                                            obs, True)
        self.control()

    # === functional functions =========================================

    def get_action_value(self, state, action=None):
        # compute the action value from the critic; the ±1 clamp assumes a
        # normalized action-value range
        action_value = self.critic(state)
        action_value = torch.clamp(action_value, -1, 1)
        return action_value

    # === parameters update functions ==================================

    def _update_target_net(self):
        # every n learning cycles, the target networks are replaced with the
        # eval networks
        self.update_target_counter += 1
        if self.update_target_counter == self.update_target_rate:
            self.critic_target.load_state_dict(self.critic.state_dict())
            self.actor_target.load_state_dict(self.actor.state_dict())
            self.update_target_counter = 0

    # ====== Control related functions =================================

    def control(self):
        self._learn()

    def _learn(self):
        if self.replay_buffer.full:
            self._update_target_net()
            # get batch data
            batch = self.replay_buffer.sample()

            # compute the critic target:
            # y = r + γ * (1 - done) * Q'(s', μ'(s'))
            target_actions = self.actor_target(
                batch.next_observations).detach()
            batch_oa = self._concat_obs_action(batch.next_observations,
                                               target_actions)
            q_next = self.critic_target(batch_oa).detach()
            q_next = (1.0 - batch.dones.float()) * q_next
            y = batch.rewards + self.γ * q_next

            # compute the critic estimate and learn the critic
            batch_oa_eval = self._concat_obs_action(batch.observations,
                                                    batch.actions)
            q_eval = self.critic(batch_oa_eval)
            critic_loss = self.loss_func(q_eval, y)
            self.critic.backpropagate(critic_loss)

            # the actor maximizes the critic's value of its own actions
            actor_eval = self.actor(batch.observations)
            test_oa = self._concat_obs_action(batch.observations, actor_eval)
            actor_loss = -self.critic(test_oa).mean()

            self.logger.wandb_log({
                "Agent info/critic loss": critic_loss,
                "Agent info/actor loss": actor_loss
            })
            self.actor.backpropagate(actor_loss)

    def _concat_obs_action(self, obs: torch.Tensor,
                           action: torch.Tensor) -> torch.Tensor:
        obs_action = torch.cat((obs, action), 1)
        return obs_action

    def get_action_value_eval(self, state: torch.Tensor):
        """For plotting purposes only?"""
        action = np.random.uniform(-1, 1, 1)
        action = torch.Tensor(action)
        state_action = torch.cat((state, action))
        action_value = self.critic(state_action).detach().data
        return action_value

    def get_action_values_eval(self, state: torch.Tensor,
                               actions: torch.Tensor):
        """For plotting purposes only?"""
        state = (state.unsqueeze(1) * torch.ones(len(actions))).T
        state_action = torch.cat((state, actions.unsqueeze(1)), 1)
        action_values = self.critic(state_action).data
        return action_values

    def _zero_terminal_states(self, q_values: torch.Tensor,
                              dones: torch.Tensor) -> torch.Tensor:
        """Zeroes the q values at terminal states."""
        nu_q_values = torch.masked_fill(q_values, dones, 0.0)
        return nu_q_values

    def _create_noise_tensor(self, tensor):
        # create the noise tensor filled with a normal distribution
        noise = tensor.clone().data.normal_(0, self.target_policy_noise)
        # clip the noise
        noise = noise.clamp(-self.target_noise_clip, self.target_noise_clip)
        return noise

    def adjust_dims(self, state_dim, action_dim):
        # called when the environment dimensions change: resize the networks
        # and the replay buffer accordingly
        self.state_dim = state_dim
        self.num_actions = action_dim
        self.actor.reinit_layers(state_dim, action_dim)
        self.actor_target.reinit_layers(state_dim, action_dim)
        self.critic.reinit_layers(state_dim + action_dim, 1)
        self.critic_target.reinit_layers(state_dim + action_dim, 1)
        self.replay_buffer.correct(state_dim, action_dim)
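# ---------------------------------------------------------------------------
# Minimal sketch of the critic target computed in _learn() above:
# y = r + γ * (1 - done) * Q'(s', μ'(s')). The nets and batch tensors below
# are random placeholders, not the agent's CustomNeuralNetwork or
# ReplayBuffer.
# ---------------------------------------------------------------------------
import torch
import torch.nn as nn

torch.manual_seed(0)
γ = 0.995
obs_dim, act_dim, batch_size = 3, 1, 5
actor_target = nn.Sequential(nn.Linear(obs_dim, act_dim), nn.Tanh())
critic_target = nn.Linear(obs_dim + act_dim, 1)

next_obs = torch.randn(batch_size, obs_dim)
rewards = torch.randn(batch_size, 1)
dones = torch.zeros(batch_size, 1)            # 1.0 where the episode ended

with torch.no_grad():
    target_actions = actor_target(next_obs)   # μ'(s'), in [-1, 1]
    q_next = critic_target(
        torch.cat((next_obs, target_actions), dim=1))  # Q'(s', μ'(s'))
    y = rewards + γ * (1.0 - dones) * q_next  # bootstrapped target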