class Random(BaseAgent):

    def __init__(self, env, buffer_size=int(1e6), device=None):
        super(Random, self).__init__(env, device)
        self.replay_buffer = ReplayBuffer(self.obs_dim, self.act_dim, buffer_size)

    def act(self, obs):
        return self.env.action_space.sample()

    def step(self, t):
        self.episode_timesteps += 1

        # Select action randomly
        action = self.env.action_space.sample()

        # Perform action
        next_obs, reward, done, _ = self.env.step(action)
        done_bool = float(done)  # if self.episode_timesteps < self.env._max_episode_steps else 0

        # Store data in replay buffer (same argument order as the other agents sharing this buffer class)
        self.replay_buffer.add(copy.deepcopy(self.obs), action, reward, next_obs, done_bool)

        self.obs = next_obs
        self.episode_reward += reward

        # Handle end of episode (a random agent never trains)
        if done:
            self.episode_end_handle(t)
def train_new_agent(self, replay_buffer: ReplayBuffer, level: int) -> SacActor:
    assert level == 1 or level == 2
    new_agent = copy.deepcopy(self.level_2_policy if level == 2 else self.level_1_policy)
    batch_size = 32
    optimizer = Adam(new_agent.parameters())
    loss_fn = MSELoss()

    # Go through the data 4 times
    for i in range(replay_buffer.size() // batch_size * 4):
        if level == 2:
            states, desired_goals = replay_buffer.get_batch(batch_size)
            outputted_goals, _ = new_agent(states, goal=None)
            loss = loss_fn(outputted_goals, desired_goals)
        else:  # Level 1
            states, goals, desired_actions = replay_buffer.get_batch(batch_size)
            outputted_actions, _ = new_agent(states, goals)
            loss = loss_fn(outputted_actions, desired_actions)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return new_agent
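# Sanity check on the loop bound above (a hypothetical helper, not repo code): with
# 64,000 stored transitions and batch_size = 32, train_new_agent runs
# 64_000 // 32 * 4 = 8,000 gradient steps, i.e. roughly four passes over the buffer
# with randomly drawn batches.
def num_distillation_steps(buffer_size: int, batch_size: int = 32, passes: int = 4) -> int:
    """Number of gradient steps train_new_agent performs for a buffer of this size."""
    return buffer_size // batch_size * passes

assert num_distillation_steps(64_000) == 8_000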
def main():
    """Main."""
    # Create the environment
    env = make_env(ENV_NAME)
    net = DQN(env.observation_space.shape, env.action_space.n)
    net.apply(weights_init)
    tgt_net = DQN(env.observation_space.shape, env.action_space.n)
    tgt_net.load_state_dict(net.state_dict())
    if PRIORITIZED:
        memory = PrioReplayBuffer(PRIO_BUF_SIZE)
    else:
        memory = ReplayBuffer(SEND_SIZE)

    # Create the agent with a fixed epsilon
    epsilon = EPS_BASE**(1 + actor_id / (num_actor - 1) * EPS_ALPHA)
    agent = Agent(env, memory, epsilon, PRIORITIZED)
    log("Actor {} - epsilon {:.5f}".format(actor_id, epsilon))

    # Initialize ZMQ
    context, lrn_sock, buf_sock = init_zmq()
    # Receive the initial weights from the learner before starting
    net, tgt_net = receive_model(lrn_sock, net, tgt_net, True)

    #
    # Simulation
    #
    episode = frame_idx = 0
    p_time = p_frame = None
    p_reward = -50.0
    while True:
        frame_idx += 1

        # Advance one step (includes the reset when the episode ends)
        reward = agent.play_step(net, tgt_net, epsilon, frame_idx)

        # A reward is returned when the episode ends
        if reward is not None:
            episode += 1
            p_reward = reward

        # Send
        if frame_idx % SEND_FREQ == 0:
            # Training-related info
            if p_time is None:
                speed = 0.0
            else:
                speed = (frame_idx - p_frame) / (time.time() - p_time)
            info = ActorInfo(episode, frame_idx, p_reward, speed)
            # Send the replay data along with the info
            agent.send_replay(buf_sock, info)
            # Action selection counts
            agent.show_action_rate()
            p_time = time.time()
            p_frame = frame_idx
            # Receive a new model
            net, tgt_net = receive_model(lrn_sock, net, tgt_net, False)
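# The per-actor epsilon above follows the Ape-X convention: actor i out of N gets the
# fixed exploration rate EPS_BASE ** (1 + i / (N - 1) * EPS_ALPHA), spreading the actors
# geometrically between EPS_BASE and EPS_BASE ** (1 + EPS_ALPHA). A standalone sketch,
# assuming the Ape-X paper's values EPS_BASE = 0.4 and EPS_ALPHA = 7 (this repo's
# constants may differ):
EPS_BASE, EPS_ALPHA, num_actor = 0.4, 7, 4
for actor_id in range(num_actor):
    epsilon = EPS_BASE ** (1 + actor_id / (num_actor - 1) * EPS_ALPHA)
    print("actor {}: epsilon {:.5f}".format(actor_id, epsilon))
# actor 0 explores the most (0.40000); actor 3 the least (0.4 ** 8, about 0.00066)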
def main():
    """Main."""
    # Create the environment
    env = make_env(ENV_NAME)
    set_random_seed(env, actor_id)
    net = A2C(env.observation_space.shape, env.action_space.n)
    net.apply(weights_init)
    memory = ReplayBuffer(SEND_SIZE)
    agent = Agent(env, memory, NUM_UNROLL)
    log("Actor {}".format(actor_id))

    # Initialize ZMQ
    context, lrn_sock, buf_sock = init_zmq()
    # Receive the initial weights from the learner before starting
    net = receive_model(lrn_sock, net, True)

    #
    # Simulation
    #
    episode = frame_idx = 0
    p_time = p_frame = None
    p_reward = -50.0
    while True:
        frame_idx += 1

        # Advance one step (includes the reset when the episode ends)
        ep_reward = agent.play_step(net, frame_idx)

        # An episode reward is returned when the episode ends
        if ep_reward is not None:
            episode += 1
            p_reward = ep_reward
            log("Episode finished! reward {}".format(ep_reward))

        # Send
        if frame_idx % SEND_FREQ == 0:
            # Training-related info
            if p_time is None:
                speed = 0.0
            else:
                speed = (frame_idx - p_frame) / (time.time() - p_time)
            info = ActorInfo(episode, frame_idx, p_reward, speed)
            # Send the replay data along with the info
            agent.send_replay(buf_sock, info)
            # Action selection counts
            agent.show_action_rate()
            p_time = time.time()
            p_frame = frame_idx
            # Receive a new model
            net = receive_model(lrn_sock, net, False)
def teach_hrl_agent(self) -> Tuple[SacActor, SacActor]:
    current_agent_1 = self.level_1_policy
    current_agent_2 = self.level_2_policy
    replay_buffer_1 = ReplayBuffer(max_size=2_000_000, num_transition_dims=3)
    replay_buffer_2 = ReplayBuffer(max_size=2_000_000, num_transition_dims=2)
    for i in range(self.num_agents_taught):
        print(f"DAgger-Hierarchical: training step {i}")
        with torch.no_grad():
            new_experiences = []
            for _ in tqdm(range(self.num_trajectories)):
                done = False
                state = self.env.reset()
                while not done:
                    if random.random() < self.probability_use_level_1:
                        goal, logprob = self.level_2_policy.sample_actions(
                            state, goal=None, compute_log_prob=True)
                        end_state, done = self.rollout(state, goal)
                    else:
                        num_steps = random.randint(
                            int(0.75 * self.horizon_length), self.horizon_length)
                        level_1_transitions, end_state, done = self.expert_rollout(
                            state, num_steps)
                        # Level-1 transitions only exist when the expert rollout ran
                        replay_buffer_1.add_many(level_1_transitions)

                    new_experiences.append((state, end_state))
                    state = end_state
            replay_buffer_2.add_many(new_experiences)

        current_agent_1 = self.train_new_agent(replay_buffer_1, level=1)
        current_agent_2 = self.train_new_agent(replay_buffer_2, level=2)
        self.evaluate_agent(current_agent_1, current_agent_2, num_episodes_to_render=2)
    return current_agent_1, current_agent_2
class SacEntropyAdjustment(nn.Module):

    def __init__(self, state_size: int, goal_size: int, action_low: np.ndarray, action_high: np.ndarray,
                 q_bound_low: float, q_bound_high: float, buffer_size: int, batch_size: int, writer,
                 sac_id: Optional[str], use_priority_replay: bool, learning_rate: float, initial_alpha: float):
        super().__init__()
        self.action_size = len(action_low)
        self.use_priority_replay = use_priority_replay

        self.critic1 = SacCritic(state_size, goal_size, self.action_size, q_bound_low, q_bound_high)
        self.critic1_target = copy.deepcopy(self.critic1)

        self.critic2 = SacCritic(state_size, goal_size, self.action_size, q_bound_low, q_bound_high)
        self.critic2_target = copy.deepcopy(self.critic2)

        self.actor = SacActor(state_size, goal_size, self.action_size,
                              action_low=action_low, action_high=action_high)
        self.actor_target = copy.deepcopy(self.actor)

        initial_log_alpha = math.log(initial_alpha)
        self.alpha = initial_alpha
        self.log_alpha = torch.tensor([initial_log_alpha], requires_grad=True)
        self.target_entropy = -np.prod(action_low.shape).item()  # Use heuristic value from SAC paper

        self.critic_optimizer = Adam(list(self.critic1.parameters()) + list(self.critic2.parameters()),
                                     lr=learning_rate)
        self.actor_optimizer = Adam(self.actor.parameters(), lr=learning_rate)
        self.alpha_optimizer = Adam([self.log_alpha], lr=learning_rate)

        # Optimization for speed: don't compute gradients for the target networks, since we will never use them
        for network in [self.actor_target, self.critic1_target, self.critic2_target]:
            for parameter in network.parameters():
                parameter.requires_grad = False

        self.polyak = 0.995

        # 8 transition dims: (current_state, action, env_reward, total_reward, next_state, transition_reward, current_goal, discount)
        # NOTE: they use some more complicated logic (which depends on the level) to determine the size of the buffer
        # TODO: this is a simplification. See if it works anyway.
        if use_priority_replay:
            self.buffer = PrioritizedReplayBuffer(buffer_size, num_transition_dims=8)
        else:
            self.buffer = ReplayBuffer(buffer_size, num_transition_dims=8)
        self.batch_size = batch_size
        self.q_bound_low = q_bound_low
        self.q_bound_high = q_bound_high

        self.step_number = 0
        self.use_tensorboard = (writer is not None)
        self.writer = writer
        self.sac_id = sac_id

    def get_error(self, transition: tuple) -> float:
        state, action, _, _, next_state, reward, goal, discount = [permissive_get_tensor(x) for x in transition]
        target_q_values, values1, values2 = self.get_target_q_values(reward, discount, next_state, goal)
        predicted_q_values1 = self.critic1.forward(state, goal, action)
        predicted_q_values2 = self.critic2.forward(state, goal, action)
        return self.get_td_error(predicted_q_values1, predicted_q_values2, target_q_values).item()

    def get_td_error(self, predicted_q_values1: torch.Tensor, predicted_q_values2: torch.Tensor,
                     target_q_values: torch.Tensor) -> torch.Tensor:
        return (target_q_values - predicted_q_values1).abs() + (target_q_values - predicted_q_values2).abs()

    def add_to_buffer(self, transition: tuple):
        assert len(transition[1]) == self.action_size
        if self.use_priority_replay:
            # noinspection PyArgumentList
            self.buffer.add(error=self.get_error(transition), transition=transition)
        else:
            self.buffer.add(transition)

    def add_many_to_buffer(self, transitions: List[tuple]):
        for transition in transitions:
            self.add_to_buffer(transition)

    def sample_action(self, state: np.ndarray, goal: np.ndarray, deterministic=False) -> np.ndarray:
        with torch.no_grad():
            return self.actor.sample_actions(state, goal, deterministic, compute_log_prob=False)

    def learn(self, num_updates: int):
        # If there aren't enough transitions to fill a batch, we don't do anything
        if self.buffer.size() < self.batch_size:
            return

        for i in range(num_updates):
            # Step 0: get the transitions and the next actions for the next state
            states, actions, env_rewards, total_env_rewards, next_states, rewards, goals, discounts = \
                self.buffer.get_batch(self.batch_size)

            # Step 1: update log_alpha and alpha
            self.alpha_optimizer.zero_grad()
            actions_states, log_actions_states = self.actor(states, goals)
            alpha_loss = -(self.log_alpha * (log_actions_states + self.target_entropy).detach()).mean()
            self.alpha = self.log_alpha.exp()
            alpha_loss.backward()
            self.alpha_optimizer.step()

            # Step 2: improve the critic
            self.critic_optimizer.zero_grad()
            target_q_values, values1, values2 = self.get_target_q_values(rewards, discounts, next_states, goals)
            predicted_q_values1 = self.critic1(states, goals, actions)
            predicted_q_values2 = self.critic2(states, goals, actions)

            # Update priorities in the prioritized replay buffer if needed
            if self.use_priority_replay:
                errors = self.get_td_error(predicted_q_values1, predicted_q_values2, target_q_values)
                for j in range(self.batch_size):
                    index = self.buffer.last_indices[j]
                    self.buffer.update(index, errors[j].item())

            # critic_loss = F.smooth_l1_loss(predicted_q_values1, target_q_values) + \
            #               F.smooth_l1_loss(predicted_q_values2, target_q_values)
            critic_loss = ((predicted_q_values1 - target_q_values) ** 2).mean() + \
                          ((predicted_q_values2 - target_q_values) ** 2).mean()
            critic_loss.backward()
            self.critic_optimizer.step()

            # Step 3: improve the actor
            # Freeze Q-network so you don't waste computational effort
            # computing gradients for it during the policy learning step.
            # TODO: for some reason, if I do this, then I get this error when I do actor_loss.backward():
            # "RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn"
            # This does not happen in my other DDPG code and I don't know why.
            # TODO: figure it out
            # for p in self.critic.parameters():
            #     p.requires_grad = False

            self.actor_optimizer.zero_grad()
            # We want to maximize the Q-values of the actions (and therefore minimize -Q_values)
            new_actions, log_new_actions = self.actor(states, goals)
            values1 = self.critic1(states, goals, new_actions)
            values2 = self.critic2(states, goals, new_actions)
            actor_loss = (self.alpha * log_new_actions - torch.min(values1, values2)).mean()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Log things on tensorboard and console if needed
            if self.use_tensorboard and i == 0:
                self.writer.add_scalar(f"Loss/({self.sac_id}) Policy", actor_loss.item(), self.step_number)
                self.writer.add_scalar(f"Loss/({self.sac_id}) Value", critic_loss.item(), self.step_number)
                self.writer.add_scalar(f"Loss/({self.sac_id}) Log Prob", log_new_actions[0].item(), self.step_number)
                self.writer.add_scalar(f"Loss/({self.sac_id}) Target", target_q_values[0].item(), self.step_number)
                self.writer.add_scalar(f"Loss/({self.sac_id}) Predicted 1", predicted_q_values1[0].item(), self.step_number)
                self.writer.add_scalar(f"Loss/({self.sac_id}) Values 1", values1[0].item(), self.step_number)
                self.writer.add_scalar(f"Loss/({self.sac_id}) Predicted 2", predicted_q_values2[0].item(), self.step_number)
                self.writer.add_scalar(f"Loss/({self.sac_id}) Values 2", values2[0].item(), self.step_number)
                self.writer.add_scalar(f"Loss/({self.sac_id}) Reward", rewards[0].item(), self.step_number)

            # Unfreeze Q-network so you can optimize it at the next step.
            # for p in self.critic.parameters():
            #     p.requires_grad = True

            polyak_average(self.actor, self.actor_target, self.polyak)
            polyak_average(self.critic1, self.critic1_target, self.polyak)
            polyak_average(self.critic2, self.critic2_target, self.polyak)

            self.step_number += 1

    def get_target_q_values(self, rewards: torch.Tensor, discounts: torch.Tensor,
                            next_states: torch.Tensor, goals: torch.Tensor):
        with torch.no_grad():  # No need to compute gradients for this
            # The actions for the next state come from the **current** policy (not from the target policy)
            next_actions, log_next_actions = self.actor(next_states, goals)
            values1 = self.critic1_target(next_states, goals, next_actions)
            values2 = self.critic2_target(next_states, goals, next_actions)
            values_next_state = torch.min(values1, values2).squeeze()

            target_q_values = rewards + discounts * (values_next_state - self.alpha * log_next_actions)
            if target_q_values.ndim != 0:
                target_q_values = target_q_values.unsqueeze(1)
            # We clamp the Q-values to [q_bound_low, q_bound_high] if we're not at the top level. Why is this
            # needed, given that the critic already outputs values in this range? The critic does do that, but
            # the target expression is r + discount * (Q(s', a') - alpha * log pi(a'|s')), and that can leave
            # the range. We don't want that to happen, so we clamp the target. This keeps it consistent with
            # what the critic can actually output, which is not a coincidence but done by design.
            if self.q_bound_low is not None:  # It's None for the top level, since we don't know the total reward range in advance
                target_q_values = torch.clamp(target_q_values, min=self.q_bound_low, max=self.q_bound_high)
            return target_q_values, values1, values2
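# The class above relies on a polyak_average helper that is not shown in this file.
# A minimal sketch of what it presumably does, matching the in-place soft target
# updates used elsewhere in this repo (an assumption, not the repo's implementation):
import torch

def polyak_average(source: torch.nn.Module, target: torch.nn.Module, polyak: float):
    """target <- polyak * target + (1 - polyak) * source, parameter by parameter."""
    with torch.no_grad():
        for param, target_param in zip(source.parameters(), target.parameters()):
            target_param.data.mul_(polyak).add_((1 - polyak) * param.data)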
class DDPG(nn.Module):

    def __init__(self, state_size: int, goal_size: int, action_range: np.ndarray, action_center: np.ndarray,
                 q_bound: float, buffer_size: int, batch_size: int):
        super().__init__()
        action_size = len(action_range)
        # Important: there are no target networks on purpose, because the HAC paper
        # found they were not very useful
        self.critic = Critic(state_size, goal_size, action_size, q_bound)
        self.actor = Actor(state_size, goal_size, action_size, action_range, action_center)

        # https://github.com/andrew-j-levy/Hierarchical-Actor-Critc-HAC-/blob/master/critic.py#L8
        self.critic_optimizer = Adam(self.critic.parameters(), lr=0.001)
        # https://github.com/andrew-j-levy/Hierarchical-Actor-Critc-HAC-/blob/master/actor.py#L15
        self.actor_optimizer = Adam(self.actor.parameters(), lr=0.001)

        # There are 6 dimensions in a transition: (current_state, action, penalty, next_state, current_goal, discount)
        # NOTE: they use some more complicated logic (which depends on the level) to determine the size of the buffer
        # TODO: this is a simplification. See if it works anyway.
        self.buffer = ReplayBuffer(buffer_size, num_transition_dims=6)
        self.batch_size = batch_size
        self.q_bound = q_bound

    def add_to_buffer(self, transition: tuple):
        self.buffer.add(transition)

    def add_many_to_buffer(self, transitions: List[tuple]):
        self.buffer.add_many(transitions)

    def sample_action(self, state: np.ndarray, goal: np.ndarray):
        with torch.no_grad():
            return self.actor(state, goal).numpy()

    def learn(self, num_updates: int):
        # If there aren't enough transitions to fill a batch, we don't do anything
        if self.buffer.size() < self.batch_size:
            return

        for i in range(num_updates):
            # Step 1: get the transitions and the next actions for the next state
            states, actions, rewards, next_states, goals, discounts = self.buffer.get_batch(self.batch_size)
            next_actions = self.actor(next_states, goals)

            # Step 2: improve the critic
            with torch.no_grad():  # No need to compute gradients for this
                target_q_values = rewards + discounts * self.critic(next_states, goals, next_actions).squeeze()
                target_q_values = target_q_values.unsqueeze(1)
                # We clamp the Q-values to [-H, 0]. Why is this needed, given that the critic already
                # outputs values in this range? The critic does do that, but the target expression is
                # r + discount * Q(s', a'), and that can leave [-H, 0]. We don't want that to happen, so we
                # clamp the target. This keeps it consistent with what the critic can actually output,
                # which is not a coincidence but done by design.
                target_q_values = torch.clamp(target_q_values, min=self.q_bound, max=0.0)

            self.critic_optimizer.zero_grad()
            predicted_q_values = self.critic(states, goals, actions)
            critic_loss = F.mse_loss(predicted_q_values, target_q_values)
            critic_loss.backward()
            self.critic_optimizer.step()

            # Step 3: improve the actor
            # Freeze Q-network so you don't waste computational effort
            # computing gradients for it during the policy learning step.
            # TODO: for some reason, if I do this, then I get this error when I do actor_loss.backward():
            # "RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn"
            # This does not happen in my other DDPG code and I don't know why.
            # TODO: figure it out
            # for p in self.critic.parameters():
            #     p.requires_grad = False

            # We want to maximize the Q-values of the actions (and therefore minimize -Q_values)
            self.actor_optimizer.zero_grad()
            new_actions = self.actor(states, goals)
            actor_loss = -self.critic(states, goals, new_actions).mean()
            actor_loss.backward()
            self.actor_optimizer.step()
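# Both HAC-style classes above assume a ReplayBuffer(buffer_size, num_transition_dims=N)
# exposing add, add_many, size, and get_batch, where get_batch returns one tensor per
# transition dimension. A minimal sketch of that interface (an assumption about the
# repo's buffer; storage details may differ):
import random
import numpy as np
import torch

class ReplayBufferSketch:
    def __init__(self, max_size: int, num_transition_dims: int):
        self.max_size = max_size
        self.num_transition_dims = num_transition_dims
        self.storage = []

    def add(self, transition: tuple):
        assert len(transition) == self.num_transition_dims
        self.storage.append(transition)
        if len(self.storage) > self.max_size:
            self.storage.pop(0)  # Drop the oldest transition when full

    def add_many(self, transitions):
        for transition in transitions:
            self.add(transition)

    def size(self) -> int:
        return len(self.storage)

    def get_batch(self, batch_size: int):
        batch = random.sample(self.storage, batch_size)
        # Group the i-th element of every transition together, one tensor per dimension
        return [torch.as_tensor(np.stack(dim), dtype=torch.float32) for dim in zip(*batch)]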
class DDPG(BaseAgent):

    def __init__(self, env, lr=1e-3, gamma=0.99, tau=0.005, buffer_size=int(1e6),
                 start_timesteps=1000, expl_noise=0.1, batch_size=256, device=None):
        super(DDPG, self).__init__(env, device)

        self.actor = DeterministicActor(self.obs_dim, self.act_dim, self.act_limit).to(self.device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr)

        self.critic = QvalueCritic(self.obs_dim, self.act_dim).to(self.device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=lr)

        self.replay_buffer = ReplayBuffer(self.obs_dim, self.act_dim, buffer_size)
        self.start_timesteps = start_timesteps
        self.expl_noise = expl_noise
        self.batch_size = batch_size
        self.lr = lr
        self.gamma = gamma
        self.tau = tau

    def act(self, obs):
        obs = torch.tensor(obs, dtype=torch.float32, device=self.device)
        return self.actor(obs).cpu().data.numpy().flatten()

    def train(self):
        obs, action, reward, next_obs, done = self.replay_buffer.sample(self.batch_size)

        # Compute the target Q value
        target_Q = self.critic_target(next_obs, self.actor_target(next_obs))
        target_Q = reward + (1 - done) * self.gamma * target_Q.detach()

        # Get current Q estimate
        current_Q = self.critic(obs, action)

        # Compute critic loss
        critic_loss = F.mse_loss(current_Q, target_Q)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Compute actor loss
        actor_loss = -self.critic(obs, self.actor(obs)).mean()

        # Optimize the actor
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update the frozen target models
        for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
        for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

    def step(self, t):
        self.episode_timesteps += 1

        # Select action randomly or according to policy
        if t < self.start_timesteps:
            action = self.env.action_space.sample()
        else:
            action = (
                self.actor.act(torch.tensor(self.obs, dtype=torch.float32, device=self.device))
                + np.random.normal(0, self.act_limit * self.expl_noise, size=self.act_dim)
            ).clip(-self.act_limit, self.act_limit)

        # Perform action
        next_obs, reward, done, _ = self.env.step(action)
        done_bool = float(done)  # if self.episode_timesteps < self.env._max_episode_steps else 0

        # Store data in replay buffer
        self.replay_buffer.add(copy.deepcopy(self.obs), action, reward, next_obs, done_bool)

        self.obs = next_obs
        self.episode_reward += reward

        # Train agent after collecting sufficient data
        if t > self.start_timesteps:
            self.train()

        if done:
            self.episode_end_handle(t)
class TD3(BaseAgent):

    def __init__(self, env, lr=3e-4, gamma=0.99, tau=0.005, buffer_size=int(1e6),
                 start_timesteps=1000, expl_noise=0.1, batch_size=100, policy_noise=0.2,
                 noise_clip=0.5, policy_freq=2, device=None, **kwargs):
        super(TD3, self).__init__(env, device)

        self.actor = DeterministicActor(self.obs_dim, self.act_dim, self.act_limit, **kwargs).to(self.device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr)

        self.critic = DoubleQvalueCritic(self.obs_dim, self.act_dim, **kwargs).to(self.device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=lr)

        self.replay_buffer = ReplayBuffer(self.obs_dim, self.act_dim, buffer_size)
        self.start_timesteps = start_timesteps
        self.expl_noise = expl_noise
        self.batch_size = batch_size
        self.lr = lr
        self.gamma = gamma
        self.tau = tau
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq
        self.total_it = 0

    def act(self, obs):
        obs = torch.tensor(obs, dtype=torch.float32, device=self.device)
        return self.actor(obs).cpu().data.numpy().flatten()

    def train(self):
        obs, action, reward, next_obs, done = self.replay_buffer.sample(self.batch_size)
        self.total_it += 1
        cur_action = self.actor(obs)

        with torch.no_grad():
            # Select action according to policy and add clipped noise
            noise = (torch.randn_like(action) * self.policy_noise).clamp(-self.noise_clip, self.noise_clip)
            next_action = (self.actor_target(next_obs) + noise).clamp(-self.act_limit, self.act_limit)

            # Compute the target Q value
            target_Q1, target_Q2 = self.critic_target(next_obs, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + (1 - done) * self.gamma * target_Q

        # Get current Q estimates
        current_Q1, current_Q2 = self.critic(obs, action)

        # Compute critic loss
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Delayed policy updates
        if self.total_it % self.policy_freq == 0:
            for param in self.critic.parameters():
                param.requires_grad = False

            # Compute actor loss
            actor_loss = -self.critic.Q1(obs, cur_action).mean()
            # target = 1 / (2 * 0.2) * grad(self.critic.Q1(obs, cur_action).mean(), cur_action)[0].detach() + self.actor_target(obs).detach()
            # actor_loss = F.mse_loss(target, cur_action)

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            for param in self.critic.parameters():
                param.requires_grad = True

            # Update the frozen target models
            for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
            for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

    def step(self, t):
        self.episode_timesteps += 1

        # Select action randomly or according to policy
        if t < self.start_timesteps:
            action = self.env.action_space.sample()
        else:
            action = (
                self.actor.act(torch.tensor(self.obs, dtype=torch.float32, device=self.device))
                + np.random.normal(0, self.act_limit * self.expl_noise, size=self.act_dim)
            ).clip(-self.act_limit, self.act_limit)

        # Perform action
        next_obs, reward, done, _ = self.env.step(action)
        done_bool = float(done)  # if self.episode_timesteps < self.env._max_episode_steps else 0

        # Store data in replay buffer
        self.replay_buffer.add(copy.deepcopy(self.obs), action, reward, next_obs, done_bool)

        self.obs = next_obs
        self.episode_reward += reward

        # Train agent after collecting sufficient data
        # TODO: extra training to compensate for start_timesteps?
        if t > self.start_timesteps:
            self.train()

        if done:
            self.episode_end_handle(t)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Keep track of time step
        self.t = 0

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)
        # Update time step
        self.t += 1
        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            if self.t % UPDATE_EVERY == 0:
                for _ in range(NUM_UPDATES):
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
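# The soft update above mixes a fraction tau of the local weights into the target on
# every call; holding the local network fixed, the target approaches it geometrically,
# the gap shrinking by a factor (1 - tau) per update. A standalone numeric check
# (hypothetical values, not repo code):
import torch

tau = 0.005
local, target = torch.tensor([1.0]), torch.tensor([0.0])
for _ in range(1000):
    target = tau * local + (1 - tau) * target
# Remaining gap is (1 - tau) ** 1000, about 0.0067, so target is about 0.9933
print(target.item(), (1 - tau) ** 1000)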
        masks = torch.tensor(done).float().reshape(-1, 1)
        expected = rewards + 0.99 * max_q_vals * masks
        loss = torch.mean(torch.pow(q_values - expected, 2))  # F.smooth_l1_loss(selected_q_values, expected)
        return loss


env = Env('CartPole-v0')
agent = DQN(env.sizes)
adam = optim.Adam(agent.parameters(), 1e-3)
memory = ReplayBuffer(1000)

epochs = 20000
batch_size = 32
max_eps = 1.
min_eps = 0.01
eps_decay = 8000
eps = lambda max_eps, min_eps, eps_decay, epoch: min_eps + (max_eps - min_eps) * np.exp(-1. * epoch / eps_decay)

recap = []
episode_mean_reward = 0

for episode in range(epochs):
"""액터별로 정보 평균.""" result = {} for ano, infos in ainfos.items(): infos = ActorInfo(*zip(*infos)) tmp = ActorInfo(*np.mean(infos, axis=1)) info = ActorInfo(tmp.episode, int(tmp.frame), tmp.reward, tmp.speed) result[ano] = info return result log = get_logger() if PRIORITIZED: memory = PrioReplayBuffer(BUFFER_SIZE) else: memory = ReplayBuffer(BUFFER_SIZE) context = zmq.Context() # 액터/러너에게서 받을 소켓 recv = context.socket(zmq.PULL) recv.bind("tcp://*:5558") # 러너에게 보낼 소켓 learner = context.socket(zmq.REP) learner.bind("tcp://*:5555") actor_infos = defaultdict(lambda: deque(maxlen=300)) # 액터들이 보낸 정보 # 반복 while True:
def average_actor_info(ainfos):
    """Average the info per actor."""
    result = {}
    for ano, infos in ainfos.items():
        infos = ActorInfo(*zip(*infos))
        tmp = ActorInfo(*np.mean(infos, axis=1))
        info = ActorInfo(tmp.episode, int(tmp.frame), tmp.reward, tmp.speed)
        result[ano] = info
    return result


log = get_logger()
memory = ReplayBuffer(BUFFER_SIZE)
context = zmq.Context()

# Socket to receive from the actors/learner
recv = context.socket(zmq.PULL)
recv.bind("tcp://*:6558")

# Socket to send to the learner
learner = context.socket(zmq.REP)
learner.bind("tcp://*:6555")

actor_infos = defaultdict(lambda: deque(maxlen=300))  # Info sent by the actors

# Loop
while True:
class Custom(BaseAgent):

    def __init__(self, env, lr=1e-3, gamma=0.99, tau=0.005, buffer_size=int(1e6),
                 start_timesteps=5000, expl_noise=0.1, batch_size=128, policy_noise=0.2,
                 noise_clip=0.5, policy_freq=2, device=None, **kwargs):
        super(Custom, self).__init__(env, device)

        self.actor = GaussianActor(self.obs_dim, self.act_dim, self.act_limit, **kwargs).to(self.device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr)

        self.behavior = GaussianActor(self.obs_dim, self.act_dim, self.act_limit, **kwargs).to(self.device)
        self.behavior_optimizer = torch.optim.Adam(self.behavior.parameters(), lr=lr)

        self.critic = DoubleQvalueCritic(self.obs_dim, self.act_dim, **kwargs).to(self.device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=lr)

        self.replay_buffer = ReplayBuffer(self.obs_dim, self.act_dim, buffer_size)
        self.start_timesteps = start_timesteps
        self.expl_noise = expl_noise
        self.batch_size = batch_size
        self.lr = lr
        self.gamma = gamma
        self.tau = tau
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq
        self.total_it = 0
        self.c_loss, self.a_loss = [], []

    def act(self, obs):
        obs = torch.tensor(obs, dtype=torch.float32, device=self.device)
        return self.actor(obs, True).cpu().data.numpy().flatten()

    def behavior_init(self, iteration=1000):
        # TODO: incomplete; currently only draws a batch and does not train the behavior policy
        obs, action, reward, next_obs, done = self.replay_buffer.sample(self.batch_size)

    def train(self):
        obs, action, reward, next_obs, done = self.replay_buffer.sample(self.batch_size)
        self.total_it += 1
        cur_action = self.actor(obs)

        with torch.no_grad():
            # Select action according to policy and add clipped noise
            noise = (torch.randn_like(action) * self.policy_noise).clamp(-self.noise_clip, self.noise_clip)
            next_action = (self.actor_target(next_obs) + noise).clamp(-self.act_limit, self.act_limit)

            # Compute the target Q value
            target_Q1, target_Q2 = self.critic_target(next_obs, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + (1 - done) * self.gamma * target_Q

        # Get current Q estimates
        current_Q1, current_Q2 = self.critic(obs, action)

        # Compute critic loss
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        c_loss = critic_loss.item()
        self.critic_optimizer.step()

        a_loss = 0
        # Delayed policy updates
        if self.total_it % self.policy_freq == 0:
            for param in self.critic.parameters():
                param.requires_grad = False

            # Compute actor loss
            actor_loss = -self.critic.Q1(obs, cur_action).mean()

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            a_loss = actor_loss.item()
            self.actor_optimizer.step()

            for param in self.critic.parameters():
                param.requires_grad = True

            # Update the frozen target models
            for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
            for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

        return c_loss, a_loss

    def step(self, t):
        c, a = self.train()
        self.c_loss.append(c)
        self.a_loss.append(a)
        if t % 100 == 0:
            # self.evaluate(self.env)
            # a_loss is 0 on non-update steps, so the mean over all steps is scaled by policy_freq (2)
            print(f'Iteration {t}: Critic Loss: {np.mean(self.c_loss)}, Actor Loss: {np.mean(self.a_loss) * 2}')
            self.c_loss, self.a_loss = [], []
        self.episode_timesteps += 1
class SAC(BaseAgent):

    def __init__(self, env, buffer_size=int(1e6), gamma=0.99, tau=0.005, lr=1e-3,
                 start_timesteps=1000, actor_train_freq=2, batch_size=128,
                 init_temperature=0.1, device=None):
        super(SAC, self).__init__(env, device)

        self.actor = SquashedGaussianActor(self.obs_dim, self.act_dim, self.act_limit).to(self.device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr)

        self.critic = DoubleQvalueCritic(self.obs_dim, self.act_dim).to(self.device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=lr)

        # Adjustable alpha
        self.log_alpha = torch.tensor(np.log(init_temperature), requires_grad=True, device=self.device)
        self.target_entropy = -torch.prod(torch.Tensor(self.env.action_space.shape).to(self.device)).item()
        self.alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=1e-4, betas=(0.5, 0.999))

        self.replay_buffer = ReplayBuffer(buffer_size)
        self.start_timesteps = start_timesteps
        self.tau = tau
        self.gamma = gamma
        self.alpha = self.log_alpha.exp()
        self.actor_train_freq = actor_train_freq
        self.batch_size = batch_size

    def offline_initialize(self, replay_buffer, epoch=1):
        conf = 2
        # PPO-style mini-batch training
        critic_losses, actor_losses = [], []
        idxes = np.arange(replay_buffer.size - 1)
        print(replay_buffer.size)
        for i in range(epoch):
            np.random.shuffle(idxes)
            for j in range(replay_buffer.size // self.batch_size):
                idx = idxes[j * self.batch_size:(j + 1) * self.batch_size]
                obs, action, reward, next_obs, done, next_action = replay_buffer.sample(
                    self.batch_size, True, idx)

                # SARSA-style policy evaluation
                # with torch.no_grad():
                #     # Compute the target Q value
                #     target_Q1, target_Q2 = self.critic_target(next_obs, next_action)
                #     target_Q = torch.min(target_Q1, target_Q2)
                #     target_Q = reward + (1 - done) * self.gamma * target_Q
                # # Get current Q estimates
                # current_Q1, current_Q2 = self.critic(obs, action)
                # # Compute critic loss
                # critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)
                # critic_losses.append(critic_loss.item())
                # # Optimize the critic
                # self.critic_optimizer.zero_grad()
                # critic_loss.backward()
                # self.critic_optimizer.step()

                # Behavior cloning under entropy regularization
                _, logprob = self.actor(obs)
                _action = 0.5 * torch.log((1 + action) / (1 - action))  # atanh(action)
                # actor_loss = (self.alpha * logprob - self.actor.logprob(obs, _action)).mean()
                actor_loss = -self.actor.logprob(obs, _action).mean()
                actor_losses.append(actor_loss.item())
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # alpha_loss = (self.log_alpha * (-logprob - self.target_entropy).detach()).mean()
                # self.alpha_optimizer.zero_grad()
                # alpha_loss.backward()
                # self.alpha_optimizer.step()
                # self.alpha = self.log_alpha.exp()

                for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                    target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

            print(f'Epoch {i} Critic Loss: {np.mean(critic_losses)}, Actor Loss: {np.mean(actor_losses)}')
            critic_losses, actor_losses = [], []

        # Approximate the support with the learned policy
        self.lower_bound = np.zeros((replay_buffer.size, self.act_dim))
        self.upper_bound = np.zeros((replay_buffer.size, self.act_dim))
        idxes = np.arange(replay_buffer.size)
        for _ in range(epoch):
            for i in range(int(np.ceil(replay_buffer.size / self.batch_size))):
                idx = idxes[i * self.batch_size:(i + 1) * self.batch_size]
                obs, action, reward, next_obs, done = replay_buffer.sample(self.batch_size, with_idxes=idx)
                mu, std = self.actor.mu_std(obs)
                self.lower_bound[i * self.batch_size:(i + 1) * self.batch_size] = mu - conf * std
                self.upper_bound[i * self.batch_size:(i + 1) * self.batch_size] = mu + conf * std

    def offline_improve(self, replay_buffer, epoch=10):
        self.actor = SquashedGaussianActor(self.obs_dim, self.act_dim, self.act_limit).to(self.device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=1e-3)
        self.actor_target = copy.deepcopy(self.actor)

        # Adjustable alpha
        self.log_alpha = torch.tensor(np.log(0.1), requires_grad=True, device=self.device)
        self.target_entropy = -torch.prod(torch.Tensor(self.env.action_space.shape).to(self.device)).item()
        self.alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=1e-4, betas=(0.5, 0.999))

        actor_losses, critic_losses = [], []
        idxes = np.arange(replay_buffer.size - 1)
        for i in range(epoch):
            np.random.shuffle(idxes)
            for j in range(replay_buffer.size // self.batch_size):
                idx = idxes[j * self.batch_size:(j + 1) * self.batch_size]
                obs, action, reward, next_obs, done = replay_buffer.sample(self.batch_size, with_idxes=idx)
                if j % 100 == 0:
                    self.evaluate(self.env)

                # SARSA-style policy evaluation
                with torch.no_grad():
                    # No constraint
                    # next_action, logprob = self.actor(next_obs)
                    # target_Q1, target_Q2 = self.critic_target(next_obs, next_action)
                    # target_Q = torch.min(target_Q1, target_Q2)
                    # target_Q = reward + (1 - done) * self.gamma * (target_Q - self.alpha * logprob)

                    # Probabilistic constraint
                    # mu, std = self.actor.mu_std(next_obs)
                    # a, b = (self.lower_bound[idx + 1] - mu) / std, (self.upper_bound[idx + 1] - mu) / std
                    # dist = truncnorm(a, b, loc=mu, scale=std)
                    # next_action = torch.tensor(dist.rvs(), dtype=torch.float32, device=self.device)
                    # logprob = self.actor.logprob(next_obs, next_action)
                    # target_Q1, target_Q2 = self.critic_target(next_obs, torch.tanh(next_action))
                    # target_Q = torch.min(target_Q1, target_Q2)
                    # target_Q = reward + (1 - done) * self.gamma * (target_Q - self.alpha * logprob)

                    # Q-learning constraint
                    mu, std = self.actor_target.mu_std(next_obs)
                    next_action = mu  # np.clip(mu, self.lower_bound[idx + 1], self.upper_bound[idx + 1])
                    next_action = torch.tensor(self.act_limit * np.tanh(next_action),
                                               dtype=torch.float32, device=self.device)
                    target_Q1, target_Q2 = self.critic_target(next_obs, next_action)
                    target_Q = torch.min(target_Q1, target_Q2)
                    target_Q = reward + (1 - done) * self.gamma * target_Q  # (target_Q - self.alpha * logprob)

                # Get current Q estimates
                current_Q1, current_Q2 = self.critic(obs, action)

                # Compute critic loss
                critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)
                critic_losses.append(critic_loss.item())

                # Optimize the critic
                self.critic_optimizer.zero_grad()
                critic_loss.backward()
                self.critic_optimizer.step()

                # Behavior cloning under entropy regularization
                # Freeze critic parameters here to prevent unnecessary backpropagation
                for param in self.critic.parameters():
                    param.requires_grad = False

                cur_action, _ = self.actor.mu_std(obs, False)
                cur_action = torch.tanh(cur_action)
                current_Q1, current_Q2 = self.critic(obs, cur_action)
                current_Q = torch.min(current_Q1, current_Q2)
                actor_loss = -current_Q.mean()

                # cur_action, logprob = self.actor(obs, detach=True)
                # current_Q1, current_Q2 = self.critic(obs, cur_action)
                # current_Q = torch.min(current_Q1, current_Q2)
                # actor_std_loss = (self.alpha * logprob - current_Q).mean()
                # actor_loss = actor_std_loss + actor_mu_loss

                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                actor_losses.append(-current_Q.mean().item())
                self.actor_optimizer.step()

                for param in self.critic.parameters():
                    param.requires_grad = True

                # alpha_loss = (self.log_alpha * (-logprob - 3 * self.target_entropy).detach()).mean()
                # self.alpha_optimizer.zero_grad()
                # alpha_loss.backward()
                # self.alpha_optimizer.step()
                # self.alpha = self.log_alpha.exp()

                for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                    target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
                for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                    target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

            print(f'Epoch {i} Critic Loss: {np.mean(critic_losses)}, Actor Loss: {np.mean(actor_losses)}')
            critic_losses, actor_losses = [], []

    def train(self, obs, action, next_obs, reward, done):
        with torch.no_grad():
            next_action, logprob = self.actor(next_obs)
            # Compute the target Q value
            target_Q1, target_Q2 = self.critic_target(next_obs, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + (1 - done) * self.gamma * (target_Q - self.alpha * logprob)

        # Get current Q estimates
        current_Q1, current_Q2 = self.critic(obs, action)

        # Compute critic loss
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        cur_action, logprob = self.actor(obs)

        # Freeze critic parameters here to prevent unnecessary backpropagation
        for param in self.critic.parameters():
            param.requires_grad = False

        current_Q1, current_Q2 = self.critic(obs, cur_action)
        current_Q = torch.min(current_Q1, current_Q2)
        actor_loss = (self.alpha * logprob - current_Q).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        for param in self.critic.parameters():
            param.requires_grad = True

        alpha_loss = (self.log_alpha * (-logprob - self.target_entropy).detach()).mean()
        self.alpha_optimizer.zero_grad()
        alpha_loss.backward()
        self.alpha_optimizer.step()
        self.alpha = self.log_alpha.exp()

        for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

    def act(self, obs):
        obs = torch.tensor(obs, dtype=torch.float32, device=self.device)
        obs = obs.reshape(1, -1)
        return self.actor.act(obs, deterministic=True)

    def step(self, t):
        self.episode_timesteps += 1

        # Select action according to policy
        # if t < self.start_timesteps:
        #     action = self.env.action_space.sample()
        # else:
        #     action = self.actor.act(torch.tensor(self.obs, dtype=torch.float32, device=self.device))
        action = self.actor.act(torch.tensor(self.obs, dtype=torch.float32, device=self.device))

        # Perform action
        next_obs, reward, done, _ = self.env.step(action)
        done_bool = float(done)  # if self.episode_timesteps < self.env._max_episode_steps else 0

        # Store data in replay buffer
        self.replay_buffer.add(copy.deepcopy(self.obs), action, next_obs, reward, done_bool)

        self.obs = next_obs
        self.episode_reward += reward

        # Train agent after collecting sufficient data; extra training iterations are
        # run when start_timesteps is first reached
        if t == self.start_timesteps:
            for _ in range(self.start_timesteps):
                batch = self.replay_buffer.sample(self.batch_size)
                self.train(*batch)
        elif t > self.start_timesteps:
            batch = self.replay_buffer.sample(self.batch_size)
            self.train(*batch)

        if done:
            self.episode_end_handle(t)
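# A minimal driver for these BaseAgent subclasses, assuming BaseAgent initializes
# self.obs and the episode counters in its __init__ or a reset hook (a hypothetical
# usage sketch; the repo's actual entry point may differ):
import gym

env = gym.make("Pendulum-v1")
agent = SAC(env, start_timesteps=1000)

total_timesteps = 100_000
for t in range(total_timesteps):
    # Each call acts once in the env, stores the transition, and trains
    # once t has passed start_timesteps
    agent.step(t)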