Example no. 1
class Random(BaseAgent):
    def __init__(self, env, buffer_size=int(1e6), device=None):

        super(Random, self).__init__(env, device)
        self.replay_buffer = ReplayBuffer(self.obs_dim, self.act_dim,
                                          buffer_size)

    def act(self, obs):
        return self.env.action_space.sample()

    def step(self, t):

        self.episode_timesteps += 1

        # Select action uniformly at random
        action = self.env.action_space.sample()

        # Perform action
        next_obs, reward, done, _ = self.env.step(action)
        done_bool = float(
            done
        )  # if self.episode_timesteps < self.env._max_episode_steps else 0
        # Store data in replay buffer
        self.replay_buffer.add(copy.deepcopy(self.obs), action, next_obs,
                               reward, done_bool)
        self.obs = next_obs
        self.episode_reward += reward
        # Train agent after collecting sufficient data
        if done:
            self.episode_end_handle(t)
Example no. 2
    def train_new_agent(self, replay_buffer: ReplayBuffer,
                        level: int) -> SacActor:
        assert level == 1 or level == 2
        new_agent = copy.deepcopy(self.level_2_policy if level ==
                                  2 else self.level_1_policy)

        batch_size = 32
        optimizer = Adam(new_agent.parameters())
        loss_fn = MSELoss()
        # Go through the data 4 times
        for i in range(replay_buffer.size() // batch_size * 4):
            if level == 2:
                states, desired_goals = replay_buffer.get_batch(batch_size)
                outputted_goals, _ = new_agent(states, goal=None)
                loss = loss_fn(outputted_goals, desired_goals)
            else:  # Level 1
                states, goals, desired_actions = replay_buffer.get_batch(
                    batch_size)
                outputted_actions, _ = new_agent(states, goals)
                loss = loss_fn(outputted_actions, desired_actions)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        return new_agent
Example no. 3
    def __init__(self, state_size: int, goal_size: int, action_low: np.ndarray, action_high: np.ndarray,
                 q_bound_low: float, q_bound_high: float,
                 buffer_size: int, batch_size: int, writer, sac_id: Optional[str],
                 use_priority_replay: bool, learning_rate: float, initial_alpha: float):
        super().__init__()
        self.action_size = len(action_low)
        self.use_priority_replay = use_priority_replay

        self.critic1 = SacCritic(state_size, goal_size, self.action_size, q_bound_low, q_bound_high)
        self.critic1_target = copy.deepcopy(self.critic1)
        self.critic2 = SacCritic(state_size, goal_size, self.action_size, q_bound_low, q_bound_high)
        self.critic2_target = copy.deepcopy(self.critic2)

        self.actor = SacActor(state_size, goal_size, self.action_size, action_low=action_low, action_high=action_high)
        self.actor_target = copy.deepcopy(self.actor)

        initial_log_alpha = math.log(initial_alpha)
        self.alpha = initial_alpha
        self.log_alpha = torch.tensor([initial_log_alpha], requires_grad=True)
        self.target_entropy = -np.prod(action_low.shape).item()  # Use heuristic value from SAC paper

        self.critic_optimizer = Adam(list(self.critic1.parameters()) + list(self.critic2.parameters()), lr=learning_rate)
        self.actor_optimizer = Adam(self.actor.parameters(), lr=learning_rate)
        self.alpha_optimizer = Adam([self.log_alpha], lr=learning_rate)

        # Optimization for speed: don't compute gradients for the target networks, since we never backpropagate through them
        for network in [self.actor_target, self.critic1_target, self.critic2_target]:
            for parameter in network.parameters():
                parameter.requires_grad = False

        self.polyak = 0.995


        # 8 transition dims: (current_state, action, env_reward, total_reward, next_state, transition_reward, current_goal, discount)
        # NOTE: they use some more complicated logic (which depends on the level) to determine the size of the buffer
        # TODO: this is a simplification. See if it works anyway.
        # self.buffer = PrioritizedReplayBuffer(buffer_size, num_transition_dims=8)

        if use_priority_replay:
            self.buffer = PrioritizedReplayBuffer(buffer_size, num_transition_dims=8)
        else:
            self.buffer = ReplayBuffer(buffer_size, num_transition_dims=8)

        self.batch_size = batch_size
        self.q_bound_low = q_bound_low
        self.q_bound_high = q_bound_high

        self.step_number = 0
        self.use_tensorboard = (writer is not None)
        self.writer = writer
        self.sac_id = sac_id
Example no. 4
    def __init__(self,
                 env,
                 lr=1e-3,
                 gamma=0.99,
                 tau=0.005,
                 buffer_size=int(1e6),
                 start_timesteps=5000,
                 expl_noise=0.1,
                 batch_size=128,
                 policy_noise=0.2,
                 noise_clip=0.5,
                 policy_freq=2,
                 device=None,
                 **kwargs):

        super(Custom, self).__init__(env, device)

        self.actor = GaussianActor(self.obs_dim, self.act_dim, self.act_limit,
                                   **kwargs).to(self.device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr)

        self.behavior = GaussianActor(self.obs_dim, self.act_dim,
                                      self.act_limit, **kwargs).to(self.device)
        self.behavior_optimizer = torch.optim.Adam(self.behavior.parameters(),
                                                   lr=lr)

        self.critic = DoubleQvalueCritic(self.obs_dim, self.act_dim,
                                         **kwargs).to(self.device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=lr)

        self.replay_buffer = ReplayBuffer(self.obs_dim, self.act_dim,
                                          buffer_size)

        self.start_timesteps = start_timesteps
        self.expl_noise = expl_noise
        self.batch_size = batch_size
        self.lr = lr
        self.gamma = gamma
        self.tau = tau
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq
        self.total_it = 0

        self.c_loss, self.a_loss = [], []
Example no. 5
def main():
    """메인."""
    # 환경 생성
    env = make_env(ENV_NAME)
    net = DQN(env.observation_space.shape, env.action_space.n)
    net.apply(weights_init)
    tgt_net = DQN(env.observation_space.shape, env.action_space.n)
    tgt_net.load_state_dict(net.state_dict())

    if PRIORITIZED:
        memory = PrioReplayBuffer(PRIO_BUF_SIZE)
    else:
        memory = ReplayBuffer(SEND_SIZE)

    # Create the agent with a fixed eps
    epsilon = EPS_BASE**(1 + actor_id / (num_actor - 1) * EPS_ALPHA)
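    # Illustration with hypothetical values (Ape-X style per-actor exploration):
    # with EPS_BASE=0.4, EPS_ALPHA=7 and num_actor=8, actor 0 would explore with
    # eps = 0.4**1 = 0.4 while actor 7 would get eps = 0.4**8 ~ 6.6e-4.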
    agent = Agent(env, memory, epsilon, PRIORITIZED)
    log("Actor {} - epsilon {:.5f}".format(actor_id, epsilon))

    # Initialize zmq
    context, lrn_sock, buf_sock = init_zmq()
    # Receive the initial weights from the learner and start
    net, tgt_net = receive_model(lrn_sock, net, tgt_net, True)

    #
    # Simulation
    #
    episode = frame_idx = 0
    p_time = p_frame = None
    p_reward = -50.0

    while True:
        frame_idx += 1

        # Advance one step (including reset if the episode ended)
        reward = agent.play_step(net, tgt_net, epsilon, frame_idx)

        # If a reward was returned (episode ended)
        if reward is not None:
            episode += 1
            p_reward = reward

        # Send
        if frame_idx % SEND_FREQ == 0:
            # Training-related info
            if p_time is None:
                speed = 0.0
            else:
                speed = (frame_idx - p_frame) / (time.time() - p_time)
            info = ActorInfo(episode, frame_idx, p_reward, speed)
            # Send the replay data along with the info
            agent.send_replay(buf_sock, info)
            # Action selection counts
            agent.show_action_rate()

            p_time = time.time()
            p_frame = frame_idx

            # Receive the new model
            net, tgt_net = receive_model(lrn_sock, net, tgt_net, False)
Example no. 6
def main():
    """메인."""
    # 환경 생성
    env = make_env(ENV_NAME)
    set_random_seed(env, actor_id)
    net = A2C(env.observation_space.shape, env.action_space.n)
    net.apply(weights_init)
    memory = ReplayBuffer(SEND_SIZE)
    agent = Agent(env, memory, NUM_UNROLL)
    log("Actor {}".format(actor_id))

    # Initialize zmq
    context, lrn_sock, buf_sock = init_zmq()
    # Receive the initial weights from the learner and start
    net = receive_model(lrn_sock, net, True)

    #
    # Simulation
    #
    episode = frame_idx = 0
    p_time = p_frame = None
    p_reward = -50.0

    while True:
        frame_idx += 1

        # Advance one step (including reset if the episode ended)
        ep_reward = agent.play_step(net, frame_idx)

        # If an episode reward was returned (episode ended)
        if ep_reward is not None:
            episode += 1
            p_reward = ep_reward
            log("Episode finished! reward {}".format(ep_reward))

        # Send
        if frame_idx % SEND_FREQ == 0:
            # Training-related info
            if p_time is None:
                speed = 0.0
            else:
                speed = (frame_idx - p_frame) / (time.time() - p_time)
            info = ActorInfo(episode, frame_idx, p_reward, speed)
            # Send the replay data along with the info
            agent.send_replay(buf_sock, info)
            # Action selection counts
            agent.show_action_rate()

            p_time = time.time()
            p_frame = frame_idx

            # Receive the new model
            net = receive_model(lrn_sock, net, False)
Example no. 7
    def __init__(self,
                 env,
                 buffer_size=int(1e6),
                 gamma=0.99,
                 tau=0.005,
                 lr=1e-3,
                 start_timesteps=1000,
                 actor_train_freq=2,
                 batch_size=128,
                 init_temperature=0.1,
                 device=None):

        super(SAC, self).__init__(env, device)
        self.actor = SquashedGaussianActor(self.obs_dim, self.act_dim,
                                           self.act_limit).to(self.device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr)

        self.critic = DoubleQvalueCritic(self.obs_dim,
                                         self.act_dim).to(self.device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=lr)

        # Adjustable alpha
        self.log_alpha = torch.tensor(np.log(init_temperature),
                                      requires_grad=True,
                                      device=self.device)
        self.target_entropy = -torch.prod(
            torch.Tensor(self.env.action_space.shape).to(self.device)).item()
        self.alpha_optimizer = torch.optim.Adam([self.log_alpha],
                                                lr=1e-4,
                                                betas=(0.5, 0.999))

        self.replay_buffer = ReplayBuffer(buffer_size)
        self.start_timesteps = start_timesteps
        self.tau = tau
        self.gamma = gamma
        self.alpha = self.log_alpha.exp()
        self.actor_train_freq = actor_train_freq
        self.batch_size = batch_size
Example no. 8
	def __init__(self, env, lr=1e-3, gamma=0.99, tau=0.005, buffer_size=int(1e6),
	             start_timesteps=1000, expl_noise=0.1, batch_size=256,
				 device=None):

		super(DDPG, self).__init__(env, device)

		self.actor = DeterministicActor(self.obs_dim, self.act_dim, self.act_limit).to(self.device)
		self.actor_target = copy.deepcopy(self.actor)
		self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr)

		self.critic = QvalueCritic(self.obs_dim, self.act_dim).to(self.device)
		self.critic_target = copy.deepcopy(self.critic)
		self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=lr)

		self.replay_buffer = ReplayBuffer(self.obs_dim, self.act_dim, buffer_size)

		self.start_timesteps = start_timesteps
		self.expl_noise = expl_noise
		self.batch_size = batch_size
		self.lr = lr
		self.gamma = gamma
		self.tau = tau
Example no. 9
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Keep track of time step
        self.t = 0

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)
Example no. 10
    def __init__(self, state_size: int, goal_size: int,
                 action_range: np.ndarray, action_center: np.ndarray,
                 q_bound: float, buffer_size: int, batch_size: int):
        super().__init__()
        action_size = len(action_range)

        # Important: there are no target networks on purpose, because the HAC paper
        # found they were not very useful
        self.critic = Critic(state_size, goal_size, action_size, q_bound)
        self.actor = Actor(state_size, goal_size, action_size, action_range,
                           action_center)

        # https://github.com/andrew-j-levy/Hierarchical-Actor-Critc-HAC-/blob/master/critic.py#L8
        self.critic_optimizer = Adam(self.critic.parameters(), lr=0.001)
        # https://github.com/andrew-j-levy/Hierarchical-Actor-Critc-HAC-/blob/master/actor.py#L15
        self.actor_optimizer = Adam(self.actor.parameters(), lr=0.001)

        # There are 6 dimensions in a transition: (current_state, action, penalty, next_state, current_goal, discount)
        # NOTE: they use some more complicated logic (which depends on the level) to determine the size of the buffer
        # TODO: this is a simplification. See if it works anyway.
        self.buffer = ReplayBuffer(buffer_size, num_transition_dims=6)
        self.batch_size = batch_size
        self.q_bound = q_bound
Example no. 11
    def teach_hrl_agent(self) -> Tuple[SacActor, SacActor]:
        current_agent_1 = self.level_1_policy
        current_agent_2 = self.level_2_policy
        replay_buffer_1 = ReplayBuffer(max_size=2_000_000,
                                       num_transition_dims=3)
        replay_buffer_2 = ReplayBuffer(max_size=2_000_000,
                                       num_transition_dims=2)
        for i in range(self.num_agents_taught):
            print(f"DAgger-Hierarchical: training step {i}")
            with torch.no_grad():
                new_experiences = []
                for _ in tqdm(range(self.num_trajectories)):
                    done = False
                    state = self.env.reset()

                    while not done:
                        if random.random() < self.probability_use_level_1:
                            goal, logprob = self.level_2_policy.sample_actions(
                                state, goal=None, compute_log_prob=True)
                            end_state, done = self.rollout(state, goal)
                        else:
                            num_steps = random.randint(
                                int(0.75 * self.horizon_length),
                                self.horizon_length)
                            level_1_transitions, end_state, done = self.expert_rollout(
                                state, num_steps)

                            new_experiences.append((state, end_state))
                            replay_buffer_1.add_many(level_1_transitions)

                        state = end_state

                replay_buffer_2.add_many(new_experiences)

            current_agent_1 = self.train_new_agent(replay_buffer_1, level=1)
            current_agent_2 = self.train_new_agent(replay_buffer_2, level=2)

            self.evaluate_agent(current_agent_1,
                                current_agent_2,
                                num_episodes_to_render=2)

        return current_agent_1, current_agent_2
Example no. 12
class SacEntropyAdjustment(nn.Module):
    def __init__(self, state_size: int, goal_size: int, action_low: np.ndarray, action_high: np.ndarray,
                 q_bound_low: float, q_bound_high: float,
                 buffer_size: int, batch_size: int, writer, sac_id: Optional[str],
                 use_priority_replay: bool, learning_rate: float, initial_alpha: float):
        super().__init__()
        self.action_size = len(action_low)
        self.use_priority_replay = use_priority_replay

        self.critic1 = SacCritic(state_size, goal_size, self.action_size, q_bound_low, q_bound_high)
        self.critic1_target = copy.deepcopy(self.critic1)
        self.critic2 = SacCritic(state_size, goal_size, self.action_size, q_bound_low, q_bound_high)
        self.critic2_target = copy.deepcopy(self.critic2)

        self.actor = SacActor(state_size, goal_size, self.action_size, action_low=action_low, action_high=action_high)
        self.actor_target = copy.deepcopy(self.actor)

        initial_log_alpha = math.log(initial_alpha)
        self.alpha = initial_alpha
        self.log_alpha = torch.tensor([initial_log_alpha], requires_grad=True)
        self.target_entropy = -np.prod(action_low.shape).item()  # Use heuristic value from SAC paper

        self.critic_optimizer = Adam(list(self.critic1.parameters()) + list(self.critic2.parameters()), lr=learning_rate)
        self.actor_optimizer = Adam(self.actor.parameters(), lr=learning_rate)
        self.alpha_optimizer = Adam([self.log_alpha], lr=learning_rate)

        # Optimization for speed: don't compute gradients for the target networks, since we never backpropagate through them
        for network in [self.actor_target, self.critic1_target, self.critic2_target]:
            for parameter in network.parameters():
                parameter.requires_grad = False

        self.polyak = 0.995


        # 8 transition dims: (current_state, action, env_reward, total_reward, next_state, transition_reward, current_goal, discount)
        # NOTE: they use some more complicated logic (which depends on the level) to determine the size of the buffer
        # TODO: this is a simplification. See if it works anyway.
        # self.buffer = PrioritizedReplayBuffer(buffer_size, num_transition_dims=8)

        if use_priority_replay:
            self.buffer = PrioritizedReplayBuffer(buffer_size, num_transition_dims=8)
        else:
            self.buffer = ReplayBuffer(buffer_size, num_transition_dims=8)

        self.batch_size = batch_size
        self.q_bound_low = q_bound_low
        self.q_bound_high = q_bound_high

        self.step_number = 0
        self.use_tensorboard = (writer is not None)
        self.writer = writer
        self.sac_id = sac_id

    def get_error(self, transition: tuple) -> float:
        state, action, _, _, next_state, reward, goal, discount = [permissive_get_tensor(x) for x in transition]
        target_q_values, values1, values2 = self.get_target_q_values(reward, discount, next_state, goal)
        predicted_q_values1 = self.critic1.forward(state, goal, action)
        predicted_q_values2 = self.critic2.forward(state, goal, action)

        return self.get_td_error(predicted_q_values1, predicted_q_values2, target_q_values).item()

    def get_td_error(self, predicted_q_values1: torch.Tensor, predicted_q_values2: torch.Tensor, target_q_values: torch.Tensor) -> torch.Tensor:
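        # Priority signal for the prioritized replay buffer: the sum of the
        # absolute TD errors of both critics with respect to the shared target.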
        return (target_q_values - predicted_q_values1).abs() + (target_q_values - predicted_q_values2).abs()

    def add_to_buffer(self, transition: tuple):
        assert len(transition[1]) == self.action_size
        if self.use_priority_replay:
            # noinspection PyArgumentList
            self.buffer.add(error=self.get_error(transition), transition=transition)
        else:
            self.buffer.add(transition)

    def add_many_to_buffer(self, transitions: List[tuple]):
        for transition in transitions:
            self.add_to_buffer(transition)

    def sample_action(self, state: np.ndarray, goal: np.ndarray, deterministic=False) -> np.ndarray:
        with torch.no_grad():
            return self.actor.sample_actions(state, goal, deterministic, compute_log_prob=False)

    def learn(self, num_updates: int):
        # If there aren't enough transitions to fill a batch, do nothing
        if self.buffer.size() < self.batch_size:
            return

        for i in range(num_updates):
            # Step 0: sample a batch of transitions from the replay buffer
            states, actions, env_rewards, total_env_rewards, next_states, rewards, goals, discounts = self.buffer.get_batch(self.batch_size)

            # Step 1: Update the log_alpha and alpha
            self.alpha_optimizer.zero_grad()
            actions_states, log_actions_states = self.actor(states, goals)
            alpha_loss = -(self.log_alpha * (log_actions_states + self.target_entropy).detach()).mean()
            self.alpha = self.log_alpha.exp()
            alpha_loss.backward()
            self.alpha_optimizer.step()

            # Step 2: improve the critic
            self.critic_optimizer.zero_grad()

            target_q_values, values1, values2 = self.get_target_q_values(rewards, discounts, next_states, goals)
            predicted_q_values1 = self.critic1(states, goals, actions)
            predicted_q_values2 = self.critic2(states, goals, actions)

            # Update priority in Priority Replay Buffer if needed
            if self.use_priority_replay:
                errors = self.get_td_error(predicted_q_values1, predicted_q_values2, target_q_values)
                for j in range(self.batch_size):
                    index = self.buffer.last_indices[j]
                    self.buffer.update(index, errors[j].item())

            # critic_loss = F.smooth_l1_loss(predicted_q_values1, target_q_values) + \
            #               F.smooth_l1_loss(predicted_q_values2, target_q_values)

            critic_loss = ((predicted_q_values1 - target_q_values) ** 2).mean() + \
                          ((predicted_q_values2 - target_q_values) ** 2).mean()

            critic_loss.backward()
            self.critic_optimizer.step()

            # Step 3: improve the actor
            # Freeze Q-network so you don't waste computational effort
            # computing gradients for it during the policy learning step.
            # TODO: for some reason, if I do this, then I get this error when I do actor_loss.backward()
            # "RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn"
            # This does not happen in my other DDPG code and I don't know why.
            # TODO: figure it out
            # for p in self.critic.parameters():
            #     p.requires_grad = False
            self.actor_optimizer.zero_grad()

            # We want to maximize the q-values of the actions (and therefore minimize -Q_values)
            new_actions, log_new_actions = self.actor(states, goals)
            values1 = self.critic1(states, goals, new_actions)
            values2 = self.critic2(states, goals, new_actions)
            actor_loss = (self.alpha * log_new_actions - torch.min(values1, values2)).mean()

            actor_loss.backward()
            self.actor_optimizer.step()

            # Log things on tensorboard and console if needed
            if self.use_tensorboard and i == 0:
                self.writer.add_scalar(f"Loss/({self.sac_id}) Policy", actor_loss.item(), self.step_number)
                self.writer.add_scalar(f"Loss/({self.sac_id}) Value", critic_loss.item(), self.step_number)
                self.writer.add_scalar(f"Loss/({self.sac_id}) Log Prob", log_new_actions[0].item(), self.step_number)
                self.writer.add_scalar(f"Loss/({self.sac_id}) Target", target_q_values[0].item(), self.step_number)
                self.writer.add_scalar(f"Loss/({self.sac_id}) Predicted 1", predicted_q_values1[0].item(), self.step_number)
                self.writer.add_scalar(f"Loss/({self.sac_id}) Values 1", values2[0].item(), self.step_number)
                self.writer.add_scalar(f"Loss/({self.sac_id}) Predicted 2", predicted_q_values2[0].item(), self.step_number)
                self.writer.add_scalar(f"Loss/({self.sac_id}) Values 2", values1[0].item(), self.step_number)
                self.writer.add_scalar(f"Loss/({self.sac_id}) Reward", rewards[0].item(), self.step_number)

            # Unfreeze Q-network so you can optimize it at next DDPG step.
            # for p in self.critic.parameters():
            #     p.requires_grad = True

            polyak_average(self.actor, self.actor_target, self.polyak)
            polyak_average(self.critic1, self.critic1_target, self.polyak)
            polyak_average(self.critic2, self.critic2_target, self.polyak)

            self.step_number += 1

    def get_target_q_values(self, rewards: torch.Tensor, discounts: torch.Tensor, next_states: torch.Tensor, goals: torch.Tensor):
        with torch.no_grad():  # No need to compute gradients for this
            # The actions for the next state come from **current** policy (not from the target policy)
            next_actions, log_next_actions = self.actor(next_states, goals)

            values1 = self.critic1_target(next_states, goals, next_actions)
            values2 = self.critic2_target(next_states, goals, next_actions)
            values_next_state = torch.min(values1, values2).squeeze()
            target_q_values = rewards + discounts * (values_next_state - self.alpha * log_next_actions)
            if target_q_values.ndim != 0:
                target_q_values = target_q_values.unsqueeze(1)
            # We clamp the Q-values to be in [-H, 0] if we're not at the top level. Why would this be needed given that the critic already
            # outputs values in this range? Well, it's true, the critic does do that, but the target
            # expression is r + gamma * Q(s', a') and that might go outside of [-H, 0]. We don't want
            # that to happen, so we clamp it to the range. This will thus incentivize the critic to predict
            # values in [-H, 0], but since the critic can already only output values in that range, it's perfect.
            # Of course, this is not a coincidence but done by design.
            if self.q_bound_low is not None:  # It's None for the top-level, since we don't know in advance the total reward range
                target_q_values = torch.clamp(target_q_values, min=self.q_bound_low, max=self.q_bound_high)

            return target_q_values, values1, values2
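The learn loop above calls polyak_average, which is not defined in any of these examples. A minimal sketch of what it presumably does, assuming an in-place soft update where polyak is the retained fraction of the target weights (the real helper may differ):

import torch


def polyak_average(source: torch.nn.Module, target: torch.nn.Module, polyak: float):
    # In place: target <- polyak * target + (1 - polyak) * source, without tracking gradients
    with torch.no_grad():
        for param, target_param in zip(source.parameters(), target.parameters()):
            target_param.data.mul_(polyak)
            target_param.data.add_((1.0 - polyak) * param.data)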
Example no. 13
class DDPG(nn.Module):
    def __init__(self, state_size: int, goal_size: int,
                 action_range: np.ndarray, action_center: np.ndarray,
                 q_bound: float, buffer_size: int, batch_size: int):
        super().__init__()
        action_size = len(action_range)

        # Important: there are no target networks on purpose, because the HAC paper
        # found they were not very useful
        self.critic = Critic(state_size, goal_size, action_size, q_bound)
        self.actor = Actor(state_size, goal_size, action_size, action_range,
                           action_center)

        # https://github.com/andrew-j-levy/Hierarchical-Actor-Critc-HAC-/blob/master/critic.py#L8
        self.critic_optimizer = Adam(self.critic.parameters(), lr=0.001)
        # https://github.com/andrew-j-levy/Hierarchical-Actor-Critc-HAC-/blob/master/actor.py#L15
        self.actor_optimizer = Adam(self.actor.parameters(), lr=0.001)

        # There are 6 dimensions in a transition: (current_state, action, penalty, next_state, current_goal, discount)
        # NOTE: they use some more complicated logic (which depends on the level) to determine the size of the buffer
        # TODO: this is a simplification. See if it works anyway.
        self.buffer = ReplayBuffer(buffer_size, num_transition_dims=6)
        self.batch_size = batch_size
        self.q_bound = q_bound

    def add_to_buffer(self, transition: tuple):
        self.buffer.add(transition)

    def add_many_to_buffer(self, transitions: List[tuple]):
        self.buffer.add_many(transitions)

    def sample_action(self, state: np.ndarray, goal: np.ndarray):
        with torch.no_grad():
            return self.actor(state, goal).numpy()

    def learn(self, num_updates: int):
        # If there aren't enough transitions to fill a batch, do nothing
        if self.buffer.size() < self.batch_size:
            return

        for i in range(num_updates):
            # Step 1: get the transitions and the next actions for the next state
            states, actions, rewards, next_states, goals, discounts = self.buffer.get_batch(
                self.batch_size)
            next_actions = self.actor(next_states, goals)

            # Step 2: improve the critic
            with torch.no_grad():  # No need to compute gradients for this
                target_q_values = rewards + discounts * self.critic(
                    next_states, goals, next_actions).squeeze()
                target_q_values = target_q_values.unsqueeze(1)
                # We clamp the Q-values to be in [-H, 0]. Why would this be needed given that the critic already
                # outputs values in this range? Well, it's true, the critic does do that, but the target
                # expression is r + gamma * Q(s', a') and that might go outside of [-H, 0]. We don't want
                # that to happen, so we clamp it to the range. This will thus incentivize the critic to predict
                # values in [-H, 0], but since the critic can already only output values in that range, it's perfect.
                # Of course, this is not a coincidence but done by design.
                target_q_values = torch.clamp(target_q_values,
                                              min=self.q_bound,
                                              max=0.0)

            self.critic_optimizer.zero_grad()
            predicted_q_values = self.critic(states, goals, actions)
            critic_loss = F.mse_loss(predicted_q_values, target_q_values)
            critic_loss.backward()
            self.critic_optimizer.step()

            # Step 3: improve the actor
            # Freeze Q-network so you don't waste computational effort
            # computing gradients for it during the policy learning step.
            # TODO: for some reason, if I do this, then I get this error when I do actor_loss.backward()
            # "RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn"
            # This does not happen in my other DDPG code and I don't know why.
            # TODO: figure it out
            # for p in self.critic.parameters():
            #     p.requires_grad = False

            # We want to maximize the q-values of the actions (and therefore minimize -Q_values)
            self.actor_optimizer.zero_grad()
            new_actions = self.actor(states, goals)
            actor_loss = -self.critic(states, goals, new_actions).mean()
            actor_loss.backward()
            self.actor_optimizer.step()
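Examples 10 through 13 all rely on a small tuple-based ReplayBuffer (constructed with max_size and num_transition_dims) whose implementation is not shown. A minimal sketch consistent with the calls used above (add, add_many, size, and get_batch returning one tensor per transition dimension); this is an assumption, not the original class:

import random
from typing import List, Tuple

import numpy as np
import torch


class ReplayBuffer:
    def __init__(self, max_size: int, num_transition_dims: int):
        self.max_size = max_size
        self.num_transition_dims = num_transition_dims
        self.storage: List[tuple] = []

    def add(self, transition: tuple):
        # Each transition is a fixed-arity tuple, e.g. (state, action, reward, next_state, goal, discount)
        assert len(transition) == self.num_transition_dims
        if len(self.storage) >= self.max_size:
            self.storage.pop(0)  # drop the oldest transition once full
        self.storage.append(transition)

    def add_many(self, transitions: List[tuple]):
        for transition in transitions:
            self.add(transition)

    def size(self) -> int:
        return len(self.storage)

    def get_batch(self, batch_size: int) -> Tuple[torch.Tensor, ...]:
        batch = random.sample(self.storage, batch_size)
        # Transpose the list of tuples into one float tensor per transition dimension
        return tuple(torch.as_tensor(np.array(column), dtype=torch.float32)
                     for column in zip(*batch))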
Example no. 14
class DDPG(BaseAgent):
	def __init__(self, env, lr=1e-3, gamma=0.99, tau=0.005, buffer_size=int(1e6),
	             start_timesteps=1000, expl_noise=0.1, batch_size=256,
				 device=None):

		super(DDPG, self).__init__(env, device)

		self.actor = DeterministicActor(self.obs_dim, self.act_dim, self.act_limit).to(self.device)
		self.actor_target = copy.deepcopy(self.actor)
		self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr)

		self.critic = QvalueCritic(self.obs_dim, self.act_dim).to(self.device)
		self.critic_target = copy.deepcopy(self.critic)
		self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=lr)

		self.replay_buffer = ReplayBuffer(self.obs_dim, self.act_dim, buffer_size)

		self.start_timesteps = start_timesteps
		self.expl_noise = expl_noise
		self.batch_size = batch_size
		self.lr = lr
		self.gamma = gamma
		self.tau = tau

	def act(self, obs):
		obs = torch.tensor(obs, dtype=torch.float32, device=self.device)
		return self.actor(obs).cpu().data.numpy().flatten()

	def train(self):

		obs, action, reward, next_obs, done = self.replay_buffer.sample(self.batch_size)

		# Compute the target Q value
		target_Q = self.critic_target(next_obs, self.actor_target(next_obs))
		target_Q = reward + (1 - done) * self.gamma * target_Q.detach()

		# Get current Q estimate
		current_Q = self.critic(obs, action)

		# Compute critic loss
		critic_loss = F.mse_loss(current_Q, target_Q)

		# Optimize the critic
		self.critic_optimizer.zero_grad()
		critic_loss.backward()
		self.critic_optimizer.step()

		# Compute actor loss
		actor_loss = -self.critic(obs, self.actor(obs)).mean()

		# Optimize the actor
		self.actor_optimizer.zero_grad()
		actor_loss.backward()
		self.actor_optimizer.step()

		# Update the frozen target models
		for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
			target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

		for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
			target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

	def step(self, t):

		self.episode_timesteps += 1

		# Select action randomly or according to policy
		if t < self.start_timesteps:
			action = self.env.action_space.sample()
		else:
			action = (
					self.actor.act(torch.tensor(self.obs, dtype=torch.float32, device=self.device))
					+ np.random.normal(0, self.act_limit * self.expl_noise, size=self.act_dim)
			).clip(-self.act_limit, self.act_limit)

		# Perform action
		next_obs, reward, done, _ = self.env.step(action)
		done_bool = float(done)# if self.episode_timesteps < self.env._max_episode_steps else 0
		# Store data in replay buffer
		self.replay_buffer.add(copy.deepcopy(self.obs), action, reward, next_obs, done_bool)
		self.obs = next_obs

		self.episode_reward += reward
		# Train agent after collecting sufficient data
		if t > self.start_timesteps:
			self.train()
		if done:
			self.episode_end_handle(t)
Example no. 15
class TD3(BaseAgent):
    def __init__(self,
                 env,
                 lr=3e-4,
                 gamma=0.99,
                 tau=0.005,
                 buffer_size=int(1e6),
                 start_timesteps=1000,
                 expl_noise=0.1,
                 batch_size=100,
                 policy_noise=0.2,
                 noise_clip=0.5,
                 policy_freq=2,
                 device=None,
                 **kwargs):

        super(TD3, self).__init__(env, device)

        self.actor = DeterministicActor(self.obs_dim, self.act_dim,
                                        self.act_limit,
                                        **kwargs).to(self.device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr)

        self.critic = DoubleQvalueCritic(self.obs_dim, self.act_dim,
                                         **kwargs).to(self.device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=lr)

        self.replay_buffer = ReplayBuffer(self.obs_dim, self.act_dim,
                                          buffer_size)

        self.start_timesteps = start_timesteps
        self.expl_noise = expl_noise
        self.batch_size = batch_size
        self.lr = lr
        self.gamma = gamma
        self.tau = tau
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq
        self.total_it = 0

    def act(self, obs):
        obs = torch.tensor(obs, dtype=torch.float32, device=self.device)
        return self.actor(obs).cpu().data.numpy().flatten()

    def train(self):

        obs, action, reward, next_obs, done = self.replay_buffer.sample(
            self.batch_size)

        self.total_it += 1

        cur_action = self.actor(obs)

        with torch.no_grad():
            # Select action according to policy and add clipped noise
            noise = (torch.randn_like(action) * self.policy_noise).clamp(
                -self.noise_clip, self.noise_clip)

            next_action = (self.actor_target(next_obs) + noise).clamp(
                -self.act_limit, self.act_limit)

            # Compute the target Q value
            target_Q1, target_Q2 = self.critic_target(next_obs, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + (1 - done) * self.gamma * target_Q

        # Get current Q estimates
        current_Q1, current_Q2 = self.critic(obs, action)

        # Compute critic loss
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(
            current_Q2, target_Q)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Delayed policy updates
        if self.total_it % self.policy_freq == 0:

            for param in self.critic.parameters():
                param.requires_grad = False

            # Compute actor loss
            actor_loss = -self.critic.Q1(obs, cur_action).mean()
            #target = 1 / (2 * 0.2) * grad(self.critic.Q1(obs, cur_action).mean(), cur_action)[0].detach() + self.actor_target(obs).detach()
            #actor_loss = F.mse_loss(target, cur_action)

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            for param in self.critic.parameters():
                param.requires_grad = True

            # Update the frozen target models
            for param, target_param in zip(self.critic.parameters(),
                                           self.critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

            for param, target_param in zip(self.actor.parameters(),
                                           self.actor_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

    def step(self, t):

        self.episode_timesteps += 1

        # Select action randomly or according to policy
        if t < self.start_timesteps:
            action = self.env.action_space.sample()
        else:
            action = (self.actor.act(
                torch.tensor(self.obs, dtype=torch.float32,
                             device=self.device)) +
                      np.random.normal(0,
                                       self.act_limit * self.expl_noise,
                                       size=self.act_dim)).clip(
                                           -self.act_limit, self.act_limit)

        # Perform action
        next_obs, reward, done, _ = self.env.step(action)
        done_bool = float(
            done
        )  # if self.episode_timesteps < self.env._max_episode_steps else 0
        # Store data in replay buffer
        self.replay_buffer.add(copy.deepcopy(self.obs), action, reward,
                               next_obs, done_bool)
        self.obs = next_obs

        self.episode_reward += reward
        # Train agent after collecting sufficient data
        # TODO: extra training to compensate for the initial start_timesteps?
        if t > self.start_timesteps:
            self.train()

        if done:
            self.episode_end_handle(t)
Example no. 16
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Keep track of time step
        self.t = 0

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # update time step
        self.t += 1

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            if self.t % UPDATE_EVERY == 0:
                for _ in range(NUM_UPDATES):
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example no. 17
        masks = torch.tensor(done).float().reshape(-1, 1)

        expected = rewards + 0.99 * max_q_vals * masks
        loss = torch.mean(
            torch.pow(q_values - expected,
                      2))  #F.smooth_l1_loss(selected_q_values, expected)

        return loss


env = Env('CartPole-v0')
agent = DQN(env.sizes)

adam = optim.Adam(agent.parameters(), 1e-3)

memory = ReplayBuffer(1000)

epochs = 20000
batch_size = 32

max_eps = 1.
min_eps = 0.01
eps_decay = 8000

eps = lambda max_eps, min_eps, eps_decay, epoch: min_eps + (
    max_eps - min_eps) * np.exp(-1. * epoch / eps_decay)
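# Exponential decay from max_eps toward min_eps: with the values above,
# eps(1.0, 0.01, 8000, 0) == 1.0 and eps(1.0, 0.01, 8000, 8000) ~= 0.37.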
recap = []

episode_mean_reward = 0

for episode in range(epochs):
Example no. 18
    """액터별로 정보 평균."""
    result = {}
    for ano, infos in ainfos.items():
        infos = ActorInfo(*zip(*infos))
        tmp = ActorInfo(*np.mean(infos, axis=1))
        info = ActorInfo(tmp.episode, int(tmp.frame), tmp.reward, tmp.speed)
        result[ano] = info
    return result


log = get_logger()

if PRIORITIZED:
    memory = PrioReplayBuffer(BUFFER_SIZE)
else:
    memory = ReplayBuffer(BUFFER_SIZE)

context = zmq.Context()

# Socket for receiving from actors/learner
recv = context.socket(zmq.PULL)
recv.bind("tcp://*:5558")

# Socket for sending to the learner
learner = context.socket(zmq.REP)
learner.bind("tcp://*:5555")

actor_infos = defaultdict(lambda: deque(maxlen=300))  # Info sent by the actors

# Loop
while True:
Example no. 19
    def __init__(self, env, buffer_size=int(1e6), device=None):

        super(Random, self).__init__(env, device)
        self.replay_buffer = ReplayBuffer(self.obs_dim, self.act_dim,
                                          buffer_size)
Example no. 20

def average_actor_info(ainfos):
    """액터별로 정보 평균."""
    result = {}
    for ano, infos in ainfos.items():
        infos = ActorInfo(*zip(*infos))
        tmp = ActorInfo(*np.mean(infos, axis=1))
        info = ActorInfo(tmp.episode, int(tmp.frame), tmp.reward, tmp.speed)
        result[ano] = info
    return result


log = get_logger()

memory = ReplayBuffer(BUFFER_SIZE)

context = zmq.Context()

# Socket for receiving from actors/learner
recv = context.socket(zmq.PULL)
recv.bind("tcp://*:6558")

# Socket for sending to the learner
learner = context.socket(zmq.REP)
learner.bind("tcp://*:6555")

actor_infos = defaultdict(lambda: deque(maxlen=300))  # Info sent by the actors

# Loop
while True:
Example no. 21
class Custom(BaseAgent):
    def __init__(self,
                 env,
                 lr=1e-3,
                 gamma=0.99,
                 tau=0.005,
                 buffer_size=int(1e6),
                 start_timesteps=5000,
                 expl_noise=0.1,
                 batch_size=128,
                 policy_noise=0.2,
                 noise_clip=0.5,
                 policy_freq=2,
                 device=None,
                 **kwargs):

        super(Custom, self).__init__(env, device)

        self.actor = GaussianActor(self.obs_dim, self.act_dim, self.act_limit,
                                   **kwargs).to(self.device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr)

        self.behavior = GaussianActor(self.obs_dim, self.act_dim,
                                      self.act_limit, **kwargs).to(self.device)
        self.behavior_optimizer = torch.optim.Adam(self.behavior.parameters(),
                                                   lr=lr)

        self.critic = DoubleQvalueCritic(self.obs_dim, self.act_dim,
                                         **kwargs).to(self.device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=lr)

        self.replay_buffer = ReplayBuffer(self.obs_dim, self.act_dim,
                                          buffer_size)

        self.start_timesteps = start_timesteps
        self.expl_noise = expl_noise
        self.batch_size = batch_size
        self.lr = lr
        self.gamma = gamma
        self.tau = tau
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq
        self.total_it = 0

        self.c_loss, self.a_loss = [], []

    def act(self, obs):
        obs = torch.tensor(obs, dtype=torch.float32, device=self.device)
        return self.actor(obs, True).cpu().data.numpy().flatten()

    def behavior_init(self, iteration=1000):
        obs, action, reward, next_obs, done = self.replay_buffer.sample(
            self.batch_size)

    def train(self):

        obs, action, reward, next_obs, done = self.replay_buffer.sample(
            self.batch_size)

        self.total_it += 1

        cur_action = self.actor(obs)

        with torch.no_grad():
            # Select action according to policy and add clipped noise
            noise = (torch.randn_like(action) * self.policy_noise).clamp(
                -self.noise_clip, self.noise_clip)

            next_action = (self.actor_target(next_obs) + noise).clamp(
                -self.act_limit, self.act_limit)

            # Compute the target Q value
            target_Q1, target_Q2 = self.critic_target(next_obs, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + (1 - done) * self.gamma * target_Q

        # Get current Q estimates
        current_Q1, current_Q2 = self.critic(obs, action)

        # Compute critic loss
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(
            current_Q2, target_Q)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        c_loss = critic_loss.item()
        self.critic_optimizer.step()
        a_loss = 0
        # Delayed policy updates
        if self.total_it % self.policy_freq == 0:

            for param in self.critic.parameters():
                param.requires_grad = False

            # Compute actor loss
            actor_loss = -self.critic.Q1(obs, cur_action).mean()

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            a_loss = actor_loss.item()
            self.actor_optimizer.step()

            for param in self.critic.parameters():
                param.requires_grad = True

            # Update the frozen target models
            for param, target_param in zip(self.critic.parameters(),
                                           self.critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

            for param, target_param in zip(self.actor.parameters(),
                                           self.actor_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)
        return c_loss, a_loss

    def step(self, t):
        c, a = self.train()
        self.c_loss.append(c)
        self.a_loss.append(a)
        if t % 100 == 0:
            #self.evaluate(self.env)
            print(
                f'Iteration {t}: Critic Loss: {np.mean(self.c_loss)}, Actor Loss: {np.mean(self.a_loss)*2}'
            )
            self.c_loss, self.a_loss = [], []
        self.episode_timesteps += 1
Example no. 22
class SAC(BaseAgent):
    def __init__(self,
                 env,
                 buffer_size=int(1e6),
                 gamma=0.99,
                 tau=0.005,
                 lr=1e-3,
                 start_timesteps=1000,
                 actor_train_freq=2,
                 batch_size=128,
                 init_temperature=0.1,
                 device=None):

        super(SAC, self).__init__(env, device)
        self.actor = SquashedGaussianActor(self.obs_dim, self.act_dim,
                                           self.act_limit).to(self.device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr)

        self.critic = DoubleQvalueCritic(self.obs_dim,
                                         self.act_dim).to(self.device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=lr)

        # Adjustable alpha
        self.log_alpha = torch.tensor(np.log(init_temperature),
                                      requires_grad=True,
                                      device=self.device)
        self.target_entropy = -torch.prod(
            torch.Tensor(self.env.action_space.shape).to(self.device)).item()
        self.alpha_optimizer = torch.optim.Adam([self.log_alpha],
                                                lr=1e-4,
                                                betas=(0.5, 0.999))

        self.replay_buffer = ReplayBuffer(buffer_size)
        self.start_timesteps = start_timesteps
        self.tau = tau
        self.gamma = gamma
        self.alpha = self.log_alpha.exp()
        self.actor_train_freq = actor_train_freq
        self.batch_size = batch_size

    def offline_initialize(self, replay_buffer, epoch=1):
        conf = 2
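        # conf = number of standard deviations used later for the support bounds (mu - conf*std, mu + conf*std)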
        # PPO-style mini-batch training
        critic_losses, actor_losses = [], []
        idxes = np.arange(replay_buffer.size - 1)
        print(replay_buffer.size)
        for i in range(epoch):
            np.random.shuffle(idxes)
            for j in range(replay_buffer.size // self.batch_size):
                idx = idxes[j * self.batch_size:(j + 1) * self.batch_size]  # index mini-batches with j, not the epoch counter i
                obs, action, reward, next_obs, done, next_action = replay_buffer.sample(
                    self.batch_size, True, idx)
                # SARSA-style policy evaluation
                #with torch.no_grad():
                #    # Compute the target Q value
                #    target_Q1, target_Q2 = self.critic_target(next_obs, next_action)
                #    target_Q = torch.min(target_Q1, target_Q2)
                #    target_Q = reward + (1 - done) * self.gamma * target_Q
                ## Get current Q estimates
                #current_Q1, current_Q2 = self.critic(obs, action)
                ## Compute critic loss
                #critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)
                #critic_losses.append(critic_loss.item())
                ## Optimize the critic
                #self.critic_optimizer.zero_grad()
                #critic_loss.backward()
                #self.critic_optimizer.step()
                # Behavior cloning under entropy-regularization
                _, logprob = self.actor(obs)
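                # atanh: invert the tanh squashing so the actor's Gaussian
                # log-prob of the dataset action can be evaluated (assumes the
                # stored actions lie strictly inside (-1, 1))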
                _action = 0.5 * torch.log((1 + action) / (1 - action))
                #actor_loss = (self.alpha * logprob - self.actor.logprob(obs, _action)).mean()
                actor_loss = -self.actor.logprob(obs, _action).mean()
                #print(action, _action)
                actor_losses.append(actor_loss.item())
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                #alpha_loss = (self.log_alpha * (-logprob - self.target_entropy).detach()).mean()
                #self.alpha_optimizer.zero_grad()
                #alpha_loss.backward()
                #self.alpha_optimizer.step()
                #self.alpha = self.log_alpha.exp()
                for param, target_param in zip(
                        self.critic.parameters(),
                        self.critic_target.parameters()):
                    target_param.data.copy_(self.tau * param.data +
                                            (1 - self.tau) * target_param.data)
            # critic_losses stays empty while the critic update above is
            # commented out, so guard against np.mean of an empty list
            print(
                f'Epoch {i} Critic Loss: '
                f'{np.mean(critic_losses) if critic_losses else float("nan")}, '
                f'Actor Loss: {np.mean(actor_losses)}'
            )
            critic_losses, actor_losses = [], []

        # Approximate the support of the behavior policy with the learned (cloned)
        # policy: keep, for every transition, an interval of conf standard
        # deviations around the policy mean
        self.lower_bound = np.zeros((replay_buffer.size, self.act_dim))
        self.upper_bound = np.zeros((replay_buffer.size, self.act_dim))
        idxes = np.arange(replay_buffer.size)
        # A single pass suffices here: the cloned actor is fixed at this point
        with torch.no_grad():
            for i in range(int(np.ceil(replay_buffer.size / self.batch_size))):
                idx = idxes[i * self.batch_size:(i + 1) * self.batch_size]
                obs, action, reward, next_obs, done = replay_buffer.sample(
                    self.batch_size, with_idxes=idx)
                mu, std = self.actor.mu_std(obs)
                self.lower_bound[idx] = (mu - conf * std).cpu().numpy()
                self.upper_bound[idx] = (mu + conf * std).cpu().numpy()

    def offline_improve(self, replay_buffer, epoch=10):
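        # Re-initialize the actor, its target network and the temperature from
        # scratch; the critic and critic_target built in __init__ are reused.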

        self.actor = SquashedGaussianActor(self.obs_dim, self.act_dim,
                                           self.act_limit).to(self.device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=1e-3)
        self.actor_target = copy.deepcopy(self.actor)
        # Adjustable alpha
        self.log_alpha = torch.tensor(np.log(0.1),
                                      requires_grad=True,
                                      device=self.device)
        self.target_entropy = -torch.prod(
            torch.Tensor(self.env.action_space.shape).to(self.device)).item()
        self.alpha_optimizer = torch.optim.Adam([self.log_alpha],
                                                lr=1e-4,
                                                betas=(0.5, 0.999))

        actor_losses, critic_losses = [], []
        idxes = np.arange(replay_buffer.size - 1)
        for i in range(epoch):
            np.random.shuffle(idxes)
            for j in range(replay_buffer.size // self.batch_size):
                idx = idxes[j * self.batch_size:(j + 1) * self.batch_size]
                obs, action, reward, next_obs, done = replay_buffer.sample(
                    self.batch_size, with_idxes=idx)
                if j % 100 == 0:
                    self.evaluate(self.env)
                # Policy evaluation; several target variants were tried, the
                # active one below uses the target actor's mean (Q-learning
                # constraint)
                with torch.no_grad():
                    # No constraint
                    #next_action, logprob = self.actor(next_obs)
                    #target_Q1, target_Q2 = self.critic_target(next_obs, next_action)
                    #target_Q = torch.min(target_Q1, target_Q2)
                    #target_Q = reward + (1 - done) * self.gamma * (target_Q - self.alpha * logprob)

                    # Probabilistic constraint
                    #mu, std = self.actor.mu_std(next_obs)
                    #a, b = (self.lower_bound[idx+1] - mu)/std, (self.upper_bound[idx+1] - mu)/std
                    #dist = truncnorm(a, b, loc=mu, scale=std)
                    #next_action = torch.tensor(dist.rvs(), dtype=torch.float32, device=self.device)
                    #logprob = self.actor.logprob(next_obs, next_action)
                    #target_Q1, target_Q2 = self.critic_target(next_obs, torch.tanh(next_action))
                    #target_Q = torch.min(target_Q1, target_Q2)
                    #target_Q = reward + (1 - done) * self.gamma * (target_Q - self.alpha * logprob)

                    # Q-learning constraint: evaluate the target critic at the
                    # (squashed) mean of the target policy
                    mu, std = self.actor_target.mu_std(next_obs)
                    next_action = mu  # alternative: np.clip(mu, self.lower_bound[idx + 1], self.upper_bound[idx + 1])
                    next_action = self.act_limit * torch.tanh(next_action)
                    target_Q1, target_Q2 = self.critic_target(
                        next_obs, next_action)
                    target_Q = torch.min(target_Q1, target_Q2)
                    # Entropy term (- self.alpha * logprob) intentionally omitted
                    target_Q = reward + (1 - done) * self.gamma * target_Q
                # Get current Q estimates
                current_Q1, current_Q2 = self.critic(obs, action)
                # Compute critic loss
                critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(
                    current_Q2, target_Q)
                critic_losses.append(critic_loss.item())
                # Optimize the critic
                self.critic_optimizer.zero_grad()
                critic_loss.backward()
                self.critic_optimizer.step()
                # Actor update (DPG-style): maximize the minimum critic value at
                # the squashed policy mean

                # Freeze the critic so the actor update does not backpropagate
                # into its parameters (unfrozen again after the actor step)
                for param in self.critic.parameters():
                    param.requires_grad = False

                cur_action, _ = self.actor.mu_std(obs, False)
                cur_action = torch.tanh(cur_action)
                current_Q1, current_Q2 = self.critic(obs, cur_action)
                current_Q = torch.min(current_Q1, current_Q2)
                actor_loss = -current_Q.mean()
                #cur_action, logprob = self.actor(obs, detach=True)
                #current_Q1, current_Q2 = self.critic(obs, cur_action)
                #current_Q = torch.min(current_Q1, current_Q2)
                #actor_std_loss = (self.alpha * logprob - current_Q).mean()
                #actor_loss = actor_std_loss + actor_mu_loss

                #cur_action, logprob = self.actor(obs, detach=True)
                #current_Q1, current_Q2 = self.critic(obs, cur_action)
                #current_Q = torch.min(current_Q1, current_Q2)
                #actor_loss = (self.alpha * logprob - current_Q).mean()

                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                actor_losses.append(actor_loss.item())
                self.actor_optimizer.step()

                for param in self.critic.parameters():
                    param.requires_grad = True

                #alpha_loss = (self.log_alpha * (-logprob - 3 * self.target_entropy).detach()).mean()
                #self.alpha_optimizer.zero_grad()
                #alpha_loss.backward()
                #self.alpha_optimizer.step()
                #self.alpha = self.log_alpha.exp()
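                # Polyak-average both target networks toward their online counterparts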
                for param, target_param in zip(
                        self.critic.parameters(),
                        self.critic_target.parameters()):
                    target_param.data.copy_(self.tau * param.data +
                                            (1 - self.tau) * target_param.data)
                for param, target_param in zip(self.actor.parameters(),
                                               self.actor_target.parameters()):
                    target_param.data.copy_(self.tau * param.data +
                                            (1 - self.tau) * target_param.data)
            print(
                f'Epoch {i} Critic Loss: {np.mean(critic_losses)}, Actor Loss: {np.mean(actor_losses)}'
            )
            critic_losses, actor_losses = [], []

    def train(self, obs, action, next_obs, reward, done):

        with torch.no_grad():
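            # Entropy-regularized Bellman target:
            #   y = r + gamma * (1 - d) * (min_i Q_i'(s', a') - alpha * log pi(a'|s'))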

            next_action, logprob = self.actor(next_obs)
            # Compute the target Q value
            target_Q1, target_Q2 = self.critic_target(next_obs, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + (1 - done) * self.gamma * (
                target_Q - self.alpha * logprob)

        # Get current Q estimates
        current_Q1, current_Q2 = self.critic(obs, action)

        # Compute critic loss
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(
            current_Q2, target_Q)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        cur_action, logprob = self.actor(obs)

        # Freeze the critic so the actor update does not backpropagate into its
        # parameters (they are unfrozen again after the actor step)
        for param in self.critic.parameters():
            param.requires_grad = False

        current_Q1, current_Q2 = self.critic(obs, cur_action)
        current_Q = torch.min(current_Q1, current_Q2)

        # Actor objective: minimize E[ alpha * log pi(a|s) - min_i Q_i(s, a) ]
        actor_loss = (self.alpha * logprob - current_Q).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        for param in self.critic.parameters():
            param.requires_grad = True

        # Temperature update: adjust alpha so the policy entropy tracks target_entropy
        alpha_loss = (self.log_alpha *
                      (-logprob - self.target_entropy).detach()).mean()
        self.alpha_optimizer.zero_grad()
        alpha_loss.backward()
        self.alpha_optimizer.step()
        self.alpha = self.log_alpha.exp()

        # Polyak-average the target critic toward the online critic
        for param, target_param in zip(self.critic.parameters(),
                                       self.critic_target.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)

    def act(self, obs):
        obs = torch.tensor(obs, dtype=torch.float32, device=self.device)
        obs = obs.reshape(1, -1)
        return self.actor.act(obs, deterministic=True)

    def step(self, t):
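        # One environment interaction: act, store the transition in the replay
        # buffer and, once past the warm-up period, run gradient updates.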

        self.episode_timesteps += 1

        # Select the action according to the current policy (the random warm-up
        # exploration below is currently disabled)
        #if t < self.start_timesteps:
        #    action = self.env.action_space.sample()
        #else:
        #    action = self.actor.act(torch.tensor(self.obs, dtype=torch.float32, device=self.device))
        action = self.actor.act(
            torch.tensor(self.obs, dtype=torch.float32, device=self.device))

        # Perform action
        next_obs, reward, done, _ = self.env.step(action)
        # Treat every termination as a true terminal; to ignore time-limit
        # terminations, use instead:
        #   done_bool = float(done) if self.episode_timesteps < self.env._max_episode_steps else 0
        done_bool = float(done)
        # Store data in replay buffer
        self.replay_buffer.add(copy.deepcopy(self.obs), action, next_obs,
                               reward, done_bool)
        self.obs = next_obs
        self.episode_reward += reward

        # Train the agent once enough data has been collected; when t first
        # reaches start_timesteps, run a burst of start_timesteps updates to
        # catch up on the warm-up transitions
        if t == self.start_timesteps:
            for _ in range(self.start_timesteps):
                batch = self.replay_buffer.sample(self.batch_size)
                self.train(*batch)
        elif t > self.start_timesteps:
            batch = self.replay_buffer.sample(self.batch_size)
            self.train(*batch)

        if done:
            self.episode_end_handle(t)
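
# Usage sketch: a minimal driver loop. It assumes a classic Gym environment with
# the 4-tuple step() API and that BaseAgent initializes self.obs and the episode
# counters (as the snippets above suggest); the environment id and the horizon
# below are illustrative only.
if __name__ == '__main__':
    import gym

    env = gym.make('Pendulum-v1')
    agent = SAC(env, start_timesteps=1000, batch_size=128)
    for t in range(100_000):
        agent.step(t)  # act, store the transition, and train after the warm-up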