Example no. 1
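# Assumed imports for this listing (a sketch): GaussianActor, DoubleQvalueCritic,
# ReplayBuffer and BaseAgent are provided elsewhere in the surrounding project.
import copy

import numpy as np
import torch
import torch.nn.functional as F
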
class Custom(BaseAgent):
    def __init__(self,
                 env,
                 lr=1e-3,
                 gamma=0.99,
                 tau=0.005,
                 buffer_size=int(1e6),
                 start_timesteps=5000,
                 expl_noise=0.1,
                 batch_size=128,
                 policy_noise=0.2,
                 noise_clip=0.5,
                 policy_freq=2,
                 device=None,
                 **kwargs):

        super(Custom, self).__init__(env, device)

        self.actor = GaussianActor(self.obs_dim, self.act_dim, self.act_limit,
                                   **kwargs).to(self.device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr)

        self.behavior = GaussianActor(self.obs_dim, self.act_dim,
                                      self.act_limit, **kwargs).to(self.device)
        self.behavior_optimizer = torch.optim.Adam(self.behavior.parameters(),
                                                   lr=lr)

        self.critic = DoubleQvalueCritic(self.obs_dim, self.act_dim,
                                         **kwargs).to(self.device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=lr)

        self.replay_buffer = ReplayBuffer(self.obs_dim, self.act_dim,
                                          buffer_size)

        self.start_timesteps = start_timesteps
        self.expl_noise = expl_noise
        self.batch_size = batch_size
        self.lr = lr
        self.gamma = gamma
        self.tau = tau
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq
        self.total_it = 0

        self.c_loss, self.a_loss = [], []

    def act(self, obs):
        obs = torch.tensor(obs, dtype=torch.float32, device=self.device)
        return self.actor(obs, True).cpu().data.numpy().flatten()

    def behavior_init(self, iteration=1000):
        # NOTE: behavior-cloning pre-training of self.behavior is not implemented;
        # the sampled batch is unused and `iteration` is currently ignored.
        obs, action, reward, next_obs, done = self.replay_buffer.sample(
            self.batch_size)

    def train(self):

        obs, action, reward, next_obs, done = self.replay_buffer.sample(
            self.batch_size)

        self.total_it += 1

        cur_action = self.actor(obs)

        with torch.no_grad():
            # Select action according to policy and add clipped noise
            noise = (torch.randn_like(action) * self.policy_noise).clamp(
                -self.noise_clip, self.noise_clip)

            next_action = (self.actor_target(next_obs) + noise).clamp(
                -self.act_limit, self.act_limit)

            # Compute the target Q value
            target_Q1, target_Q2 = self.critic_target(next_obs, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + (1 - done) * self.gamma * target_Q

        # Get current Q estimates
        current_Q1, current_Q2 = self.critic(obs, action)

        # Compute critic loss
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(
            current_Q2, target_Q)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        c_loss = critic_loss.item()
        self.critic_optimizer.step()
        a_loss = 0
        # Delayed policy updates
        if self.total_it % self.policy_freq == 0:

            for param in self.critic.parameters():
                param.requires_grad = False

            # Compute actor loss
            actor_loss = -self.critic.Q1(obs, cur_action).mean()

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            a_loss = actor_loss.item()
            self.actor_optimizer.step()

            for param in self.critic.parameters():
                param.requires_grad = True

            # Update the frozen target models
            for param, target_param in zip(self.critic.parameters(),
                                           self.critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

            for param, target_param in zip(self.actor.parameters(),
                                           self.actor_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)
        return c_loss, a_loss

    def step(self, t):
        c, a = self.train()
        self.c_loss.append(c)
        self.a_loss.append(a)
        if t % 100 == 0:
            #self.evaluate(self.env)
            # a_loss entries are zero on non-update steps, so scale the mean by
            # policy_freq to report the average over actual actor updates.
            print(
                f'Iteration {t}: Critic Loss: {np.mean(self.c_loss)}, Actor Loss: {np.mean(self.a_loss) * self.policy_freq}'
            )
            self.c_loss, self.a_loss = [], []
        self.episode_timesteps += 1
Example no. 2
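# Assumed imports for this listing (a sketch): DeterministicActor, QvalueCritic,
# ReplayBuffer and BaseAgent are provided elsewhere in the surrounding project.
import copy

import numpy as np
import torch
import torch.nn.functional as F
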
class DDPG(BaseAgent):
	def __init__(self, env, lr=1e-3, gamma=0.99, tau=0.005, buffer_size=int(1e6),
	             start_timesteps=1000, expl_noise=0.1, batch_size=256,
				 device=None):

		super(DDPG, self).__init__(env, device)

		self.actor = DeterministicActor(self.obs_dim, self.act_dim, self.act_limit).to(self.device)
		self.actor_target = copy.deepcopy(self.actor)
		self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr)

		self.critic = QvalueCritic(self.obs_dim, self.act_dim).to(self.device)
		self.critic_target = copy.deepcopy(self.critic)
		self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=lr)

		self.replay_buffer = ReplayBuffer(self.obs_dim, self.act_dim, buffer_size)

		self.start_timesteps = start_timesteps
		self.expl_noise = expl_noise
		self.batch_size = batch_size
		self.lr = lr
		self.gamma = gamma
		self.tau = tau

	def act(self, obs):
		obs = torch.tensor(obs, dtype=torch.float32, device=self.device)
		return self.actor(obs).cpu().data.numpy().flatten()

	def train(self):

		obs, action, reward, next_obs, done = self.replay_buffer.sample(self.batch_size)

		# Compute the target Q value
		target_Q = self.critic_target(next_obs, self.actor_target(next_obs))
		target_Q = reward + (1 - done) * self.gamma * target_Q.detach()

		# Get current Q estimate
		current_Q = self.critic(obs, action)

		# Compute critic loss
		critic_loss = F.mse_loss(current_Q, target_Q)

		# Optimize the critic
		self.critic_optimizer.zero_grad()
		critic_loss.backward()
		self.critic_optimizer.step()

		# Compute actor loss
		actor_loss = -self.critic(obs, self.actor(obs)).mean()

		# Optimize the actor
		self.actor_optimizer.zero_grad()
		actor_loss.backward()
		self.actor_optimizer.step()

		# Update the frozen target models
		for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
			target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

		for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
			target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

	def step(self, t):

		self.episode_timesteps += 1

		# Select action randomly or according to policy
		if t < self.start_timesteps:
			action = self.env.action_space.sample()
		else:
			action = (
					self.actor.act(torch.tensor(self.obs, dtype=torch.float32, device=self.device))
					+ np.random.normal(0, self.act_limit * self.expl_noise, size=self.act_dim)
			).clip(-self.act_limit, self.act_limit)

		# Perform action
		next_obs, reward, done, _ = self.env.step(action)
		done_bool = float(done)# if self.episode_timesteps < self.env._max_episode_steps else 0
		# Store data in replay buffer
		self.replay_buffer.add(copy.deepcopy(self.obs), action, reward, next_obs, done_bool)
		self.obs = next_obs

		self.episode_reward += reward
		# Train agent after collecting sufficient data
		if t > self.start_timesteps:
			self.train()
		if done:
			self.episode_end_handle(t)
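
# A minimal usage sketch (an assumption, not part of the original code): it
# presumes BaseAgent initialises self.obs and the episode counters on
# construction/reset, and uses gym with a stand-in environment id.
#
#     import gym
#     env = gym.make('Pendulum-v1')
#     agent = DDPG(env)
#     for t in range(100_000):
#         agent.step(t)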
Example no. 3
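# Assumed imports for this listing (a sketch): DeterministicActor,
# DoubleQvalueCritic, ReplayBuffer and BaseAgent are provided elsewhere
# in the surrounding project.
import copy

import numpy as np
import torch
import torch.nn.functional as F
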
class TD3(BaseAgent):
    def __init__(self,
                 env,
                 lr=3e-4,
                 gamma=0.99,
                 tau=0.005,
                 buffer_size=int(1e6),
                 start_timesteps=1000,
                 expl_noise=0.1,
                 batch_size=100,
                 policy_noise=0.2,
                 noise_clip=0.5,
                 policy_freq=2,
                 device=None,
                 **kwargs):

        super(TD3, self).__init__(env, device)

        self.actor = DeterministicActor(self.obs_dim, self.act_dim,
                                        self.act_limit,
                                        **kwargs).to(self.device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr)

        self.critic = DoubleQvalueCritic(self.obs_dim, self.act_dim,
                                         **kwargs).to(self.device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=lr)

        self.replay_buffer = ReplayBuffer(self.obs_dim, self.act_dim,
                                          buffer_size)

        self.start_timesteps = start_timesteps
        self.expl_noise = expl_noise
        self.batch_size = batch_size
        self.lr = lr
        self.gamma = gamma
        self.tau = tau
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq
        self.total_it = 0

    def act(self, obs):
        obs = torch.tensor(obs, dtype=torch.float32, device=self.device)
        return self.actor(obs).cpu().data.numpy().flatten()

    def train(self):

        obs, action, reward, next_obs, done = self.replay_buffer.sample(
            self.batch_size)

        self.total_it += 1

        cur_action = self.actor(obs)

        with torch.no_grad():
            # Select action according to policy and add clipped noise
            noise = (torch.randn_like(action) * self.policy_noise).clamp(
                -self.noise_clip, self.noise_clip)

            next_action = (self.actor_target(next_obs) + noise).clamp(
                -self.act_limit, self.act_limit)

            # Compute the target Q value
            target_Q1, target_Q2 = self.critic_target(next_obs, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + (1 - done) * self.gamma * target_Q

        # Get current Q estimates
        current_Q1, current_Q2 = self.critic(obs, action)

        # Compute critic loss
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(
            current_Q2, target_Q)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Delayed policy updates
        if self.total_it % self.policy_freq == 0:

            for param in self.critic.parameters():
                param.requires_grad = False

            # Compute actor loss
            actor_loss = -self.critic.Q1(obs, cur_action).mean()
            #target = 1 / (2 * 0.2) * grad(self.critic.Q1(obs, cur_action).mean(), cur_action)[0].detach() + self.actor_target(obs).detach()
            #actor_loss = F.mse_loss(target, cur_action)

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            for param in self.critic.parameters():
                param.requires_grad = True

            # Update the frozen target models
            for param, target_param in zip(self.critic.parameters(),
                                           self.critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

            for param, target_param in zip(self.actor.parameters(),
                                           self.actor_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

    def step(self, t):

        self.episode_timesteps += 1

        # Select action randomly or according to policy
        if t < self.start_timesteps:
            action = self.env.action_space.sample()
        else:
            action = (self.actor.act(
                torch.tensor(self.obs, dtype=torch.float32,
                             device=self.device)) +
                      np.random.normal(0,
                                       self.act_limit * self.expl_noise,
                                       size=self.act_dim)).clip(
                                           -self.act_limit, self.act_limit)

        # Perform action
        next_obs, reward, done, _ = self.env.step(action)
        done_bool = float(
            done
        )  # if self.episode_timesteps < self.env._max_episode_steps else 0
        # Store data in replay buffer
        self.replay_buffer.add(copy.deepcopy(self.obs), action, reward,
                               next_obs, done_bool)
        self.obs = next_obs

        self.episode_reward += reward
        # Train agent after collecting sufficient data
        # TODO: extra training to compensate for init_timesteps?
        if t > self.start_timesteps:
            self.train()

        if done:
            self.episode_end_handle(t)
Example no. 4
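# Assumed imports for this listing (a sketch): SquashedGaussianActor,
# DoubleQvalueCritic, ReplayBuffer and BaseAgent are provided elsewhere
# in the surrounding project.
import copy

import numpy as np
import torch
import torch.nn.functional as F
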
class SAC(BaseAgent):
    def __init__(self,
                 env,
                 buffer_size=int(1e6),
                 gamma=0.99,
                 tau=0.005,
                 lr=1e-3,
                 start_timesteps=1000,
                 actor_train_freq=2,
                 batch_size=128,
                 init_temperature=0.1,
                 device=None):

        super(SAC, self).__init__(env, device)
        self.actor = SquashedGaussianActor(self.obs_dim, self.act_dim,
                                           self.act_limit).to(self.device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr)

        self.critic = DoubleQvalueCritic(self.obs_dim,
                                         self.act_dim).to(self.device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=lr)

        # Adjustable alpha
        self.log_alpha = torch.tensor(np.log(init_temperature),
                                      requires_grad=True,
                                      device=self.device)
        self.target_entropy = -torch.prod(
            torch.Tensor(self.env.action_space.shape).to(self.device)).item()
        self.alpha_optimizer = torch.optim.Adam([self.log_alpha],
                                                lr=1e-4,
                                                betas=(0.5, 0.999))

        self.replay_buffer = ReplayBuffer(buffer_size)
        self.start_timesteps = start_timesteps
        self.tau = tau
        self.gamma = gamma
        self.alpha = self.log_alpha.exp()
        self.actor_train_freq = actor_train_freq
        self.batch_size = batch_size

    def offline_initialize(self, replay_buffer, epoch=1):
        conf = 2
        # PPO-style mini-batch training
        critic_losses, actor_losses = [], []
        idxes = np.arange(replay_buffer.size - 1)
        print(replay_buffer.size)
        for i in range(epoch):
            np.random.shuffle(idxes)
            for j in range(replay_buffer.size // self.batch_size):
                idx = idxes[j * self.batch_size:(j + 1) * self.batch_size]
                obs, action, reward, next_obs, done, next_action = replay_buffer.sample(
                    self.batch_size, True, idx)
                # SARSA-style policy evaluation
                #with torch.no_grad():
                #    # Compute the target Q value
                #    target_Q1, target_Q2 = self.critic_target(next_obs, next_action)
                #    target_Q = torch.min(target_Q1, target_Q2)
                #    target_Q = reward + (1 - done) * self.gamma * target_Q
                ## Get current Q estimates
                #current_Q1, current_Q2 = self.critic(obs, action)
                ## Compute critic loss
                #critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)
                #critic_losses.append(critic_loss.item())
                ## Optimize the critic
                #self.critic_optimizer.zero_grad()
                #critic_loss.backward()
                #self.critic_optimizer.step()
                # Behavior cloning under entropy-regularization
                _, logprob = self.actor(obs)
                # Recover the pre-squash action with atanh (inverse of tanh)
                _action = 0.5 * torch.log((1 + action) / (1 - action))
                #actor_loss = (self.alpha * logprob - self.actor.logprob(obs, _action)).mean()
                actor_loss = -self.actor.logprob(obs, _action).mean()
                #print(action, _action)
                actor_losses.append(actor_loss.item())
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                #alpha_loss = (self.log_alpha * (-logprob - self.target_entropy).detach()).mean()
                #self.alpha_optimizer.zero_grad()
                #alpha_loss.backward()
                #self.alpha_optimizer.step()
                #self.alpha = self.log_alpha.exp()
                for param, target_param in zip(
                        self.critic.parameters(),
                        self.critic_target.parameters()):
                    target_param.data.copy_(self.tau * param.data +
                                            (1 - self.tau) * target_param.data)
            print(
                f'Epoch {i} Critic Loss: {np.mean(critic_losses)}, Actor Loss: {np.mean(actor_losses)}'
            )
            critic_losses, actor_losses = [], []

        # Approximate support with the learned policy
        self.lower_bound = np.zeros((replay_buffer.size, self.act_dim))
        self.upper_bound = np.zeros((replay_buffer.size, self.act_dim))
        idxes = np.arange(replay_buffer.size)
        for _ in range(epoch):
            for i in range(int(np.ceil(replay_buffer.size / self.batch_size))):
                idx = idxes[i * self.batch_size:(i + 1) * self.batch_size]
                obs, action, reward, next_obs, done = replay_buffer.sample(
                    self.batch_size, with_idxes=idx)
                mu, std = self.actor.mu_std(obs)
                self.lower_bound[i * self.batch_size:(i + 1) *
                                 self.batch_size] = mu - conf * std
                self.upper_bound[i * self.batch_size:(i + 1) *
                                 self.batch_size] = mu + conf * std

    def offline_improve(self, replay_buffer, epoch=10):

        self.actor = SquashedGaussianActor(self.obs_dim, self.act_dim,
                                           self.act_limit).to(self.device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=1e-3)
        self.actor_target = copy.deepcopy(self.actor)
        # Adjustable alpha
        self.log_alpha = torch.tensor(np.log(0.1),
                                      requires_grad=True,
                                      device=self.device)
        self.target_entropy = -torch.prod(
            torch.Tensor(self.env.action_space.shape).to(self.device)).item()
        self.alpha_optimizer = torch.optim.Adam([self.log_alpha],
                                                lr=1e-4,
                                                betas=(0.5, 0.999))

        actor_losses, critic_losses = [], []
        idxes = np.arange(replay_buffer.size - 1)
        for i in range(epoch):
            np.random.shuffle(idxes)
            for j in range(replay_buffer.size // self.batch_size):
                idx = idxes[j * self.batch_size:(j + 1) * self.batch_size]
                obs, action, reward, next_obs, done = replay_buffer.sample(
                    self.batch_size, with_idxes=idx)
                if j % 100 == 0:
                    self.evaluate(self.env)
                # SARSA-style policy evaluation
                with torch.no_grad():
                    # No constraint
                    #next_action, logprob = self.actor(next_obs)
                    #target_Q1, target_Q2 = self.critic_target(next_obs, next_action)
                    #target_Q = torch.min(target_Q1, target_Q2)
                    #target_Q = reward + (1 - done) * self.gamma * (target_Q - self.alpha * logprob)

                    # Probabilistic constraint
                    #mu, std = self.actor.mu_std(next_obs)
                    #a, b = (self.lower_bound[idx+1] - mu)/std, (self.upper_bound[idx+1] - mu)/std
                    #dist = truncnorm(a, b, loc=mu, scale=std)
                    #next_action = torch.tensor(dist.rvs(), dtype=torch.float32, device=self.device)
                    #logprob = self.actor.logprob(next_obs, next_action)
                    #target_Q1, target_Q2 = self.critic_target(next_obs, torch.tanh(next_action))
                    #target_Q = torch.min(target_Q1, target_Q2)
                    #target_Q = reward + (1 - done) * self.gamma * (target_Q - self.alpha * logprob)

                    # Q-learning constraint
                    mu, std = self.actor_target.mu_std(next_obs)
                    next_action = mu  #np.clip(mu, self.lower_bound[idx+1], self.upper_bound[idx+1])
                    next_action = torch.tensor(self.act_limit *
                                               np.tanh(next_action),
                                               dtype=torch.float32,
                                               device=self.device)
                    target_Q1, target_Q2 = self.critic_target(
                        next_obs, next_action)
                    target_Q = torch.min(target_Q1, target_Q2)
                    target_Q = reward + (
                        1 - done
                    ) * self.gamma * target_Q  #(target_Q - self.alpha * logprob)
                # Get current Q estimates
                current_Q1, current_Q2 = self.critic(obs, action)
                # Compute critic loss
                critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(
                    current_Q2, target_Q)
                critic_losses.append(critic_loss.item())
                # Optimize the critic
                self.critic_optimizer.zero_grad()
                critic_loss.backward()
                self.critic_optimizer.step()
                # Behavior cloning under entropy-regularization

                # TODO: freeze critic parameters here to prevent unnecessary backpropagation
                for param in self.critic.parameters():
                    param.requires_grad = False

                cur_action, _ = self.actor.mu_std(obs, False)
                cur_action = torch.tanh(cur_action)
                current_Q1, current_Q2 = self.critic(obs, cur_action)
                current_Q = torch.min(current_Q1, current_Q2)
                actor_loss = -current_Q.mean()
                #cur_action, logprob = self.actor(obs, detach=True)
                #current_Q1, current_Q2 = self.critic(obs, cur_action)
                #current_Q = torch.min(current_Q1, current_Q2)
                #actor_std_loss = (self.alpha * logprob - current_Q).mean()
                #actor_loss = actor_std_loss + actor_mu_loss

                #cur_action, logprob = self.actor(obs, detach=True)
                #current_Q1, current_Q2 = self.critic(obs, cur_action)
                #current_Q = torch.min(current_Q1, current_Q2)
                #actor_loss = (self.alpha * logprob - current_Q).mean()

                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                actor_losses.append(-current_Q.mean().item())
                self.actor_optimizer.step()

                for param in self.critic.parameters():
                    param.requires_grad = True

                #alpha_loss = (self.log_alpha * (-logprob - 3 * self.target_entropy).detach()).mean()
                #self.alpha_optimizer.zero_grad()
                #alpha_loss.backward()
                #self.alpha_optimizer.step()
                #self.alpha = self.log_alpha.exp()
                for param, target_param in zip(
                        self.critic.parameters(),
                        self.critic_target.parameters()):
                    target_param.data.copy_(self.tau * param.data +
                                            (1 - self.tau) * target_param.data)
                for param, target_param in zip(self.actor.parameters(),
                                               self.actor_target.parameters()):
                    target_param.data.copy_(self.tau * param.data +
                                            (1 - self.tau) * target_param.data)
            print(
                f'Epoch {i} Critic Loss: {np.mean(critic_losses)}, Actor Loss: {np.mean(actor_losses)}'
            )
            critic_losses, actor_losses = [], []

    def train(self, obs, action, next_obs, reward, done):

        with torch.no_grad():

            next_action, logprob = self.actor(next_obs)
            # Compute the target Q value
            target_Q1, target_Q2 = self.critic_target(next_obs, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + (1 - done) * self.gamma * (
                target_Q - self.alpha * logprob)

        # Get current Q estimates
        current_Q1, current_Q2 = self.critic(obs, action)

        # Compute critic loss
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(
            current_Q2, target_Q)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        cur_action, logprob = self.actor(obs)

        # TODO: freeze critic parameters here to prevent unnecessary backpropagation
        for param in self.critic.parameters():
            param.requires_grad = False

        current_Q1, current_Q2 = self.critic(obs, cur_action)
        current_Q = torch.min(current_Q1, current_Q2)

        actor_loss = (self.alpha * logprob - current_Q).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        for param in self.critic.parameters():
            param.requires_grad = True

        alpha_loss = (self.log_alpha *
                      (-logprob - self.target_entropy).detach()).mean()
        self.alpha_optimizer.zero_grad()
        alpha_loss.backward()
        self.alpha_optimizer.step()
        self.alpha = self.log_alpha.exp()

        for param, target_param in zip(self.critic.parameters(),
                                       self.critic_target.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)

    def act(self, obs):
        obs = torch.tensor(obs, dtype=torch.float32, device=self.device)
        obs = obs.reshape(1, -1)
        return self.actor.act(obs, deterministic=True)

    def step(self, t):

        self.episode_timesteps += 1

        # Select action randomly or according to policy
        #if t < self.start_timesteps:# or t > self.start_timesteps:
        #    action = self.env.action_space.sample()
        #else:
        #    action = self.actor.act(torch.tensor(self.obs, dtype=torch.float32, device=self.device))
        action = self.actor.act(
            torch.tensor(self.obs, dtype=torch.float32, device=self.device))

        # Perform action
        next_obs, reward, done, _ = self.env.step(action)
        #done_bool = float(done) if self.episode_timesteps < self.env._max_episode_steps else 0
        done_bool = float(
            done
        )  # if self.episode_timesteps < self.env._max_episode_steps else 0
        # Store data in replay buffer
        self.replay_buffer.add(copy.deepcopy(self.obs), action, next_obs,
                               reward, done_bool)
        self.obs = next_obs
        self.episode_reward += reward

        # Train agent after collecting sufficient data; extra training
        # iterations are run when start_timesteps is first reached
        if t == self.start_timesteps:
            for _ in range(self.start_timesteps):
                batch = self.replay_buffer.sample(self.batch_size)
                self.train(*batch)
        elif t > self.start_timesteps:
            batch = self.replay_buffer.sample(self.batch_size)
            self.train(*batch)

        if done:
            self.episode_end_handle(t)
Example no. 5
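# Assumed imports for this listing (a sketch). The first block is a fragment of
# a replay-collection loop; project-specific names (memory, learner, async_recv,
# log, average_actor_info, BufferInfo, START_SIZE, NUM_BATCH, Actor, Critic,
# OUNoise, ReplayBuffer, device and the hyperparameter constants) are defined
# elsewhere.
import pickle
import random
import time

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
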
        memory.merge(batch)
        actor_infos[actor_id].append(ainfo)

        log("receive replay - memory size {} elapse {:.2f}".format(
            len(memory),
            time.time() - st))

    # If the learner requested a batch, send it
    payload = async_recv(learner)
    if payload is not None:
        st = time.time()

        # Batch and errors sent by the learner
        if len(actor_infos) > 0:
            ainfos = average_actor_info(actor_infos)
        else:
            ainfos = None

        if len(memory) < START_SIZE:
            payload = b'not enough'
            log("not enough data - memory size {}".format(len(memory)))
        else:
            # If there is enough data, sample and send it
            binfo = BufferInfo(len(memory))
            batch = memory.sample(NUM_BATCH)
            payload = pickle.dumps((batch, ainfos, binfo))

        # Send
        learner.send(payload)
        log("send batch elapse {:.2f}".format(time.time() - st))
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Keep track of time step
        self.t = 0

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # update time step
        self.t += 1

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            if self.t % UPDATE_EVERY == 0:
                for _ in range(NUM_UPDATES):
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)