import copy

import numpy as np
import torch
import torch.nn.functional as F

# BaseAgent, GaussianActor, DeterministicActor, SquashedGaussianActor,
# QvalueCritic, DoubleQvalueCritic, and ReplayBuffer are assumed to be
# provided by the project's own modules (not shown in this excerpt).


class Custom(BaseAgent):
    def __init__(self,
                 env,
                 lr=1e-3,
                 gamma=0.99,
                 tau=0.005,
                 buffer_size=int(1e6),
                 start_timesteps=5000,
                 expl_noise=0.1,
                 batch_size=128,
                 policy_noise=0.2,
                 noise_clip=0.5,
                 policy_freq=2,
                 device=None,
                 **kwargs):
        super(Custom, self).__init__(env, device)

        self.actor = GaussianActor(self.obs_dim, self.act_dim,
                                   self.act_limit, **kwargs).to(self.device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr)

        self.behavior = GaussianActor(self.obs_dim, self.act_dim,
                                      self.act_limit, **kwargs).to(self.device)
        self.behavior_optimizer = torch.optim.Adam(self.behavior.parameters(), lr=lr)

        self.critic = DoubleQvalueCritic(self.obs_dim, self.act_dim,
                                         **kwargs).to(self.device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=lr)

        self.replay_buffer = ReplayBuffer(self.obs_dim, self.act_dim, buffer_size)

        self.start_timesteps = start_timesteps
        self.expl_noise = expl_noise
        self.batch_size = batch_size
        self.lr = lr
        self.gamma = gamma
        self.tau = tau
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq
        self.total_it = 0
        self.c_loss, self.a_loss = [], []

    def act(self, obs):
        obs = torch.tensor(obs, dtype=torch.float32, device=self.device)
        return self.actor(obs, True).cpu().data.numpy().flatten()

    def behavior_init(self, iteration=1000):
        # NOTE: behavior-policy pretraining is not implemented here;
        # this currently only draws a batch from the replay buffer.
        obs, action, reward, next_obs, done = self.replay_buffer.sample(
            self.batch_size)

    def train(self):
        obs, action, reward, next_obs, done = self.replay_buffer.sample(
            self.batch_size)
        self.total_it += 1
        cur_action = self.actor(obs)

        with torch.no_grad():
            # Select action according to policy and add clipped noise
            noise = (torch.randn_like(action) * self.policy_noise).clamp(
                -self.noise_clip, self.noise_clip)
            next_action = (self.actor_target(next_obs) + noise).clamp(
                -self.act_limit, self.act_limit)

            # Compute the target Q value
            target_Q1, target_Q2 = self.critic_target(next_obs, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + (1 - done) * self.gamma * target_Q

        # Get current Q estimates
        current_Q1, current_Q2 = self.critic(obs, action)

        # Compute critic loss
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(
            current_Q2, target_Q)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        c_loss = critic_loss.item()
        self.critic_optimizer.step()

        a_loss = 0
        # Delayed policy updates
        if self.total_it % self.policy_freq == 0:
            # Freeze critic parameters to avoid unnecessary gradients
            for param in self.critic.parameters():
                param.requires_grad = False

            # Compute actor loss
            actor_loss = -self.critic.Q1(obs, cur_action).mean()

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            a_loss = actor_loss.item()
            self.actor_optimizer.step()

            for param in self.critic.parameters():
                param.requires_grad = True

            # Update the frozen target models
            for param, target_param in zip(self.critic.parameters(),
                                           self.critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

            for param, target_param in zip(self.actor.parameters(),
                                           self.actor_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

        return c_loss, a_loss

    def step(self, t):
        c, a = self.train()
        self.c_loss.append(c)
        self.a_loss.append(a)
        if t % 100 == 0:
            # self.evaluate(self.env)
            # a_loss is zero on the delayed (non-update) iterations, so scale
            # the mean by policy_freq to report the average over actual actor updates.
            print(f'Iteration {t}: Critic Loss: {np.mean(self.c_loss)}, '
                  f'Actor Loss: {np.mean(self.a_loss) * self.policy_freq}')
            self.c_loss, self.a_loss = [], []
        self.episode_timesteps += 1
class DDPG(BaseAgent):
    def __init__(self,
                 env,
                 lr=1e-3,
                 gamma=0.99,
                 tau=0.005,
                 buffer_size=int(1e6),
                 start_timesteps=1000,
                 expl_noise=0.1,
                 batch_size=256,
                 device=None):
        super(DDPG, self).__init__(env, device)

        self.actor = DeterministicActor(self.obs_dim, self.act_dim,
                                        self.act_limit).to(self.device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr)

        self.critic = QvalueCritic(self.obs_dim, self.act_dim).to(self.device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=lr)

        self.replay_buffer = ReplayBuffer(self.obs_dim, self.act_dim, buffer_size)

        self.start_timesteps = start_timesteps
        self.expl_noise = expl_noise
        self.batch_size = batch_size
        self.lr = lr
        self.gamma = gamma
        self.tau = tau

    def act(self, obs):
        obs = torch.tensor(obs, dtype=torch.float32, device=self.device)
        return self.actor(obs).cpu().data.numpy().flatten()

    def train(self):
        obs, action, reward, next_obs, done = self.replay_buffer.sample(self.batch_size)

        # Compute the target Q value
        target_Q = self.critic_target(next_obs, self.actor_target(next_obs))
        target_Q = reward + (1 - done) * self.gamma * target_Q.detach()

        # Get current Q estimate
        current_Q = self.critic(obs, action)

        # Compute critic loss
        critic_loss = F.mse_loss(current_Q, target_Q)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Compute actor loss
        actor_loss = -self.critic(obs, self.actor(obs)).mean()

        # Optimize the actor
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update the frozen target models
        for param, target_param in zip(self.critic.parameters(),
                                       self.critic_target.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)

        for param, target_param in zip(self.actor.parameters(),
                                       self.actor_target.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)

    def step(self, t):
        self.episode_timesteps += 1

        # Select action randomly or according to policy
        if t < self.start_timesteps:
            action = self.env.action_space.sample()
        else:
            action = (self.actor.act(
                torch.tensor(self.obs, dtype=torch.float32,
                             device=self.device)) +
                      np.random.normal(0, self.act_limit * self.expl_noise,
                                       size=self.act_dim)).clip(
                                           -self.act_limit, self.act_limit)

        # Perform action
        next_obs, reward, done, _ = self.env.step(action)
        done_bool = float(done)  # if self.episode_timesteps < self.env._max_episode_steps else 0

        # Store data in replay buffer
        self.replay_buffer.add(copy.deepcopy(self.obs), action, reward, next_obs, done_bool)
        self.obs = next_obs
        self.episode_reward += reward

        # Train agent after collecting sufficient data
        if t > self.start_timesteps:
            self.train()

        if done:
            self.episode_end_handle(t)
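# Custom, DDPG, and TD3 above rely on a ReplayBuffer(obs_dim, act_dim, buffer_size)
# whose sample() returns device-resident tensors in the order
# (obs, action, reward, next_obs, done); the class itself is not shown in this
# excerpt (and SAC below constructs its buffer differently). The following is a
# minimal sketch matching that interface; the storage layout, the `device`
# argument, and the uniform sampling are assumptions, not the project's code.
import numpy as np
import torch


class ReplayBufferSketch:
    """Minimal FIFO replay buffer sketch (layout and device handling assumed)."""

    def __init__(self, obs_dim, act_dim, max_size=int(1e6), device='cpu'):
        self.max_size = max_size
        self.ptr, self.size = 0, 0
        self.device = device
        self.obs = np.zeros((max_size, obs_dim), dtype=np.float32)
        self.action = np.zeros((max_size, act_dim), dtype=np.float32)
        self.reward = np.zeros((max_size, 1), dtype=np.float32)
        self.next_obs = np.zeros((max_size, obs_dim), dtype=np.float32)
        self.done = np.zeros((max_size, 1), dtype=np.float32)

    def add(self, obs, action, reward, next_obs, done):
        # Overwrite the oldest transition once the buffer is full
        self.obs[self.ptr] = obs
        self.action[self.ptr] = action
        self.reward[self.ptr] = reward
        self.next_obs[self.ptr] = next_obs
        self.done[self.ptr] = done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        # Uniformly sample transition indices and return torch tensors
        idx = np.random.randint(0, self.size, size=batch_size)
        to_tensor = lambda x: torch.tensor(x[idx], dtype=torch.float32,
                                           device=self.device)
        return (to_tensor(self.obs), to_tensor(self.action),
                to_tensor(self.reward), to_tensor(self.next_obs),
                to_tensor(self.done))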
class TD3(BaseAgent):
    def __init__(self,
                 env,
                 lr=3e-4,
                 gamma=0.99,
                 tau=0.005,
                 buffer_size=int(1e6),
                 start_timesteps=1000,
                 expl_noise=0.1,
                 batch_size=100,
                 policy_noise=0.2,
                 noise_clip=0.5,
                 policy_freq=2,
                 device=None,
                 **kwargs):
        super(TD3, self).__init__(env, device)

        self.actor = DeterministicActor(self.obs_dim, self.act_dim,
                                        self.act_limit, **kwargs).to(self.device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr)

        self.critic = DoubleQvalueCritic(self.obs_dim, self.act_dim,
                                         **kwargs).to(self.device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=lr)

        self.replay_buffer = ReplayBuffer(self.obs_dim, self.act_dim, buffer_size)

        self.start_timesteps = start_timesteps
        self.expl_noise = expl_noise
        self.batch_size = batch_size
        self.lr = lr
        self.gamma = gamma
        self.tau = tau
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq
        self.total_it = 0

    def act(self, obs):
        obs = torch.tensor(obs, dtype=torch.float32, device=self.device)
        return self.actor(obs).cpu().data.numpy().flatten()

    def train(self):
        obs, action, reward, next_obs, done = self.replay_buffer.sample(
            self.batch_size)
        self.total_it += 1
        cur_action = self.actor(obs)

        with torch.no_grad():
            # Select action according to policy and add clipped noise
            noise = (torch.randn_like(action) * self.policy_noise).clamp(
                -self.noise_clip, self.noise_clip)
            next_action = (self.actor_target(next_obs) + noise).clamp(
                -self.act_limit, self.act_limit)

            # Compute the target Q value
            target_Q1, target_Q2 = self.critic_target(next_obs, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + (1 - done) * self.gamma * target_Q

        # Get current Q estimates
        current_Q1, current_Q2 = self.critic(obs, action)

        # Compute critic loss
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(
            current_Q2, target_Q)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Delayed policy updates
        if self.total_it % self.policy_freq == 0:
            # Freeze critic parameters to avoid unnecessary gradient computation
            for param in self.critic.parameters():
                param.requires_grad = False

            # Compute actor loss
            actor_loss = -self.critic.Q1(obs, cur_action).mean()
            # target = 1 / (2 * 0.2) * grad(self.critic.Q1(obs, cur_action).mean(), cur_action)[0].detach() + self.actor_target(obs).detach()
            # actor_loss = F.mse_loss(target, cur_action)

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            for param in self.critic.parameters():
                param.requires_grad = True

            # Update the frozen target models
            for param, target_param in zip(self.critic.parameters(),
                                           self.critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

            for param, target_param in zip(self.actor.parameters(),
                                           self.actor_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

    def step(self, t):
        self.episode_timesteps += 1

        # Select action randomly or according to policy
        if t < self.start_timesteps:
            action = self.env.action_space.sample()
        else:
            action = (self.actor.act(
                torch.tensor(self.obs, dtype=torch.float32,
                             device=self.device)) +
                      np.random.normal(0, self.act_limit * self.expl_noise,
                                       size=self.act_dim)).clip(
                                           -self.act_limit, self.act_limit)

        # Perform action
        next_obs, reward, done, _ = self.env.step(action)
        done_bool = float(done)  # if self.episode_timesteps < self.env._max_episode_steps else 0

        # Store data in replay buffer
        self.replay_buffer.add(copy.deepcopy(self.obs), action, reward, next_obs, done_bool)
        self.obs = next_obs
        self.episode_reward += reward

        # Train agent after collecting sufficient data
        # TODO: extra training to compensate for init_timesteps?
        if t > self.start_timesteps:
            self.train()

        if done:
            self.episode_end_handle(t)
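# TD3 and Custom call the critic both as critic(obs, action), expecting a pair
# (Q1, Q2), and as critic.Q1(obs, action) for the delayed actor update. The
# project's DoubleQvalueCritic is not shown; below is a minimal sketch of a twin
# Q-network with that interface. The hidden sizes and activations are assumptions.
import torch
import torch.nn as nn


class DoubleQvalueCriticSketch(nn.Module):
    """Twin Q-networks: forward() returns (Q1, Q2); Q1() returns only the first head."""

    def __init__(self, obs_dim, act_dim, hidden=256):
        super().__init__()
        self.q1 = nn.Sequential(
            nn.Linear(obs_dim + act_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 1))
        self.q2 = nn.Sequential(
            nn.Linear(obs_dim + act_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 1))

    def forward(self, obs, action):
        # Concatenate state and action and evaluate both heads
        sa = torch.cat([obs, action], dim=-1)
        return self.q1(sa), self.q2(sa)

    def Q1(self, obs, action):
        # Single-head evaluation used for the (cheaper) actor update
        return self.q1(torch.cat([obs, action], dim=-1))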
class SAC(BaseAgent):
    def __init__(self,
                 env,
                 buffer_size=int(1e6),
                 gamma=0.99,
                 tau=0.005,
                 lr=1e-3,
                 start_timesteps=1000,
                 actor_train_freq=2,
                 batch_size=128,
                 init_temperature=0.1,
                 device=None):
        super(SAC, self).__init__(env, device)

        self.actor = SquashedGaussianActor(self.obs_dim, self.act_dim,
                                           self.act_limit).to(self.device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr)

        self.critic = DoubleQvalueCritic(self.obs_dim, self.act_dim).to(self.device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=lr)

        # Adjustable alpha (entropy temperature)
        self.log_alpha = torch.tensor(np.log(init_temperature),
                                      requires_grad=True,
                                      device=self.device)
        self.target_entropy = -torch.prod(
            torch.Tensor(self.env.action_space.shape).to(self.device)).item()
        self.alpha_optimizer = torch.optim.Adam([self.log_alpha],
                                                lr=1e-4,
                                                betas=(0.5, 0.999))

        self.replay_buffer = ReplayBuffer(buffer_size)

        self.start_timesteps = start_timesteps
        self.tau = tau
        self.gamma = gamma
        self.alpha = self.log_alpha.exp()
        self.actor_train_freq = actor_train_freq
        self.batch_size = batch_size

    def offline_initialize(self, replay_buffer, epoch=1):
        conf = 2  # number of standard deviations used for the support bounds

        # PPO-style mini-batch training
        critic_losses, actor_losses = [], []
        idxes = np.arange(replay_buffer.size - 1)
        print(replay_buffer.size)
        for i in range(epoch):
            np.random.shuffle(idxes)
            for j in range(replay_buffer.size // self.batch_size):
                # Slice the shuffled indices for this mini-batch
                idx = idxes[j * self.batch_size:(j + 1) * self.batch_size]
                obs, action, reward, next_obs, done, next_action = replay_buffer.sample(
                    self.batch_size, True, idx)

                # SARSA-style policy evaluation
                # with torch.no_grad():
                #     # Compute the target Q value
                #     target_Q1, target_Q2 = self.critic_target(next_obs, next_action)
                #     target_Q = torch.min(target_Q1, target_Q2)
                #     target_Q = reward + (1 - done) * self.gamma * target_Q
                # # Get current Q estimates
                # current_Q1, current_Q2 = self.critic(obs, action)
                # # Compute critic loss
                # critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)
                # critic_losses.append(critic_loss.item())
                # # Optimize the critic
                # self.critic_optimizer.zero_grad()
                # critic_loss.backward()
                # self.critic_optimizer.step()

                # Behavior cloning under entropy regularization
                _, logprob = self.actor(obs)
                # atanh: recover the pre-squash action from the stored (tanh-ed) action
                _action = 0.5 * torch.log((1 + action) / (1 - action))
                # actor_loss = (self.alpha * logprob - self.actor.logprob(obs, _action)).mean()
                actor_loss = -self.actor.logprob(obs, _action).mean()
                # print(action, _action)
                actor_losses.append(actor_loss.item())
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # alpha_loss = (self.log_alpha * (-logprob - self.target_entropy).detach()).mean()
                # self.alpha_optimizer.zero_grad()
                # alpha_loss.backward()
                # self.alpha_optimizer.step()
                # self.alpha = self.log_alpha.exp()

                for param, target_param in zip(self.critic.parameters(),
                                               self.critic_target.parameters()):
                    target_param.data.copy_(self.tau * param.data +
                                            (1 - self.tau) * target_param.data)

            print(f'Epoch {i} Critic Loss: {np.mean(critic_losses)}, '
                  f'Actor Loss: {np.mean(actor_losses)}')
            critic_losses, actor_losses = [], []

        # Approximate the support with the learned policy
        self.lower_bound = np.zeros((replay_buffer.size, self.act_dim))
        self.upper_bound = np.zeros((replay_buffer.size, self.act_dim))
        idxes = np.arange(replay_buffer.size)
        for _ in range(epoch):
            for i in range(int(np.ceil(replay_buffer.size / self.batch_size))):
                idx = idxes[i * self.batch_size:(i + 1) * self.batch_size]
                obs, action, reward, next_obs, done = replay_buffer.sample(
                    self.batch_size, with_idxes=idx)
                mu, std = self.actor.mu_std(obs)
                self.lower_bound[i * self.batch_size:(i + 1) * self.batch_size] = mu - conf * std
                self.upper_bound[i * self.batch_size:(i + 1) * self.batch_size] = mu + conf * std

    def offline_improve(self, replay_buffer, epoch=10):
        self.actor = SquashedGaussianActor(self.obs_dim, self.act_dim,
                                           self.act_limit).to(self.device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=1e-3)
        self.actor_target = copy.deepcopy(self.actor)

        # Adjustable alpha
        self.log_alpha = torch.tensor(np.log(0.1),
                                      requires_grad=True,
                                      device=self.device)
        self.target_entropy = -torch.prod(
            torch.Tensor(self.env.action_space.shape).to(self.device)).item()
        self.alpha_optimizer = torch.optim.Adam([self.log_alpha],
                                                lr=1e-4,
                                                betas=(0.5, 0.999))

        actor_losses, critic_losses = [], []
        idxes = np.arange(replay_buffer.size - 1)
        for i in range(epoch):
            np.random.shuffle(idxes)
            for j in range(replay_buffer.size // self.batch_size):
                # Slice the shuffled indices for this mini-batch
                idx = idxes[j * self.batch_size:(j + 1) * self.batch_size]
                obs, action, reward, next_obs, done = replay_buffer.sample(
                    self.batch_size, with_idxes=idx)
                if j % 100 == 0:
                    self.evaluate(self.env)

                # SARSA-style policy evaluation
                with torch.no_grad():
                    # No constraint
                    # next_action, logprob = self.actor(next_obs)
                    # target_Q1, target_Q2 = self.critic_target(next_obs, next_action)
                    # target_Q = torch.min(target_Q1, target_Q2)
                    # target_Q = reward + (1 - done) * self.gamma * (target_Q - self.alpha * logprob)

                    # Probabilistic constraint
                    # mu, std = self.actor.mu_std(next_obs)
                    # a, b = (self.lower_bound[idx + 1] - mu) / std, (self.upper_bound[idx + 1] - mu) / std
                    # dist = truncnorm(a, b, loc=mu, scale=std)
                    # next_action = torch.tensor(dist.rvs(), dtype=torch.float32, device=self.device)
                    # logprob = self.actor.logprob(next_obs, next_action)
                    # target_Q1, target_Q2 = self.critic_target(next_obs, torch.tanh(next_action))
                    # target_Q = torch.min(target_Q1, target_Q2)
                    # target_Q = reward + (1 - done) * self.gamma * (target_Q - self.alpha * logprob)

                    # Q-learning constraint
                    mu, std = self.actor_target.mu_std(next_obs)
                    next_action = mu  # np.clip(mu, self.lower_bound[idx + 1], self.upper_bound[idx + 1])
                    next_action = torch.tensor(self.act_limit * np.tanh(next_action),
                                               dtype=torch.float32,
                                               device=self.device)
                    target_Q1, target_Q2 = self.critic_target(next_obs, next_action)
                    target_Q = torch.min(target_Q1, target_Q2)
                    target_Q = reward + (1 - done) * self.gamma * target_Q  # (target_Q - self.alpha * logprob)

                # Get current Q estimates
                current_Q1, current_Q2 = self.critic(obs, action)

                # Compute critic loss
                critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(
                    current_Q2, target_Q)
                critic_losses.append(critic_loss.item())

                # Optimize the critic
                self.critic_optimizer.zero_grad()
                critic_loss.backward()
                self.critic_optimizer.step()

                # Behavior cloning under entropy regularization
                # Freeze critic parameters to prevent unnecessary backpropagation
                for param in self.critic.parameters():
                    param.requires_grad = False

                cur_action, _ = self.actor.mu_std(obs, False)
                cur_action = torch.tanh(cur_action)
                current_Q1, current_Q2 = self.critic(obs, cur_action)
                current_Q = torch.min(current_Q1, current_Q2)
                actor_loss = -current_Q.mean()
                # cur_action, logprob = self.actor(obs, detach=True)
                # current_Q1, current_Q2 = self.critic(obs, cur_action)
                # current_Q = torch.min(current_Q1, current_Q2)
                # actor_std_loss = (self.alpha * logprob - current_Q).mean()
                # actor_loss = actor_std_loss + actor_mu_loss
                # cur_action, logprob = self.actor(obs, detach=True)
                # current_Q1, current_Q2 = self.critic(obs, cur_action)
                # current_Q = torch.min(current_Q1, current_Q2)
                # actor_loss = (self.alpha * logprob - current_Q).mean()

                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                actor_losses.append(-current_Q.mean().item())
                self.actor_optimizer.step()

                for param in self.critic.parameters():
                    param.requires_grad = True

                # alpha_loss = (self.log_alpha * (-logprob - 3 * self.target_entropy).detach()).mean()
                # self.alpha_optimizer.zero_grad()
                # alpha_loss.backward()
                # self.alpha_optimizer.step()
                # self.alpha = self.log_alpha.exp()

                for param, target_param in zip(self.critic.parameters(),
                                               self.critic_target.parameters()):
                    target_param.data.copy_(self.tau * param.data +
                                            (1 - self.tau) * target_param.data)

                for param, target_param in zip(self.actor.parameters(),
                                               self.actor_target.parameters()):
                    target_param.data.copy_(self.tau * param.data +
                                            (1 - self.tau) * target_param.data)

            print(f'Epoch {i} Critic Loss: {np.mean(critic_losses)}, '
                  f'Actor Loss: {np.mean(actor_losses)}')
            critic_losses, actor_losses = [], []

    def train(self, obs, action, next_obs, reward, done):
        with torch.no_grad():
            next_action, logprob = self.actor(next_obs)
            # Compute the target Q value
            target_Q1, target_Q2 = self.critic_target(next_obs, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + (1 - done) * self.gamma * (
                target_Q - self.alpha * logprob)

        # Get current Q estimates
        current_Q1, current_Q2 = self.critic(obs, action)

        # Compute critic loss
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(
            current_Q2, target_Q)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        cur_action, logprob = self.actor(obs)

        # Freeze critic parameters to prevent unnecessary backpropagation
        for param in self.critic.parameters():
            param.requires_grad = False

        current_Q1, current_Q2 = self.critic(obs, cur_action)
        current_Q = torch.min(current_Q1, current_Q2)
        actor_loss = (self.alpha * logprob - current_Q).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        for param in self.critic.parameters():
            param.requires_grad = True

        alpha_loss = (self.log_alpha *
                      (-logprob - self.target_entropy).detach()).mean()
        self.alpha_optimizer.zero_grad()
        alpha_loss.backward()
        self.alpha_optimizer.step()
        self.alpha = self.log_alpha.exp()

        for param, target_param in zip(self.critic.parameters(),
                                       self.critic_target.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)

    def act(self, obs):
        obs = torch.tensor(obs, dtype=torch.float32, device=self.device)
        obs = obs.reshape(1, -1)
        return self.actor.act(obs, deterministic=True)

    def step(self, t):
        self.episode_timesteps += 1

        # Select action randomly or according to policy
        # if t < self.start_timesteps:  # or t > self.start_timesteps:
        #     action = self.env.action_space.sample()
        # else:
        #     action = self.actor.act(torch.tensor(self.obs, dtype=torch.float32, device=self.device))
        action = self.actor.act(
            torch.tensor(self.obs, dtype=torch.float32, device=self.device))

        # Perform action
        next_obs, reward, done, _ = self.env.step(action)
        done_bool = float(done)  # if self.episode_timesteps < self.env._max_episode_steps else 0

        # Store data in replay buffer
        self.replay_buffer.add(copy.deepcopy(self.obs), action, next_obs, reward, done_bool)
        self.obs = next_obs
        self.episode_reward += reward

        # Train agent after collecting sufficient data; extra training
        # iterations are run once start_timesteps is first reached
        if t == self.start_timesteps:
            for _ in range(self.start_timesteps):
                batch = self.replay_buffer.sample(self.batch_size)
                self.train(*batch)
        elif t > self.start_timesteps:
            batch = self.replay_buffer.sample(self.batch_size)
            self.train(*batch)

        if done:
            self.episode_end_handle(t)
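# SAC above expects a SquashedGaussianActor whose forward() returns
# (action, log_prob) and whose act() returns a numpy action (optionally
# deterministic). The project's actor is not included here; the sketch below
# covers only that part of the interface (mu_std() and logprob() are omitted),
# and the hidden sizes, log-std clamping, and act_limit scaling are assumptions.
import torch
import torch.nn as nn
from torch.distributions import Normal


class SquashedGaussianActorSketch(nn.Module):
    """Tanh-squashed Gaussian policy sketch with the change-of-variables
    log-prob correction used by SAC-style critics."""

    def __init__(self, obs_dim, act_dim, act_limit, hidden=256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU())
        self.mu_layer = nn.Linear(hidden, act_dim)
        self.log_std_layer = nn.Linear(hidden, act_dim)
        self.act_limit = act_limit

    def forward(self, obs):
        h = self.net(obs)
        mu = self.mu_layer(h)
        log_std = self.log_std_layer(h).clamp(-20, 2)
        dist = Normal(mu, log_std.exp())
        pre_tanh = dist.rsample()  # reparameterized sample
        action = torch.tanh(pre_tanh)
        # Gaussian log-prob plus the tanh change-of-variables correction
        logprob = dist.log_prob(pre_tanh).sum(-1, keepdim=True)
        logprob -= torch.log(1 - action.pow(2) + 1e-6).sum(-1, keepdim=True)
        return self.act_limit * action, logprob

    @torch.no_grad()
    def act(self, obs, deterministic=False):
        if deterministic:
            mu = self.mu_layer(self.net(obs))
            return (self.act_limit * torch.tanh(mu)).cpu().numpy().flatten()
        action, _ = self.forward(obs)
        return action.cpu().numpy().flatten()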
        memory.merge(batch)
        actor_infos[actor_id].append(ainfo)
        log("receive replay - memory size {} elapse {:.2f}".format(
            len(memory), time.time() - st))

    # If the learner requested a batch, send one
    payload = async_recv(learner)
    if payload is not None:
        st = time.time()
        # Batch and error info sent back to the learner
        if len(actor_infos) > 0:
            ainfos = average_actor_info(actor_infos)
        else:
            ainfos = None

        if len(memory) < START_SIZE:
            payload = b'not enough'
            log("not enough data - memory size {}".format(len(memory)))
        else:
            # Enough data: sample a batch and send it
            binfo = BufferInfo(len(memory))
            batch = memory.sample(NUM_BATCH)
            payload = pickle.dumps((batch, ainfos, binfo))

        # Send to the learner
        learner.send(payload)
        log("send batch elapse {:.2f}".format(time.time() - st))
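# async_recv is not defined in this excerpt. Given the learner.send(payload)
# call above, it presumably wraps a non-blocking receive on a ZeroMQ socket and
# returns None when no request is pending; a sketch under that assumption:
import zmq


def async_recv_sketch(sock):
    """Non-blocking receive: return the pending message, or None if the socket
    has nothing to deliver right now (hypothetical helper)."""
    try:
        return sock.recv(flags=zmq.NOBLOCK)
    except zmq.Again:
        return None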
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Keep track of time step
        self.t = 0

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Update time step
        self.t += 1

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            if self.t % UPDATE_EVERY == 0:
                for _ in range(NUM_UPDATES):
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
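# The OUNoise process used by Agent is not included in this excerpt. Below is a
# common Ornstein-Uhlenbeck implementation matching the OUNoise(action_size, seed)
# constructor and the reset()/sample() calls above; the mu, theta, and sigma
# defaults are assumptions.
import copy
import random

import numpy as np


class OUNoiseSketch:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.seed = random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new state as noise."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state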