Code example #1: DQN (Deep Q-Network) agent
# Assumed imports for this excerpt; Memory (replay buffer) and Network (the
# convolutional Q-network) are project-local modules defined elsewhere.
import os
from datetime import datetime

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
from PIL import Image
from torch.utils.tensorboard import SummaryWriter
from torchvision import transforms


class DQN:
    def __init__(self,
                 env_name,
                 env,
                 batch_size=32,
                 replay_memory_size=1e6,
                 history_size=4,
                 target_net_update_frequency=1e4,
                 gamma=0.99,
                 action_repeat=4,
                 lr=0.00025,
                 gradient_momentum=0.95,
                 initial_epsilon=1,
                 final_epsilon=0.1,
                 epsilon_decay_step=1e6,
                 warmup_step=5e4,
                 save_model_frequency=20,
                 eval_frequency=1):
        self.env_name = env_name
        self.env = env
        self.batch_size = batch_size
        self.replay_memory_size = replay_memory_size
        self.history_size = history_size
        self.target_net_update_frequency = target_net_update_frequency
        self.gamma = gamma
        self.action_repeat = action_repeat
        self.lr = lr
        self.gradient_momentum = gradient_momentum
        self.initial_epsilon = initial_epsilon
        self.final_epsilon = final_epsilon
        self.epsilon_decay_step = epsilon_decay_step
        self.warmup_step = warmup_step
        self.save_model_frequency = save_model_frequency
        self.eval_frequency = eval_frequency

        self.epsilon = self.initial_epsilon
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        print('Train on device:', self.device)

        self.memory = Memory(int(replay_memory_size), batch_size)
        self.net = Network(self.env.action_space.n).to(self.device)
        print(self.net)
        self.target_net = Network(self.env.action_space.n).to(self.device)
        self.update_model(self.target_net, self.net)
        self.opt = optim.RMSprop(self.net.parameters(),
                                 lr=self.lr,
                                 alpha=self.gradient_momentum)
        self.writer = SummaryWriter('./logs/DQN_{}_{}'.format(
            datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), self.env_name))
        self.loss_fn = F.mse_loss

        self.total_step = 0

    def select_action(self, state, is_test=False):
        epsilon = 0.05 if is_test else self.epsilon
        if np.random.uniform(0, 1) < epsilon:
            a = self.env.action_space.sample()
        else:
            state = torch.tensor(state, dtype=torch.float).unsqueeze(0).to(
                self.device)
            a = self.net(state).cpu().detach().numpy().argmax()
        return a

    def learn(self):
        batch = self.memory.sample()
        loss = self.compute_loss(batch)
        self.opt.zero_grad()
        loss.backward()
        # clip each gradient element to [-1, 1] to stabilize training
        for p in filter(lambda p: p.grad is not None, self.net.parameters()):
            p.grad.data.clamp_(min=-1, max=1)

        self.opt.step()

    def compute_loss(self, batch):
        batch_state, batch_action, batch_reward, batch_next_state, batch_done = \
            batch.state, batch.action, batch.reward, batch.next_state, batch.done
        batch_state = torch.tensor(batch_state,
                                   dtype=torch.float).to(self.device)
        batch_action = torch.tensor(batch_action,
                                    dtype=torch.long).to(self.device)
        batch_reward = torch.tensor(batch_reward,
                                    dtype=torch.float).to(self.device)
        batch_next_state = torch.tensor(batch_next_state,
                                        dtype=torch.float).to(self.device)
        batch_mask = torch.tensor([not i for i in batch_done],
                                  dtype=torch.float).to(self.device)

        # Q(s, a) for the actions actually taken; gather keeps the computation
        # graph intact so the loss backpropagates into self.net.
        pred_q = self.net(batch_state)
        q = pred_q.gather(1, batch_action.unsqueeze(1)).squeeze(1)

        # Bootstrapped target r + gamma * max_a' Q_target(s', a'), with the
        # bootstrap term masked out on terminal transitions; the target network
        # is excluded from the graph.
        with torch.no_grad():
            max_next_q = self.target_net(batch_next_state).max(dim=1)[0]
            target_q = batch_reward + batch_mask * self.gamma * max_next_q

        loss = self.loss_fn(q, target_q)
        # self.writer.add_scalar('loss', loss, self.total_step)

        return loss

    def train(self, epochs):
        for epoch in range(epochs):
            s = self.env.reset()
            s = self.preprocess(s)
            s = np.stack((s[0], s[0], s[0], s[0]), axis=0)
            while True:
                # self.env.render()

                if self.total_step < self.warmup_step:
                    a = self.env.action_space.sample()
                else:
                    a = self.select_action(s)

                s_, r, done, _ = self.env.step(a)
                s_ = self.preprocess(s_)
                s_ = np.stack((s[1], s[2], s[3], s_[0]), axis=0)
                if r > 0:
                    r = 1
                elif r < 0:
                    r = -1
                self.memory.push(s, a, r, s_, done)
                s = s_
                self.total_step += 1
                # linearly anneal epsilon towards final_epsilon over epsilon_decay_step steps
                self.epsilon = max(
                    self.final_epsilon,
                    self.initial_epsilon - (self.initial_epsilon - self.final_epsilon) *
                    self.total_step / self.epsilon_decay_step)

                if self.total_step % self.target_net_update_frequency == 0:
                    self.update_model(self.target_net, self.net)

                if len(self.memory) >= self.batch_size:
                    self.learn()

                if done:
                    break

            if (epoch + 1) % self.save_model_frequency == 0:
                self.save_model(self.net, 'model/model_DQN_{}'.format(epoch))

            if (epoch + 1) % self.eval_frequency == 0:
                eval_r = self.evaluate()
                print('epoch', epoch, 'reward', eval_r)
                self.writer.add_scalar('reward', eval_r, epoch)

    def preprocess(self, img):
        img = Image.fromarray(img)
        img_preprocess = transforms.Compose([
            transforms.Grayscale(1),
            transforms.Resize((84, 84)),
            transforms.CenterCrop(84),
            transforms.ToTensor(),
        ])
        img = img_preprocess(img)
        return img.numpy()

    def save_model(self, model, path):
        p = os.path.dirname(path)
        if not os.path.exists(p):
            os.mkdir(p)
        torch.save(model.state_dict(), path)

    def load_model(self, model, path):
        model.load_state_dict(torch.load(path))

    def update_model(self, target_model, model, tau=1):
        for target_param, param in zip(target_model.parameters(),
                                       model.parameters()):
            target_param.data.copy_(target_param.data * (1 - tau) +
                                    param.data * tau)

    def evaluate(self, epochs=3):
        total_r = 0
        for _ in range(epochs):
            s = self.env.reset()
            s = self.preprocess(s)
            s = np.stack((s[0], s[0], s[0], s[0]), axis=0)
            while True:
                a = self.select_action(s, is_test=True)
                s_, r, done, _ = self.env.step(a)
                s_ = self.preprocess(s_)
                s_ = np.stack((s[1], s[2], s[3], s_[0]), axis=0)
                total_r += r
                s = s_
                if done:
                    break

        return total_r / epochs
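
A minimal usage sketch for the DQN class above, assuming the old gym API that the class expects (reset returning only the observation, step returning four values); the environment id and epoch count are placeholders rather than values from the original code.

import gym

env = gym.make('Breakout-v0')    # placeholder Atari id; any image-observation env works
agent = DQN('Breakout-v0', env)  # constructor defaults are the hyperparameters shown above
agent.train(epochs=10000)        # trains, evaluates every eval_frequency epochs, saves checkpoints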
Code example #2: DDPG (Deep Deterministic Policy Gradient) agent
# Assumed imports for this excerpt; Memory, OUNoise, DeterministicActor, Critic,
# update_model and save_model are project-local helpers defined elsewhere.
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter


class DDPG:

    def __init__(self,
                 env_name,
                 env,
                 actor_lr=3e-4,
                 critic_lr=3e-3,
                 gamma=0.99,
                 batch_size=32,
                 replay_memory_size=1e6,
                 is_test=False,
                 save_model_frequency=200,
                 eval_frequency=10):
        self.env_name = env_name
        self.env = env
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.gamma = gamma
        self.batch_size = batch_size
        self.replay_memory_size = replay_memory_size
        self.save_model_frequency = save_model_frequency
        self.eval_frequency = eval_frequency

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print('Train on device:', self.device)
        if not is_test:
            self.writer = SummaryWriter('./logs/DDPG_{}'.format(self.env_name))
        self.loss_fn = F.mse_loss
        self.memory = Memory(int(replay_memory_size), batch_size)

        n_state, n_action = env.observation_space.shape[0], env.action_space.shape[0]
        self.noise = OUNoise(n_action)

        self.actor = DeterministicActor(n_state, n_action,
                                        action_scale=int(env.action_space.high[0])).to(self.device)
        self.target_actor = DeterministicActor(n_state, n_action,
                                               action_scale=int(env.action_space.high[0])).to(self.device)
        update_model(self.target_actor, self.actor)
        self.actor_opt = optim.Adam(self.actor.parameters(), lr=self.actor_lr)

        self.critic = Critic(n_state + n_action).to(self.device)
        self.target_critic = Critic(n_state + n_action).to(self.device)
        update_model(self.target_critic, self.critic)
        self.critic_opt = optim.Adam(self.critic.parameters(), lr=self.critic_lr)

        print(self.actor)
        print(self.critic)

    def select_action(self, state, is_test=False):
        state = torch.tensor(state, dtype=torch.float).to(self.device)
        if is_test:
            a = self.actor(state)
        else:
            a = self.actor(state) + torch.tensor(self.noise(), dtype=torch.float).to(self.device)
            a = a.clip(-self.actor.action_scale, self.actor.action_scale)
        return a.cpu().detach().numpy()

    def train(self, epochs):
        best_eval = -1e6
        for epoch in range(epochs):
            s = self.env.reset()
            policy_loss, critic_loss = 0, 0
            while True:
                self.env.render()
                a = self.select_action(s)
                s_, r, done, _ = self.env.step(a)
                self.memory.push(s, a, r, s_, done)
                if len(self.memory) > self.batch_size:
                    policy_loss, critic_loss = self.learn()
                s = s_
                if done:
                    break

            self.writer.add_scalar('loss/actor_loss', policy_loss, epoch)
            self.writer.add_scalar('loss/critic_loss', critic_loss, epoch)

            if (epoch + 1) % self.save_model_frequency == 0:
                save_model(self.critic, 'model/{}_model/critic_{}'.format(self.env_name, epoch))
                save_model(self.actor, 'model/{}_model/actor_{}'.format(self.env_name, epoch))

            if (epoch + 1) % self.eval_frequency == 0:
                eval_r = self.evaluate()
                print('epoch', epoch, 'evaluate reward', eval_r)
                self.writer.add_scalar('reward', eval_r, epoch)
                if eval_r > best_eval:
                    best_eval = eval_r
                    save_model(self.critic, 'model/{}_model/best_critic'.format(self.env_name))
                    save_model(self.actor, 'model/{}_model/best_actor'.format(self.env_name))

    def learn(self):
        batch = self.memory.sample()
        batch_state, batch_action, batch_reward, batch_next_state, batch_done = \
            batch.state, batch.action, batch.reward, batch.next_state, batch.done
        batch_state = torch.tensor(batch_state, dtype=torch.float).to(self.device)
        batch_action = torch.tensor(batch_action, dtype=torch.float).reshape(self.batch_size, -1).to(self.device)
        batch_reward = torch.tensor(batch_reward, dtype=torch.float).reshape(self.batch_size, -1).to(self.device)
        batch_next_state = torch.tensor(batch_next_state, dtype=torch.float).to(self.device)
        batch_mask = torch.tensor([not i for i in batch_done], dtype=torch.bool).reshape(self.batch_size, -1).to(self.device)

        # update critic: compute the Bellman target without tracking gradients so
        # the backward pass only touches the online critic
        pred_q = self.critic(torch.cat((batch_state, batch_action), dim=-1))
        with torch.no_grad():
            next_action = self.target_actor(batch_next_state)
            next_q = self.target_critic(torch.cat((batch_next_state, next_action), dim=-1))
            pred_target_q = batch_reward + batch_mask * self.gamma * next_q
        critic_loss = self.loss_fn(pred_q, pred_target_q)

        self.critic_opt.zero_grad()
        critic_loss.backward()
        self.critic_opt.step()

        # update actor
        policy_loss = - self.critic(torch.cat((batch_state, self.actor(batch_state)), dim=-1)).mean()
        self.actor_opt.zero_grad()
        policy_loss.backward()
        self.actor_opt.step()

        # update target
        update_model(self.target_critic, self.critic, 0.05)
        update_model(self.target_actor, self.actor, 0.05)

        return policy_loss.item(), critic_loss.item()

    def evaluate(self, epochs=3, is_render=False):
        eval_r = 0
        for _ in range(epochs):
            s = self.env.reset()
            while True:
                if is_render:
                    self.env.render()
                with torch.no_grad():
                    a = self.select_action(s, is_test=True)
                s_, r, done, _ = self.env.step(a)
                s = s_
                eval_r += r
                if done:
                    break
        return eval_r / epochs
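
A minimal usage sketch for the DDPG class above, assuming a continuous-control gym environment with a symmetric action range; the environment id is a placeholder.

import gym

env = gym.make('Pendulum-v0')    # placeholder id with a Box action space
agent = DDPG('Pendulum-v0', env)
agent.train(epochs=1000)         # writes losses and evaluation rewards to ./logs/DDPG_Pendulum-v0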
Code example #3: SAC (Soft Actor-Critic) agent
# Assumed imports for this excerpt; Memory, ZFilter, SACGaussianActor, TwinCritic,
# update_model and save_model are project-local helpers defined elsewhere.
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter


class SAC:
    def __init__(self,
                 env_name,
                 env,
                 actor_lr=3e-4,
                 critic_lr=3e-4,
                 alpha_lr=3e-4,
                 gamma=0.99,
                 batch_size=64,
                 replay_memory_size=1e6,
                 update_frequency=2,
                 warmup_step=1e3,
                 tau=0.005,
                 alpha=None,
                 is_test=False,
                 save_model_frequency=200,
                 eval_frequency=10,
                 save_log_frequency=10):
        self.env_name = env_name
        self.env = env
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.alpha_lr = alpha_lr
        self.gamma = gamma
        self.batch_size = batch_size
        self.replay_memory_size = replay_memory_size
        self.update_frequency = update_frequency
        self.warmup_step = warmup_step
        self.tau = tau
        self.save_model_frequency = save_model_frequency
        self.eval_frequency = eval_frequency
        self.save_log_frequency = save_log_frequency

        self.total_step = 0
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        print('Train on device:', self.device)
        if not is_test:
            self.writer = SummaryWriter('./logs/SAC_{}'.format(self.env_name))
        self.loss_fn = F.mse_loss
        self.memory = Memory(int(replay_memory_size), batch_size)

        n_state = env.observation_space.shape[0]
        n_action = env.action_space.shape[0]
        self.state_normalize = ZFilter(n_state)
        if alpha is None:
            self.auto_tune_alpha = True
            self.target_entropy = -torch.prod(
                torch.Tensor(env.action_space.shape)).item()
            self.log_alpha = torch.zeros(1,
                                         requires_grad=True,
                                         device=self.device)
            self.alpha_opt = optim.Adam([self.log_alpha], lr=self.alpha_lr)
            print('Auto adjust alpha')
        else:
            self.auto_tune_alpha = False
            self.log_alpha = torch.log(torch.tensor(
                alpha, dtype=torch.float)).to(self.device)
            print('Fixed alpha')

        self.actor = SACGaussianActor(
            n_state, n_action, 256,
            action_scale=int(env.action_space.high[0])).to(self.device)
        self.actor_opt = optim.Adam(self.actor.parameters(), lr=self.actor_lr)
        self.critic = TwinCritic(n_state + n_action, 256).to(self.device)
        self.critic_opt = optim.Adam(self.critic.parameters(),
                                     lr=self.critic_lr)
        self.target_critic = TwinCritic(n_state + n_action,
                                        256).to(self.device)
        update_model(self.target_critic, self.critic)

        print(self.actor)
        print(self.critic)

    def select_action(self, state, is_test=False):
        state = torch.tensor(state, dtype=torch.float).to(self.device)
        # BUG: it would be better to unsqueeze the state to add a batch dimension
        # before the forward pass, so its shape matches the batched inputs used in learn()
        a, log_prob = self.actor.sample(state, is_test)
        return a.cpu().detach().numpy(), log_prob.cpu().detach().numpy()

    def train(self, epochs):
        best_eval = -1e6
        for epoch in range(epochs):
            s = self.env.reset()
            s = self.state_normalize(s)
            policy_loss, critic_loss, alpha_loss = 0, 0, 0
            while True:
                self.env.render()
                a, _ = self.select_action(s)
                s_, r, done, _ = self.env.step(a)
                s_ = self.state_normalize(s_)
                self.memory.push(s, a, r, s_, done)
                self.total_step += 1
                if (len(self.memory) > self.batch_size
                        and self.total_step > self.warmup_step):
                    policy_loss, critic_loss, alpha_loss = self.learn()

                s = s_
                if done:
                    break

            if (epoch + 1) % self.save_log_frequency == 0:
                self.writer.add_scalar('loss/critic_loss', critic_loss,
                                       self.total_step)
                self.writer.add_scalar('loss/policy_loss', policy_loss,
                                       self.total_step)
                self.writer.add_scalar('alpha',
                                       self.log_alpha.exp().item(),
                                       self.total_step)
                self.writer.add_scalar('loss/alpha_loss', alpha_loss,
                                       self.total_step)

            if (epoch + 1) % self.save_model_frequency == 0:
                save_model(
                    self.critic,
                    'model/{}_model/critic_{}'.format(self.env_name, epoch))
                save_model(
                    self.actor,
                    'model/{}_model/actor_{}'.format(self.env_name, epoch))
                ZFilter.save(
                    self.state_normalize,
                    'model/{}_model/rs_{}'.format(self.env_name, epoch))

            if (epoch + 1) % self.eval_frequency == 0:
                eval_r = self.evaluate()
                print('epoch', epoch, 'evaluate reward', eval_r)
                self.writer.add_scalar('reward', eval_r, self.total_step)
                if eval_r > best_eval:
                    best_eval = eval_r
                    save_model(
                        self.critic,
                        'model/{}_model/best_critic'.format(self.env_name))
                    save_model(
                        self.actor,
                        'model/{}_model/best_actor'.format(self.env_name))
                    ZFilter.save(
                        self.state_normalize,
                        'model/{}_model/best_rs'.format(self.env_name))

    def learn(self):
        batch = self.memory.sample()
        batch_state, batch_action, batch_reward, batch_next_state, batch_done = \
            batch.state, batch.action, batch.reward, batch.next_state, batch.done
        batch_state = torch.tensor(batch_state,
                                   dtype=torch.float).to(self.device)
        batch_action = torch.tensor(batch_action, dtype=torch.float).reshape(
            self.batch_size, -1).to(self.device)
        batch_reward = torch.tensor(batch_reward, dtype=torch.float).reshape(
            self.batch_size, -1).to(self.device)
        batch_next_state = torch.tensor(batch_next_state,
                                        dtype=torch.float).to(self.device)
        batch_mask = torch.tensor([not i for i in batch_done],
                                  dtype=torch.bool).reshape(
                                      self.batch_size, -1).to(self.device)

        alpha = self.log_alpha.exp()
        # update critic
        with torch.no_grad():
            next_action, next_log_prob = self.actor.sample(batch_next_state)
            next_log_prob = next_log_prob.sum(1, keepdim=True)
            next_input = torch.cat([batch_next_state, next_action], dim=-1)
            target_q1, target_q2 = self.target_critic(next_input)
            target_q = batch_reward + batch_mask * self.gamma * (
                torch.min(target_q1, target_q2) - alpha * next_log_prob)

        q1, q2 = self.critic(torch.cat([batch_state, batch_action], dim=-1))
        critic_loss_1 = self.loss_fn(q1, target_q)
        critic_loss_2 = self.loss_fn(q2, target_q)
        critic_loss = critic_loss_1 + critic_loss_2

        self.critic_opt.zero_grad()
        critic_loss.backward()
        self.critic_opt.step()

        # update actor
        batch_pi, batch_pi_log_prob = self.actor.sample(batch_state)
        q1, q2 = self.critic(torch.cat([batch_state, batch_pi], dim=-1))
        batch_pi_log_prob = batch_pi_log_prob.sum(1, keepdim=True)
        policy_loss = (alpha * batch_pi_log_prob - torch.min(q1, q2)).mean()
        self.actor_opt.zero_grad()
        policy_loss.backward()
        self.actor_opt.step()

        # update alpha
        if self.auto_tune_alpha:
            alpha_loss = -(
                self.log_alpha *
                (batch_pi_log_prob + self.target_entropy).detach()).mean()
            self.alpha_opt.zero_grad()
            alpha_loss.backward()
            self.alpha_opt.step()
        else:
            alpha_loss = torch.tensor(0)

        if (self.total_step + 1) % self.update_frequency == 0:
            update_model(self.target_critic, self.critic, self.tau)

        return policy_loss.item(), critic_loss.item(), alpha_loss.item()

    def evaluate(self, epochs=3, is_render=False):
        eval_r = 0
        for _ in range(epochs):
            s = self.env.reset()
            s = self.state_normalize(s, update=False)
            while True:
                if is_render:
                    self.env.render()
                a, _ = self.select_action(s, is_test=True)
                s_, r, done, _ = self.env.step(a)
                s_ = self.state_normalize(s_, update=False)
                s = s_
                eval_r += r
                if done:
                    break
        return eval_r / epochs
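
A minimal usage sketch for the SAC class above, again on a placeholder continuous-control environment; leaving alpha=None enables the automatic temperature tuning branch shown in the constructor.

import gym

env = gym.make('HalfCheetah-v2')    # placeholder MuJoCo id; any Box action space works
agent = SAC('HalfCheetah-v2', env)  # alpha=None -> the entropy temperature is learned
agent.train(epochs=3000)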
Code example #4: parallel sample-collection worker (collect_samples)
# Assumed imports for this excerpt; Memory is a project-local trajectory buffer.
import numpy as np
import torch
from torch import tensor


def collect_samples(pid, queue, env, policy, custom_reward, mean_action,
                    render, running_state, min_batch_size):
    if pid > 0:
        torch.manual_seed(torch.randint(0, 5000, (1, )) * pid)
        if hasattr(env, 'np_random'):
            env.np_random.seed(env.np_random.randint(5000) * pid)
        if hasattr(env, 'env') and hasattr(env.env, 'np_random'):
            env.env.np_random.seed(env.env.np_random.randint(5000) * pid)
    log = dict()
    memory = Memory()
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0

    while num_steps < min_batch_size:
        state = env.reset()
        if running_state is not None:
            state = running_state(state)
        reward_episode = 0

        for t in range(10000):
            state_var = tensor(state).unsqueeze(0)
            with torch.no_grad():
                if mean_action:
                    action = policy(state_var)[0][0].numpy()
                else:
                    action = policy.select_action(state_var)[0].numpy()
            action = int(action) if policy.is_disc_action else action.astype(
                np.float64)
            next_state, reward, done, _ = env.step(action)
            reward_episode += reward
            if running_state is not None:
                next_state = running_state(next_state)

            if custom_reward is not None:
                reward = custom_reward(state, action)
                total_c_reward += reward
                min_c_reward = min(min_c_reward, reward)
                max_c_reward = max(max_c_reward, reward)

            mask = 0 if done else 1

            memory.push(state, action, mask, next_state, reward)

            if render:
                env.render()
            if done:
                break

            state = next_state

        # log stats
        num_steps += (t + 1)
        num_episodes += 1
        total_reward += reward_episode
        min_reward = min(min_reward, reward_episode)
        max_reward = max(max_reward, reward_episode)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
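
A single-process usage sketch for collect_samples: with queue=None the function returns the rollout directly, whereas passing a multiprocessing queue lets several worker processes contribute to one batch. The env, policy and running_state objects are whatever the surrounding project provides; the batch size below is illustrative.

memory, log = collect_samples(pid=0, queue=None, env=env, policy=policy,
                              custom_reward=None, mean_action=False,
                              render=False, running_state=running_state,
                              min_batch_size=2048)
print(log['num_episodes'], log['avg_reward'])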