Example no. 1
    def __init__(self,
                 env,
                 log_dir,
                 shared_memory,
                 shared_weights,
                 batch_size=64,
                 lr=0.0003,
                 memory_size=1e5,
                 gamma=0.99,
                 tau=0.005,
                 multi_step=3,
                 per=True,
                 alpha=0.6,
                 beta=0.4,
                 beta_annealing=0.001,
                 grad_clip=5.0,
                 update_per_steps=4,
                 start_steps=1000,
                 log_interval=1,
                 memory_load_interval=5,
                 target_update_interval=1,
                 model_save_interval=5,
                 eval_interval=1000,
                 cuda=True,
                 seed=0):
        self.env = env
        torch.manual_seed(seed)
        np.random.seed(seed)
        self.env.seed(seed)

        self.shared_memory = shared_memory
        self.shared_weights = shared_weights

        self.device = torch.device(
            "cuda" if cuda and torch.cuda.is_available() else "cpu")

        self.policy = ConvCategoricalPolicy(
            self.env.observation_space.shape[0],
            self.env.action_space.n).to(self.device)
        self.critic = TwinedDiscreteConvQNetwork(
            self.env.observation_space.shape[0],
            self.env.action_space.n).to(self.device)
        self.critic_target = TwinedDiscreteConvQNetwork(
            self.env.observation_space.shape[0],
            self.env.action_space.n).to(self.device).eval()

        # start the target critic as an exact copy of the online critic
        hard_update(self.critic_target, self.critic)
        self.policy_optim = Adam(self.policy.parameters(), lr=lr)
        self.q1_optim = Adam(self.critic.Q1.parameters(), lr=lr)
        self.q2_optim = Adam(self.critic.Q2.parameters(), lr=lr)

        # SAC-Discrete heuristic: aim for 98% of the maximum policy entropy log|A|
        self.target_entropy = np.log(self.env.action_space.n) * 0.98
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha = self.log_alpha.exp()
        self.alpha_optim = Adam([self.log_alpha], lr=lr)

        # publish the initial weights so that actor processes can start
        self.save_weights()
        if per:
            self.memory = DummyPrioritizedMemory(
                memory_size,
                self.env.observation_space.shape, (1, ),
                self.device,
                gamma,
                multi_step,
                alpha=alpha,
                beta=beta,
                beta_annealing=beta_annealing)
        else:
            self.memory = DummyMultiStepMemory(
                memory_size, self.env.observation_space.shape, (1, ),
                self.device, gamma, multi_step)

        self.log_dir = log_dir
        self.model_dir = os.path.join(log_dir, 'model')
        self.summary_dir = os.path.join(log_dir, 'summary', 'learner')
        if not os.path.exists(self.model_dir):
            os.makedirs(self.model_dir)
        if not os.path.exists(self.summary_dir):
            os.makedirs(self.summary_dir)
        self.writer = SummaryWriter(log_dir=self.summary_dir)

        self.steps = 0
        self.epochs = 0
        self.tau = tau
        self.per = per
        self.batch_size = batch_size
        self.start_steps = start_steps
        self.gamma_n = gamma**multi_step
        self.grad_clip = grad_clip
        self.update_per_steps = update_per_steps
        self.log_interval = log_interval
        self.memory_load_interval = memory_load_interval
        self.model_save_interval = model_save_interval
        self.target_update_interval = target_update_interval
        self.eval_interval = eval_interval
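
Example no. 1 only sets up the entropy-temperature machinery (target_entropy, log_alpha, alpha, alpha_optim); the update step itself is not part of the snippet. Purely as a sketch of how such a temperature update usually looks in SAC-Discrete, the methods below could sit on the same learner class; the entropies argument (per-state policy entropies, shape (batch, 1)) and the method names are assumptions, not code from the original repository.

    def calc_entropy_loss(self, entropies):
        # Raise alpha when the policy entropy falls below target_entropy,
        # lower it otherwise; only log_alpha receives gradients.
        entropy_loss = -torch.mean(
            self.log_alpha * (self.target_entropy - entropies).detach())
        return entropy_loss

    def update_alpha(self, entropies):
        entropy_loss = self.calc_entropy_loss(entropies)
        self.alpha_optim.zero_grad()
        entropy_loss.backward()
        self.alpha_optim.step()
        self.alpha = self.log_alpha.exp()
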
Example no. 2
    def __init__(self,
                 env,
                 log_dir,
                 shared_memory,
                 shared_weights,
                 actor_id,
                 num_actors=1,
                 memory_size=1e4,
                 gamma=0.99,
                 multi_step=3,
                 per=True,
                 alpha=0.6,
                 beta=0.4,
                 beta_annealing=0.001,
                 start_steps=10000,
                 log_interval=10,
                 memory_save_interval=5,
                 model_load_interval=5,
                 cuda=True,
                 seed=0):

        self.actor_id = actor_id
        self.env = env
        torch.manual_seed(seed)
        np.random.seed(seed)
        self.env.seed(seed)

        self.shared_memory = shared_memory
        self.shared_weights = shared_weights

        self.device = torch.device(
            "cuda" if cuda and torch.cuda.is_available() else "cpu")

        self.policy = ConvCategoricalPolicy(
            self.env.observation_space.shape[0],
            self.env.action_space.n).to(self.device).eval()
        self.critic = TwinedDiscreteConvQNetwork(
            self.env.observation_space.shape[0],
            self.env.action_space.n).to(self.device).eval()
        self.critic_target = TwinedDiscreteConvQNetwork(
            self.env.observation_space.shape[0],
            self.env.action_space.n).to(self.device).eval()
        hard_update(self.critic_target, self.critic)

        if per:
            self.memory = DummyPrioritizedMemory(
                memory_size,
                self.env.observation_space.shape, (1, ),
                self.device,
                gamma,
                multi_step,
                alpha=alpha,
                beta=beta,
                beta_annealing=beta_annealing)
        else:
            self.memory = DummyMultiStepMemory(
                memory_size, self.env.observation_space.shape, (1, ),
                self.device, gamma, multi_step)

        self.log_dir = log_dir
        self.summary_dir = os.path.join(log_dir, 'summary',
                                        f'actor-{self.actor_id}')
        if not os.path.exists(self.summary_dir):
            os.makedirs(self.summary_dir)
        self.writer = SummaryWriter(log_dir=self.summary_dir)

        self.episodes = 0
        self.steps = 0
        self.per = per
        self.multi_step = multi_step
        self.start_steps = start_steps
        self.gamma_n = gamma**multi_step
        self.log_interval = log_interval
        self.memory_save_interval = memory_save_interval
        self.model_load_interval = model_load_interval

        # block until the learner has published an initial set of weights
        load = False
        while not load:
            load = self.load_weights()
Example no. 3
class ApexLearner(ApexAgent):
    def __init__(self,
                 env,
                 log_dir,
                 shared_memory,
                 shared_weights,
                 batch_size=64,
                 lr=0.00025 / 4,
                 memory_size=4e5,
                 gamma=0.99,
                 multi_step=3,
                 alpha=0.4,
                 update_per_steps=32,
                 start_steps=1000,
                 beta=0.6,
                 beta_annealing=0.0,
                 grad_clip=5.0,
                 log_interval=10,
                 memory_load_interval=5,
                 model_save_interval=5,
                 target_update_interval=100,
                 eval_interval=1000,
                 cuda=True,
                 seed=0):
        self.env = env
        torch.manual_seed(seed)
        np.random.seed(seed)
        self.env.seed(seed)

        self.shared_memory = shared_memory
        self.shared_weights = shared_weights

        self.device = torch.device(
            "cuda" if cuda and torch.cuda.is_available() else "cpu")

        self.net = DiscreteConvQNetwork(self.env.observation_space.shape[0],
                                        self.env.action_space.n).to(
                                            self.device)
        self.target_net = DiscreteConvQNetwork(
            self.env.observation_space.shape[0],
            self.env.action_space.n).to(self.device)
        hard_update(self.target_net, self.net)

        self.optim = optim.Adam(self.net.parameters(), lr=lr)
        self.save_weights()

        self.memory = DummyPrioritizedMemory(memory_size,
                                             self.env.observation_space.shape,
                                             (1, ),
                                             self.device,
                                             gamma,
                                             multi_step,
                                             alpha=alpha,
                                             beta=beta,
                                             beta_annealing=beta_annealing)

        self.log_dir = log_dir
        self.model_dir = os.path.join(log_dir, 'model')
        self.summary_dir = os.path.join(log_dir, 'summary', 'learner')
        if not os.path.exists(self.model_dir):
            os.makedirs(self.model_dir)
        if not os.path.exists(self.summary_dir):
            os.makedirs(self.summary_dir)
        self.writer = SummaryWriter(log_dir=self.summary_dir)

        self.steps = 0
        self.epochs = 0
        self.batch_size = batch_size
        self.start_steps = start_steps
        self.gamma_n = gamma**multi_step
        self.update_per_steps = update_per_steps
        self.grad_clip = grad_clip
        self.log_interval = log_interval
        self.memory_load_interval = memory_load_interval
        self.model_save_interval = model_save_interval
        self.target_update_interval = target_update_interval
        self.eval_interval = eval_interval

    def run(self):
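        # keep pulling from the shared memory until more than start_steps transitions have arrived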
        while len(self.memory) <= self.start_steps:
            self.load_memory()

        self.time = time()
        while True:
            self.epochs += 1
            for _ in range(self.update_per_steps):
                self.steps += 1
                self.learn()
                self.interval()

    def learn(self):
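        # one prioritized-replay update: importance-weighted TD loss on a sampled batch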
        batch, indices, weights = \
            self.memory.sample(self.batch_size)

        curr_q = self.calc_current_q(*batch)
        target_q = self.calc_target_q(*batch)
        loss = torch.mean((curr_q - target_q).pow(2) * weights)

        update_params(self.optim, self.net, loss, self.grad_clip)

        # refresh the priorities of the sampled transitions with their new absolute TD errors
        errors = torch.abs(curr_q.detach() - target_q).cpu().numpy()
        self.memory.update_priority(indices, errors)

        if self.steps % self.log_interval == 0:
            self.writer.add_scalar("loss/learner",
                                   loss.detach().item(), self.steps)
            self.writer.add_scalar("stats/mean_Q",
                                   curr_q.detach().mean().item(), self.steps)

    def interval(self):
        if self.steps % self.eval_interval == 0:
            self.evaluate()
        if self.steps % self.memory_load_interval == 0:
            self.load_memory()
        if self.steps % self.model_save_interval == 0:
            self.save_weights()
            self.save_models()
        if self.steps % self.target_update_interval == 0:
            hard_update(self.target_net, self.net)

    def evaluate(self):
        episodes = 10
        returns = np.zeros((episodes, ), dtype=np.float32)
        action_bar = np.zeros(self.env.action_space.n, dtype=np.int64)

        for i in range(episodes):
            state = self.env.reset()
            episode_reward = 0.
            done = False
            while not done:
                action = self.exploit(state)
                next_state, reward, done, _ = self.env.step(action)
                action_bar[action] += 1
                episode_reward += reward
                state = next_state

            returns[i] = episode_reward

        mean_return = np.mean(returns)
        std_return = np.std(returns)

        self.writer.add_scalar('reward/test', mean_return, self.steps)
        now = time()
        print('Learner  '
              f'Num steps: {self.steps:<5} '
              f'reward: {mean_return:<5.1f} +/- {std_return:<5.1f}  '
              f'time: {now - self.time:<3.3f}')
        self.time = now

    def save_models(self):
        self.net.save(os.path.join(self.model_dir, 'net.pth'))
        self.target_net.save(os.path.join(self.model_dir, 'target_net.pth'))
Example no. 4
class ApexActor(ApexAgent):
    space_size = 65

    def __init__(self,
                 env,
                 log_dir,
                 shared_memory,
                 shared_weights,
                 actor_id,
                 num_actors,
                 memory_size=1e4,
                 gamma=0.99,
                 multi_step=3,
                 alpha=0.4,
                 beta=0.6,
                 beta_annealing=0.0,
                 log_interval=10,
                 memory_save_interval=5,
                 model_load_interval=5,
                 cuda=True,
                 seed=0):

        self.actor_id = actor_id
        self.env = env
        torch.manual_seed(seed)
        np.random.seed(seed)
        self.env.seed(seed)

        self.shared_memory = shared_memory
        self.shared_weights = shared_weights

        self.device = torch.device(
            "cuda" if cuda and torch.cuda.is_available() else "cpu")

        self.net = DiscreteConvQNetwork(self.env.observation_space.shape[0],
                                        self.env.action_space.n).to(
                                            self.device).eval()
        self.target_net = DiscreteConvQNetwork(
            self.env.observation_space.shape[0],
            self.env.action_space.n).to(self.device).eval()
        hard_update(self.target_net, self.net)

        self.memory = DummyPrioritizedMemory(memory_size,
                                             self.env.observation_space.shape,
                                             (1, ),
                                             self.device,
                                             gamma,
                                             multi_step,
                                             alpha=alpha,
                                             beta=beta,
                                             beta_annealing=beta_annealing)

        self.log_dir = log_dir
        self.summary_dir = os.path.join(log_dir, 'summary',
                                        f'actor-{self.actor_id}')
        if not os.path.exists(self.summary_dir):
            os.makedirs(self.summary_dir)
        self.writer = SummaryWriter(log_dir=self.summary_dir)

        # Ape-X exploration schedule: eps_i = 0.4 ** (1 + 7 * i / (N - 1)) for actor i of N,
        # so actor 0 explores the most and the last actor is nearly greedy
        if num_actors > 1:
            self.epsilon = 0.4**(1 + actor_id * 7 / (num_actors - 1))
        else:
            self.epsilon = 0.4

        self.episodes = 0
        self.steps = 0
        self.gamma_n = gamma**multi_step
        self.log_interval = log_interval
        self.memory_save_interval = memory_save_interval
        self.model_load_interval = model_load_interval

        # block until the learner has published an initial set of weights
        load = False
        while not load:
            load = self.load_weights()

    def run(self):
        self.time = time()
        while True:
            self.episodes += 1
            self.act_episode()
            self.interval()

    def act_episode(self):
        episode_reward = 0.
        episode_steps = 0
        done = False
        state = self.env.reset()

        while not done:
            action = self.act(state)
            next_state, reward, done, _ = self.env.step(action)
            self.steps += 1
            episode_steps += 1
            episode_reward += reward

            # ignore 'done' when hitting the time horizon
            if episode_steps >= self.env._max_episode_steps:
                masked_done = False
            else:
                masked_done = done

            batch = to_batch(state, action, reward, next_state, masked_done,
                             self.device)
            # compute the initial priority (absolute TD error) without building a graph
            with torch.no_grad():
                curr_q = self.calc_current_q(*batch)
                target_q = self.calc_target_q(*batch)
            error = torch.abs(curr_q - target_q).item()

            self.memory.append(state,
                               action,
                               reward,
                               next_state,
                               masked_done,
                               error,
                               episode_done=done)

            state = next_state

        if self.episodes % self.log_interval == 0:
            self.writer.add_scalar('reward/train', episode_reward, self.steps)
        now = time()
        print(
            ' ' * self.space_size, f'Actor {self.actor_id:<2}  '
            f'episode: {self.episodes:<4}  '
            f'episode steps: {episode_steps:<4}  '
            f'reward: {episode_reward:<5.1f}  '
            f'time: {now - self.time:3.3f}')
        self.time = now

    def interval(self):
        if self.episodes % self.model_load_interval == 0:
            self.load_weights()
        if self.episodes % self.memory_save_interval == 0:
            self.save_memory()
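
None of the snippets above show how the learner and actor processes are actually wired together; the constructors only receive already-built shared_memory and shared_weights objects. Below is a minimal sketch of one possible wiring with torch.multiprocessing, assuming the ApexLearner and ApexActor classes from the examples are importable and that manager-backed containers are an acceptable stand-in for whatever shared structures the original project uses; the environment id, log directory, and actor count are placeholders.

import gym
import torch.multiprocessing as mp


def run_learner(env_id, log_dir, shared_memory, shared_weights):
    # learner process: builds its own env and runs the update loop forever
    ApexLearner(gym.make(env_id), log_dir, shared_memory, shared_weights).run()


def run_actor(env_id, log_dir, shared_memory, shared_weights, actor_id, num_actors):
    # actor process: collects transitions and periodically pushes them to shared memory
    ApexActor(gym.make(env_id), log_dir, shared_memory, shared_weights,
              actor_id, num_actors).run()


if __name__ == '__main__':
    mp.set_start_method('spawn')
    manager = mp.Manager()
    shared_memory = manager.list()   # actors append transition batches, the learner drains them
    shared_weights = manager.dict()  # the learner publishes state_dicts, actors poll for them

    env_id, log_dir, num_actors = 'PongNoFrameskip-v4', 'logs/apex', 4
    processes = [mp.Process(target=run_learner,
                            args=(env_id, log_dir, shared_memory, shared_weights))]
    for i in range(num_actors):
        processes.append(mp.Process(
            target=run_actor,
            args=(env_id, log_dir, shared_memory, shared_weights, i, num_actors)))
    for p in processes:
        p.start()
    for p in processes:
        p.join()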