Example #1
    def __init__(self, env: str):
        self.device = torch.device("cuda")

        self.env = DeepmindHackWrapper(gym.make(env), NOOP_MAX)
        self.noop_index = self.env.unwrapped.get_action_meanings().index(
            "NOOP")
        self.n_actions = self.env.action_space.n
        self.img_shape = self.preprocess_frame(self.env.reset()).shape

        self.memory = ReplayBuffer(self.img_shape,
                                   REPLY_BUFFER_SIZE,
                                   discount_factor=gamma)

        self.game_steps = 0
        self.rand_fill()

        self.loader = torch.utils.data.DataLoader(self.memory,
                                                  batch_size=BATCH_SIZE,
                                                  pin_memory=True,
                                                  num_workers=0)
        self.net = AtariNet([4, *self.img_shape],
                            self.n_actions).to(self.device)
        self.loss = torch.nn.SmoothL1Loss().to(self.device)
        # self.optimizer = torch.optim.RMSprop(self.net.parameters(), lr=0.00025, eps=0.01, alpha=0.95, centered=True)
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=0.00025)

        self.loss_sum = 0
        self.loss_cnt = 0
        self.last_test_time = time.time()
        self.last_video_time = 0
        self.copy_network()

        self.prefetch_queue = Queue(maxsize=1)
        self.loader_thread = threading.Thread(target=self.loader_thread)
        self.loader_thread.start()
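
The constructor above hands batch loading to a background thread that pushes device-ready batches into a bounded queue. Purely as an illustration of that producer/consumer prefetching pattern (independent of the ReplayBuffer and DataLoader used here; the producer function and sleep are stand-ins), a minimal sketch:

import queue
import threading
import time

prefetch_queue = queue.Queue(maxsize=1)  # bounded: the producer blocks until the consumer catches up

def producer():
    batch_id = 0
    while True:
        time.sleep(0.01)              # stands in for "load and preprocess one batch"
        prefetch_queue.put(batch_id)
        batch_id += 1

threading.Thread(target=producer, daemon=True).start()

for _ in range(5):
    batch = prefetch_queue.get()      # the training loop only waits if the producer is slower
    print("training on batch", batch)
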
Example #2
    def __init__(self, *args, agent=None, target_agent=None, **kwargs):
        self.agent = agent
        self.target_agent = target_agent
        # hard update
        self.hard_update(self.target_agent, self.agent)
        self.replay_buffer = ReplayBuffer(buffer_size=int(kwargs['buffer_size']), minibatch_size=kwargs['minibatch_size'],
                                          seed=kwargs['seed'], device=kwargs['device'])

        self.__minibatch = kwargs['minibatch_size']

        self.actor_optim = torch.optim.Adam(self.agent.get_actor_parameters(), lr=kwargs['learning_rate'])
        self.critic_optim = torch.optim.Adam(self.agent.get_critic_parameters(), lr=kwargs['learning_rate'])

        self.__discount = kwargs['discount']
        self.__epsilon = kwargs['epsilon']
        self.__tau = kwargs['tau']
        return
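
These constructors rely on target networks that track the learned networks either by a one-off hard copy or by Polyak averaging after every learning step. A self-contained sketch of the two update rules, using throwaway torch.nn.Linear modules rather than the agent classes above:

import torch
import torch.nn as nn

source = nn.Linear(4, 2)
target = nn.Linear(4, 2)

def hard_update(target, source):
    # target <- source (exact copy, typically done once at initialization)
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(s.data)

def soft_update(target, source, tau):
    # target <- tau * source + (1 - tau) * target (done after every learning step)
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(tau * s.data + (1.0 - tau) * t.data)

hard_update(target, source)
soft_update(target, source, tau=1e-3)
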
Example #3
    def __init__(
        self,
        env,
        actor: Actor,
        critic: Critic,
        actor_target: Actor,
        critic_target: Critic,
        gamma: float,
        minibatch_size: int,
        device: torch.device,
        max_episodes: int,
        tau: int,
        actor_lr: float,
        critic_lr: float,
        weight_decay: float,
        replay_buffer_size: int,
        models_path: str,
        runs_path: Optional[str],
    ):
        self.env = env
        self.actor = actor
        self.actor_target = actor_target
        self.critic = critic
        self.critic_target = critic_target
        self.gamma = gamma
        self.minibatch_size = minibatch_size
        self.device = device
        self.max_episodes = max_episodes
        self.tau = tau
        self.actor_optimizer = Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = Adam(
            self.critic.parameters(), lr=critic_lr, weight_decay=weight_decay,
        )
        self.critic_loss_fn = nn.MSELoss()
        self.replay_buffer = ReplayBuffer(replay_buffer_size)
        self.models_path = models_path
        self.min_act_value = env.action_space.low[0]
        self.max_act_value = env.action_space.high[0]
        self.writer = SummaryWriter(log_dir=runs_path)
        self.actor_target.eval()
        self.critic_target.eval()
        self.episode_i = 0
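
The min_act_value/max_act_value fields above are later used to keep exploratory actions inside the environment's action bounds. A small NumPy sketch of that clipping step (the bounds and the Gaussian noise here are illustrative; most examples on this page use Ornstein-Uhlenbeck noise instead):

import numpy as np

low, high = -2.0, 2.0           # assumed action bounds, e.g. a continuous-control task
action = np.array([1.7])        # deterministic policy output mu(s)
noise = np.random.normal(0.0, 0.2, size=action.shape)

noisy_action = np.clip(action + noise, low, high)
print(noisy_action)
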
    def __init__(self, id, state_size, action_size, config = Config()):
        """Initialize an Agent object.
        
        Params
        ======
            id (int): id used to identify the agent
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            config (Config): the agents configuration
        """
        self.state_size = state_size
        self.action_size = action_size
        self.id = id

        self.t_step = 0

        self.config = config

        random.seed(config.random_seed)

        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # Actor & Target Network 
        self.actor_local = Actor(state_size, action_size, config.random_seed, config.actor_hidden_units, config.use_bn).to(self.device)
        self.actor_target = Actor(state_size, action_size, config.random_seed, config.actor_hidden_units, config.use_bn).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=config.lr_actor)

        # Critic & Target Network
        self.critic_local = Critic(state_size, action_size, config.random_seed, config.critic_hidden_units, config.use_bn).to(self.device)
        self.critic_target = Critic(state_size, action_size, config.random_seed, config.critic_hidden_units, config.use_bn).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=config.lr_critic, weight_decay=config.weight_decay)

        # Noise process
        self.noise = OUNoise(action_size, config.random_seed, config.noise_mu, config.noise_theta, config.noise_sigma)
        
        # Replay memory
        if config.use_per:
            self.memory = NaivePrioritizedReplayBuffer(action_size, config.buffer_size, config.batch_size, config.random_seed, config.per_alpha,config.per_epsilon)
        else:
            self.memory = ReplayBuffer(action_size, config.buffer_size, config.batch_size, config.random_seed)
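
The OUNoise process constructed above is not shown on this page. A common minimal implementation of Ornstein-Uhlenbeck noise (a sketch only, not necessarily the exact class these examples import):

import numpy as np

class OUNoise:
    """Time-correlated noise: dx = theta * (mu - x) + sigma * N(0, 1)."""

    def __init__(self, size, seed=0, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        # restart the process at its mean
        self.state = self.mu.copy()

    def sample(self):
        dx = self.theta * (self.mu - self.state) + self.sigma * self.rng.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state

noise = OUNoise(size=2)
print([noise.sample() for _ in range(3)])
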
    def __init__(self, env, args):
        self.env = env
        self.memory_buffer = ReplayBuffer(args.buffer_size)
        self.learning_rate_actor = args.lr_actor
        self.learning_rate_critic = args.lr_critic
        self.tau = args.TAU
        self.batch_size = args.batch_size
        self.discount = args.discount
        self.states_ph = tf.placeholder(tf.float32, shape=(None, 1))
        self.actions_ph = tf.placeholder(tf.float32,
                                         shape=((None, ) +
                                                self.env.action_space.shape))
        self.is_training_ph = tf.placeholder_with_default(True, shape=None)
        self.Actor = ActorNetwork(env=self.env,
                                  states=self.states_ph,
                                  LR=self.learning_rate_actor,
                                  TAU=self.tau,
                                  discount=self.discount,
                                  scope="actor_main",
                                  batch_size=self.batch_size,
                                  is_training=self.is_training_ph)
        self.Critic = CriticNetwork(env=self.env,
                                    states=self.states_ph,
                                    actions=self.actions_ph,
                                    LR=self.learning_rate_critic,
                                    TAU=self.tau,
                                    discount=self.discount,
                                    scope="critic_main",
                                    batch_size=self.batch_size,
                                    is_training=self.is_training_ph)
        self.Actor_target = ActorNetwork(env=self.env,
                                         states=self.states_ph,
                                         LR=self.learning_rate_actor,
                                         TAU=self.tau,
                                         discount=self.discount,
                                         scope="actor_target",
                                         batch_size=self.batch_size,
                                         is_training=self.is_training_ph)
        self.Critic_target = CriticNetwork(env=self.env,
                                           states=self.states_ph,
                                           actions=self.actions_ph,
                                           LR=self.learning_rate_critic,
                                           TAU=self.tau,
                                           discount=self.discount,
                                           scope="critic_target",
                                           batch_size=self.batch_size,
                                           is_training=self.is_training_ph)
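
The TensorFlow 1.x example above separates main and target networks by variable scope ("actor_main" vs "actor_target"). Purely as an illustration of how a soft-update op can be built from those scopes (assuming the TF 1.x API and the scope names above; this is not taken from the ActorNetwork/CriticNetwork classes themselves):

import tensorflow as tf  # TensorFlow 1.x style API assumed

def make_soft_update_op(source_scope, target_scope, tau):
    source_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=source_scope)
    target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=target_scope)
    # one assign per variable pair: target <- tau * source + (1 - tau) * target
    updates = [t.assign(tau * s + (1.0 - tau) * t)
               for s, t in zip(source_vars, target_vars)]
    return tf.group(*updates)

# e.g. actor_soft_update = make_soft_update_op("actor_main", "actor_target", tau=0.001)
# and later in the training loop: sess.run(actor_soft_update)
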
Example #6
class DQN:
    grayscale_coeffs = np.asarray([0.11, 0.59, 0.3], dtype=np.float32)

    @staticmethod
    def preprocess_frame(frame: np.ndarray) -> np.ndarray:
        return (frame[::2, ::2].astype(np.float32) *
                DQN.grayscale_coeffs).sum(-1).astype(np.uint8)

    @staticmethod
    def frame_to_nn(frame: torch.Tensor) -> torch.Tensor:
        return frame.float() / 255.0

    def __init__(self, env: str):
        self.device = torch.device("cuda")

        self.env = DeepmindHackWrapper(gym.make(env), NOOP_MAX)
        self.noop_index = self.env.unwrapped.get_action_meanings().index(
            "NOOP")
        self.n_actions = self.env.action_space.n
        self.img_shape = self.preprocess_frame(self.env.reset()).shape

        self.memory = ReplayBuffer(self.img_shape,
                                   REPLY_BUFFER_SIZE,
                                   discount_factor=gamma)

        self.game_steps = 0
        self.rand_fill()

        self.loader = torch.utils.data.DataLoader(self.memory,
                                                  batch_size=BATCH_SIZE,
                                                  pin_memory=True,
                                                  num_workers=0)
        self.net = AtariNet([4, *self.img_shape],
                            self.n_actions).to(self.device)
        self.loss = torch.nn.SmoothL1Loss().to(self.device)
        # self.optimizer = torch.optim.RMSprop(self.net.parameters(), lr=0.00025, eps=0.01, alpha=0.95, centered=True)
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=0.00025)

        self.loss_sum = 0
        self.loss_cnt = 0
        self.last_test_time = time.time()
        self.last_video_time = 0
        self.copy_network()

        self.prefetch_queue = Queue(maxsize=1)
        self.loader_thread = threading.Thread(target=self.loader_thread)
        self.loader_thread.start()

    def loader_thread(self):
        while True:
            for d in self.loader:
                self.prefetch_queue.put(
                    {k: v.to(self.device)
                     for k, v in d.items()})

    def play(self,
             get_action: Callable[[int, List[np.ndarray]], int],
             train: bool,
             step_hook=lambda: None,
             maxlen=MAXLEN):
        total_reward = 0
        all_frames = []

        while True:
            observation = self.preprocess_frame(self.env.reset())
            all_frames += [observation] * 4

            for t in range(maxlen):
                action = get_action(t, all_frames[-4:])
                new_frame, reward, done, info = self.env.step(action)
                if train:
                    self.memory.add(observation, action, reward, done)
                    self.game_steps += 1

                observation = self.preprocess_frame(new_frame)
                all_frames.append(observation)
                total_reward += reward

                step_hook()
                if done:
                    break

            if train or self.env.was_done:
                break

        return total_reward, all_frames

    def render_video(self, all_frames: List[np.ndarray]) -> np.ndarray:
        return np.stack(all_frames, axis=0)[:, np.newaxis]

    def rand_fill(self):
        print("Filling the replay buffer with random data")
        while self.memory.count < PREFILL:
            print("Starting new episode. Data so far:", self.memory.count)
            _, frames = self.play(
                lambda i, observation: self.env.action_space.sample(), True)
        self.game_steps = 0
        print("Prefill completed.")

    def log_loss(self, loss: float):
        self.loss_sum += loss
        self.loss_cnt += 1
        if self.loss_cnt == 100:
            wandb.log({"loss": self.loss_sum / self.loss_cnt},
                      step=self.game_steps)
            self.loss_sum = 0
            self.loss_cnt = 0

    def copy_network(self):
        self.target_init_step = self.game_steps
        self.predictor = deepcopy(self.net)
        self.predictor.eval()

    def train_step(self):
        data = self.prefetch_queue.get()

        action = data["action"].long()
        frames = self.frame_to_nn(data["frames"])

        pred = self.net(frames[:, :-1])
        pred = pred.gather(index=action, dim=1)
        with torch.no_grad():
            # bootstrap from the frozen target network (self.predictor) created by copy_network()
            next_value, _ = self.predictor(frames[:, 1:]).max(-1, keepdim=True)

        target = gamma * next_value * (
            1.0 - data["is_done"].float()) + data["reward"]
        l = self.loss(pred, target)

        self.optimizer.zero_grad()
        l.backward()
        torch.nn.utils.clip_grad_norm_(self.net.parameters(), 1.0)
        self.optimizer.step()

        self.log_loss(l.item())
        if self.game_steps - self.target_init_step > TARGET_SWITCH * STEPS_PER_TRAIN:
            self.copy_network()

    def get_epsilon(self) -> float:
        e_start = 1.0
        e_end = 0.1
        n = 1000000.0

        return max(e_start - (e_start - e_end) / n * self.game_steps, e_end)

    def get_action(self,
                   iteration: int,
                   observations: List[np.ndarray],
                   train: bool = True) -> int:
        if train and (np.random.random() < self.get_epsilon()):
            return self.env.action_space.sample()
        else:
            with torch.no_grad():
                observation = np.stack(observations, axis=0)
                input = self.frame_to_nn(
                    torch.tensor(observation, device=self.device).unsqueeze(0))
                pred = self.net(input)
                _, amax = pred[0].max(-1)
                return amax.item()

    def train(self):
        while True:

            def do_train():
                if self.game_steps % STEPS_PER_TRAIN == 0:
                    self.train_step()

            log = {}
            log["epsilon"] = self.get_epsilon()
            log["train_reward"], frames = self.play(self.get_action,
                                                    train=True,
                                                    step_hook=do_train)
            print(
                f"Step {self.game_steps}: Episode completed in {len(frames)} steps. Reward: {log['train_reward']}. Epsilon: {log['epsilon']}"
            )
            frames = None

            now = time.time()
            if now - self.last_test_time > 60:
                log["test_reward"], frames = self.play(
                    lambda i, observation: self.get_action(
                        i, observation, train=False),
                    train=False)
                self.last_test_time = now
                print(
                    f"--> TEST: Step {self.game_steps}: Episode completed in {len(frames)} stpes. Reward: {log['test_reward']}"
                )

                if now - self.last_video_time > 10 * 60:
                    log["video"] = wandb.Video(self.render_video(
                        frames[-300:]),
                                               fps=10)
                    self.last_video_time = now

                frames = None

            wandb.log(log, step=self.game_steps)
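
train_step above implements the standard one-step Q-learning target y = r + gamma * (1 - done) * max_a Q_target(s', a), with gather selecting the Q-value of the action actually taken. A self-contained sketch of just that computation on random tensors (shapes are illustrative; no replay buffer or network copies involved):

import torch

batch, n_actions, gamma = 32, 4, 0.99

q_values = torch.randn(batch, n_actions)              # Q(s, .) from the online network
q_next = torch.randn(batch, n_actions)                # Q(s', .) from the target network
actions = torch.randint(0, n_actions, (batch, 1))
rewards = torch.randn(batch, 1)
dones = torch.randint(0, 2, (batch, 1)).float()

q_taken = q_values.gather(dim=1, index=actions)       # Q(s, a) for the stored action
with torch.no_grad():
    next_value, _ = q_next.max(dim=-1, keepdim=True)  # max_a Q(s', a)
    target = rewards + gamma * (1.0 - dones) * next_value

loss = torch.nn.functional.smooth_l1_loss(q_taken, target)
print(loss.item())
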
Example #7
class DDPG:
    def __init__(self, *args, agent=None, target_agent=None, **kwargs):
        self.agent = agent
        self.target_agent = target_agent
        # hard update
        self.hard_update(self.target_agent, self.agent)
        self.replay_buffer = ReplayBuffer(
            buffer_size=int(kwargs['buffer_size']),
            minibatch_size=kwargs['minibatch_size'],
            seed=kwargs['seed'],
            device=kwargs['device'])

        self.__minibatch = kwargs['minibatch_size']

        self.actor_optim = torch.optim.Adam(self.agent.get_actor_parameters(),
                                            lr=kwargs['actor_lr'])
        self.critic_optim = torch.optim.Adam(
            self.agent.get_critic_parameters(), lr=kwargs['critic_lr'])

        self.__discount = kwargs['discount']
        self.__tau = kwargs['tau']
        return

    def soft_update(self, target, source, tau):
        """
        Copies the parameters from source network (x) to target network (y) using the below update
        y = TAU*x + (1 - TAU)*y
        :param target: Target network (PyTorch)
        :param source: Source network (PyTorch)
        :return:
        """
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) +
                                    param.data * tau)

    def hard_update(self, target, source):
        """
        Copies the parameters from source network to target network
        :param target: Target network (PyTorch)
        :param source: Source network (PyTorch)
        :return:
        """
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)

    def train(self, env, num_episodes):
        """
        Train the agent to solve environment
        :param env: environment object (ReacherEnvironment)
        :param num_episodes: number of episodes (int)
        :return scores: list of scores for each episode (list)
        """
        noise_gen = OrnsteinUhlenbeckActionNoise(env.get_action_dim())
        noise_gen.reset()
        mean_score = []
        scores = []
        for episode in range(num_episodes):
            state = env.reset(train_mode=True)
            # roll out
            j = 0
            score = 0
            while True:
                # step
                action = self.agent.act(
                    torch.Tensor(state)).detach().cpu().numpy()
                noise = [
                    noise_gen.sample() for _ in range(env.get_num_agents())
                ]
                noised_action = action + noise
                noised_action = np.clip(noised_action, -1., 1.)
                next_state, reward, done = env.step(noised_action.squeeze())

                score += np.mean(reward)

                # add experience to replay buffer
                for i in range(action.shape[0]):
                    self.replay_buffer.add(state[i], action[i], reward[i],
                                           next_state[i], done[i])

                state = next_state

                if self.replay_buffer.size() < self.__minibatch:
                    continue

                # sample minibatch
                states, actions, rewards, next_states, dones = self.replay_buffer.sample(
                )
                # compute critic loss
                target_actions = self.target_agent.act(next_states)
                target_Q = rewards + self.__discount * self.target_agent.Q(
                    next_states, target_actions) * (1 - dones)
                Q = self.agent.Q(states, actions)
                critic_loss = (Q - target_Q).pow(2).mean()
                # update critic
                self.critic_optim.zero_grad()
                critic_loss.backward()
                self.critic_optim.step()

                # compute actor objective
                actor_actions = self.agent.act(states)
                Q = self.agent.Q(states, actor_actions)
                actor_objective = -Q.mean()
                # update actor
                self.actor_optim.zero_grad()
                actor_objective.backward()
                self.actor_optim.step()

                # soft update of target agent
                self.soft_update(self.target_agent, self.agent, self.__tau)

                if np.any(done):
                    break

            print("episode: {:d} | score: {:.4f}".format(episode, score))
            scores.append(score)
        return scores
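
Every example on this page imports a ReplayBuffer whose implementation is not shown. For reference, a minimal uniform replay buffer (a sketch only, not the class used above) can be built on a deque:

import random
from collections import deque

class SimpleReplayBuffer:
    def __init__(self, capacity, seed=0):
        self.buffer = deque(maxlen=capacity)  # oldest transitions are dropped automatically
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        # transpose a list of transitions into tuples of states, actions, rewards, ...
        return tuple(map(list, zip(*batch)))

    def __len__(self):
        return len(self.buffer)

buf = SimpleReplayBuffer(capacity=1000)
for i in range(10):
    buf.add(i, 0, 1.0, i + 1, False)
states, actions, rewards, next_states, dones = buf.sample(4)
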
Example #8
# Noise processes
noise_process1 = OUNoise(action_size,
                         random_seed,
                         mu=0.,
                         theta=0.15,
                         sigma=0.1)
noise_process2 = OUNoise(action_size,
                         random_seed,
                         mu=0.,
                         theta=0.15,
                         sigma=0.1)
noise_processes = [noise_process1, noise_process2]

# Replay buffer
replay_buffer = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed)

####################################################################################################

# train the Agent
max_episodes = 2000
target_score = 0.5

model_directory = 'resources/models/'
actor_checkpoint_file = model_directory + "checkpoint_actor.pth"
critic_checkpoint_file = model_directory + "checkpoint_critic.pth"
actor_model_file = model_directory + "model_actor.pt"
critic_model_file = model_directory + "model_critic.pt"

agent = SelfPlayAgent(actor_local, actor_target, critic_local, critic_target,
                      noise_processes, replay_buffer)
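
The checkpoint and model paths above suggest the usual PyTorch pattern: save state_dicts during training and, optionally, the whole pickled module at the end. A short sketch of that pattern (the nn.Linear module and local file names are stand-ins, not this project's code):

import torch
import torch.nn as nn

actor_local = nn.Linear(8, 2)  # stand-in for the Actor network above

# during training: save only the weights
torch.save(actor_local.state_dict(), "checkpoint_actor.pth")

# resuming: rebuild the module with the same architecture, then load the weights
restored = nn.Linear(8, 2)
restored.load_state_dict(torch.load("checkpoint_actor.pth"))

# at the end: optionally save the whole pickled module
torch.save(actor_local, "model_actor.pt")
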
Example #9
class DDPG:
    def __init__(
        self,
        env,
        actor: Actor,
        critic: Critic,
        actor_target: Actor,
        critic_target: Critic,
        gamma: float,
        minibatch_size: int,
        device: torch.device,
        max_episodes: int,
        tau: int,
        actor_lr: float,
        critic_lr: float,
        weight_decay: float,
        replay_buffer_size: int,
        models_path: str,
        runs_path: Optional[str],
    ):
        self.env = env
        self.actor = actor
        self.actor_target = actor_target
        self.critic = critic
        self.critic_target = critic_target
        self.gamma = gamma
        self.minibatch_size = minibatch_size
        self.device = device
        self.max_episodes = max_episodes
        self.tau = tau
        self.actor_optimizer = Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = Adam(
            self.critic.parameters(), lr=critic_lr, weight_decay=weight_decay,
        )
        self.critic_loss_fn = nn.MSELoss()
        self.replay_buffer = ReplayBuffer(replay_buffer_size)
        self.models_path = models_path
        self.min_act_value = env.action_space.low[0]
        self.max_act_value = env.action_space.high[0]
        self.writer = SummaryWriter(log_dir=runs_path)
        self.actor_target.eval()
        self.critic_target.eval()
        self.episode_i = 0

    def compute_expected_return_target(self, rewards, next_states, dones):
        """
        Compute the expected return obtained by evaluating
        the critic_target on the actor_target's policy.
        """
        with torch.no_grad():
            target_expectation = self.critic_target(
                next_states, self.actor_target(next_states)
            )
            expected_return_target = (
                rewards + (1 - dones) * self.gamma * target_expectation
            )
            return expected_return_target

    def update_critic(self, states, actions, next_states, rewards, dones):
        """Update the critic network by minimizing the difference
        with the target critic"""
        self.critic_optimizer.zero_grad()
        expected_return_target = self.compute_expected_return_target(
            rewards, next_states, dones
        )
        expected_return_pred = self.critic(states, actions)
        critic_loss = self.critic_loss_fn(
            expected_return_pred, expected_return_target
        )
        critic_loss.backward()
        self.critic_optimizer.step()
        return critic_loss

    def update_actor(self, states):
        """Update the actor by maximizing the expected return."""
        self.actor_optimizer.zero_grad()
        self.critic.eval()
        actor_loss = -self.critic(states, self.actor(states)).mean()
        self.critic.train()
        actor_loss.backward()
        self.actor_optimizer.step()
        return actor_loss

    def update_target_networks(self):
        """Soft update the target networks"""
        with torch.no_grad():
            for p_a, p_a_target in zip(
                self.actor.parameters(), self.actor_target.parameters()
            ):
                p_a_target.data.mul_((1.0 - self.tau))
                p_a_target.data.add_(self.tau * p_a.data)

            for p_c, p_c_target in zip(
                self.critic.parameters(), self.critic_target.parameters()
            ):
                p_c_target.data.mul_(1.0 - self.tau)
                p_c_target.data.add_(self.tau * p_c.data)

    def get_minibatch(self):
        """Return `minibatch_size` (state, action, next_state, reward, done)
        tuples as Tensors."""
        minibatch = self.replay_buffer.get(self.minibatch_size)
        states = torch.stack([mb.state for mb in minibatch])[:, -1, :]
        actions = [mb.action for mb in minibatch]
        actions = torch.tensor(actions, device=self.device)[:, -1, :]
        next_states = [mb.next_state for mb in minibatch]
        next_states = torch.stack(next_states)[:, -1, :]
        rewards = [[mb.reward] for mb in minibatch]
        rewards = torch.tensor(rewards, device=self.device)
        dones = [[int(mb.done)] for mb in minibatch]
        dones = torch.tensor(dones, device=self.device)
        return states, actions, next_states, rewards, dones

    def save_models(self):
        """Save the current models to disk."""
        torch.save(self.critic, self.models_path + "critic")
        torch.save(self.actor, self.models_path + "actor")
        torch.save(self.critic_target, self.models_path + "critic_target")
        torch.save(self.actor_target, self.models_path + "actor_target")

    def select_action(self, state, explore=True):
        self.actor.eval()
        with torch.no_grad():
            action = self.actor(state).to("cpu").data.numpy()
        self.actor.train()
        if explore:
            action += noise(self.env.action_space.shape)
            action = action.clip(self.min_act_value, self.max_act_value)
        return action

    def log(self, sum_of_actor_losses, sum_of_critic_losses, reward):
        if self.episode_i % 20 == 0:
            self.save_models()
        self.writer.add_scalar(
            "ActorLoss/train", sum_of_actor_losses, self.episode_i
        )
        self.writer.add_scalar(
            "CriticLoss/train", sum_of_critic_losses, self.episode_i
        )
        self.writer.add_scalar("Reward/train", reward, self.episode_i)

    def run_episode(self, explore=True):
        """Run a single episode in either exploration (explore=True)
        or exploitation (explore=False) mode."""
        self.episode_i += 1
        state = to_tensor_variable([self.env.reset()])
        t = 0
        done = False
        sum_of_actor_losses = 0
        sum_of_critic_losses = 0

        while not done:
            t += 1
            with torch.no_grad():
                action = self.select_action(state, explore)
            next_state, reward, done, _ = self.env.step(action[0])
            next_state = to_tensor_variable([next_state])
            self.replay_buffer.store(
                Transition(state, action, next_state, reward, done)
            )
            state = next_state

            if explore and self.replay_buffer.occupied > self.minibatch_size:
                (
                    states,
                    actions,
                    next_states,
                    rewards,
                    dones,
                ) = self.get_minibatch()
                sum_of_critic_losses += self.update_critic(
                    states, actions, next_states, rewards, dones
                )
                self.critic.eval()
                sum_of_actor_losses += self.update_actor(states)
                self.critic.train()
                self.update_target_networks()

            if done:
                self.log(sum_of_actor_losses, sum_of_critic_losses, reward)
                return reward

    def run_random_episodes(self, n_episodes):
        for _ in range(n_episodes):
            state = to_tensor_variable([self.env.reset()])
            done = False
            while not done:
                action = np.array([self.env.action_space.sample()])
                next_state, reward, done, _ = self.env.step(action[0])
                next_state = to_tensor_variable([next_state])
                self.replay_buffer.store(
                    Transition(state, action, next_state, reward, done)
                )

    def exploit(self, n_episodes):
        """Exploits for n_episodes"""
        rewards = [self.run_episode(explore=False) for _ in range(n_episodes)]
        return rewards

    def explore(self, n_episodes):
        """Explores for n_episodes"""
        for _ in range(n_episodes):
            self.run_episode(explore=True)

    def train(self):
        """Train the four networks"""
        self.run_random_episodes(100)
        count_exploit = 0
        count_explore = 0
        while self.episode_i < self.max_episodes:
            self.explore(50)
            count_explore += 50
            rewards = self.exploit(10)
            count_exploit += 10
            s = f"Average final reward after 10 exploitations and "
            s += f"{count_explore} explorations: {sum(rewards)/len(rewards)}\n"
            print(s)
            if sum(rewards) / len(rewards) > 180:
                break
        self.env.close()
        return
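
update_actor above applies the deterministic policy gradient: maximize Q(s, mu(s)) by minimizing -Q(s, mu(s)). A tiny standalone sketch with throwaway linear networks, showing that the gradient flows through the critic into the actor while only the actor's optimizer takes a step:

import torch
import torch.nn as nn

state_dim, action_dim = 3, 1
actor = nn.Linear(state_dim, action_dim)
critic = nn.Linear(state_dim + action_dim, 1)    # Q(s, a) over a concatenated input
actor_opt = torch.optim.Adam(actor.parameters(), lr=1e-3)

states = torch.randn(16, state_dim)
actions = actor(states)                          # mu(s), differentiable w.r.t. actor weights
q = critic(torch.cat([states, actions], dim=1))  # Q(s, mu(s))

actor_loss = -q.mean()                           # gradient ascent on Q via descent on -Q
actor_opt.zero_grad()
actor_loss.backward()
actor_opt.step()                                 # only the actor's parameters are updated here
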
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Actor Network (w/ Target Network)
actor_local = Actor(state_size, action_size, random_seed).to(device)
actor_target = Actor(state_size, action_size, random_seed).to(device)

# Critic Network (w/ Target Network)
critic_local = Critic(state_size + action_size, 1, random_seed).to(device)
critic_target = Critic(state_size + action_size, 1, random_seed).to(device)

# Noise process
noise_process = OUNoise(action_size, random_seed)

# Replay memory
memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed, device)

####################################################################################################

# train the Agent
max_episodes = 300
max_timesteps = 1000


def ddpg(n_episodes=max_episodes, n_timesteps=max_timesteps):
    """Deep Deterministic Policy Gradient.

    Args:
        n_episodes (int): maximum number of training episodes
        n_timesteps (int): maximum number of timesteps per episode
    """
class Agent():
    """Agent that interacts with and learns from the environment."""

    def __init__(self, id, state_size, action_size, config = Config()):
        """Initialize an Agent object.
        
        Params
        ======
            id (int): id used to identify the agent
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            config (Config): the agents configuration
        """
        self.state_size = state_size
        self.action_size = action_size
        self.id = id

        self.t_step = 0

        self.config = config

        random.seed(config.random_seed)

        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # Actor & Target Network 
        self.actor_local = Actor(state_size, action_size, config.random_seed, config.actor_hidden_units, config.use_bn).to(self.device)
        self.actor_target = Actor(state_size, action_size, config.random_seed, config.actor_hidden_units, config.use_bn).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=config.lr_actor)

        # Critic & Target Network
        self.critic_local = Critic(state_size, action_size, config.random_seed, config.critic_hidden_units, config.use_bn).to(self.device)
        self.critic_target = Critic(state_size, action_size, config.random_seed, config.critic_hidden_units, config.use_bn).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=config.lr_critic, weight_decay=config.weight_decay)

        # Noise process
        self.noise = OUNoise(action_size, config.random_seed, config.noise_mu, config.noise_theta, config.noise_sigma)
        
        # Replay memory
        if config.use_per:
            self.memory = NaivePrioritizedReplayBuffer(action_size, config.buffer_size, config.batch_size, config.random_seed, config.per_alpha,config.per_epsilon)
        else:
            self.memory = ReplayBuffer(action_size, config.buffer_size, config.batch_size, config.random_seed)
    
    def step(self, state, action, reward, next_state, done, beta=None):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every n time steps.
        self.t_step = (self.t_step + 1) % self.config.update_n_step
        if self.t_step != 0:
            return

        # If enough samples are available in memory, get random subset and learn
        if len(self.memory) > self.config.batch_size:
            if self.config.use_per:
                assert beta is not None
                experiences, weights = self.memory.sample(beta)
                states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(self.device)
                actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).float().to(self.device)
                rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(self.device)
                next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(self.device)
                dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(self.device)
                weights = torch.from_numpy(np.vstack(weights)).float().to(self.device)

                experiences = (states, actions, rewards, next_states, dones)
                self.learn(experiences, self.config.gamma, weights)
            else:
                experiences = self.memory.sample()

                states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(self.device)
                actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).float().to(self.device)
                rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(self.device)
                next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(self.device)
                dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(self.device)

                experiences = (states, actions, rewards, next_states, dones)
                self.learn(experiences, self.config.gamma)


    def act(self, state):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if self.config.add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, weights=None):
        """
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
            weights (array_like): list of weights for compensation the non-uniform sampling (used only
                                    with prioritized experience replay)
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)

        if self.config.use_per:
            td_error = Q_expected - Q_targets
            critic_loss = (td_error) ** 2
                
            critic_loss = critic_loss * weights
            critic_loss = critic_loss.mean()

            self.memory.update_priorities(np.hstack(td_error.detach().cpu().numpy()))

        else:
            critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # clip after backward() so the freshly computed gradients are the ones being clipped
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ------------------- update target networks ------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.config.tau)
        self.soft_update(self.actor_local, self.actor_target, self.config.tau)                    

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

    def getId(self):
        """ Return the ID of the agent """
        return self.id 

    def summary(self):
        """ Return a brief summary of the agent"""
        s = 'DDPG Agent {}:\n'.format(self.id)
        s += self.config.__str__()
        s += self.actor_local.__str__()
        s += self.critic_local.__str__()
        return s
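
The prioritized-replay branch in learn() above multiplies the squared TD error by importance-sampling weights and then feeds the TD errors back as new priorities. A small NumPy sketch of how those sampling probabilities and weights are typically derived (alpha/beta values are illustrative; this is not the NaivePrioritizedReplayBuffer used above):

import numpy as np

td_errors = np.array([0.5, 0.1, 2.0, 0.05])
alpha, beta, eps = 0.6, 0.4, 1e-5

priorities = (np.abs(td_errors) + eps) ** alpha   # p_i = (|delta_i| + eps)^alpha
probs = priorities / priorities.sum()             # P(i) = p_i / sum_k p_k

n = len(td_errors)
weights = (n * probs) ** (-beta)                  # w_i = (N * P(i))^-beta
weights = weights / weights.max()                 # normalize for stability

idx = np.random.choice(n, size=2, p=probs)        # sample transitions by priority
print(idx, weights[idx])
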
Example #12
#######################################
# Set up simulation
#######################################
env = get_environment(env_input)

explorer = EpsGreedy(num_actions=env_input['NUM_ACTIONS'],
                     eps=config.EPS_START,
                     eps_min=config.EPS_MIN,
                     decay=config.DECAY)

agent = TabularQFunction(state_size=env_input['STATE_SIZE'][0],
                         num_actions=env_input['NUM_ACTIONS'],
                         mu_init=config.Q_INIT,
                         std_init=config.Q_STD)

replay = ReplayBuffer()

for ep in range(config.NUM_EPISODES):
    s = env.reset()

    for t in range(config.NUM_STEPS):
        if (ep % config.RENDER_FREQUENCY == 0) and config.RENDER:
            env.render()

        a = agent.act(s)

        a = explorer.explore(a)

        ss, r, done, info = env.step(a)

        replay.add((s, a, r, ss, done))
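
The loop above only shows data collection into the replay buffer; the learning step of the tabular agent is not part of the snippet. For reference, the classic tabular Q-learning update such an agent applies is, as a toy sketch:

import numpy as np

n_states, n_actions = 5, 2
alpha, gamma = 0.1, 0.99
Q = np.zeros((n_states, n_actions))

# one transition (s, a, r, ss, done), e.g. drawn from the replay buffer
s, a, r, ss, done = 0, 1, 1.0, 2, False

target = r + (0.0 if done else gamma * Q[ss].max())
Q[s, a] += alpha * (target - Q[s, a])   # Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
print(Q[s, a])
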
Example #13
#######################################
if "EXPLORER" in agent_input and agent_input['EXPLORER'] == 'EPS_GREEDY':

    explorer = EpsGreedy(num_actions=env_input['NUM_ACTIONS'],
                         eps=config.EPS_START,
                         eps_min=config.EPS_MIN,
                         decay=config.DECAY)

else:
    explorer = None

learner_agent = get_agent(agent_input, env_input)
print(learner_agent.q)
parameter_server = ParameterServer(learner_agent)

replay = ReplayBuffer(max_size=config.REPLAY_SIZE)

case_id = str(np.random.randint(10000000))
print("Starting experiment {}".format(case_id))
for i in range(args.n_agents):
    id_ = str(i)

    env = get_environment(env_input)

    actor_agent = get_agent(agent_input, env_input)

    writer = EpisodeWriter(config.resultsDir,
                           env_name=env_input['ENV_NAME'],
                           agent_name=agent_input["TYPE"] + case_id,
                           id_=id_)
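
The EpsGreedy explorer used in the last two examples is configured with a start value, a minimum, and a decay factor, but its implementation is not shown here. A typical multiplicative decay schedule (a sketch under that assumption, with illustrative numbers):

import random

eps, eps_min, decay = 1.0, 0.05, 0.995
num_actions = 4

def explore(greedy_action):
    global eps
    # with probability eps pick a random action, otherwise keep the greedy one
    action = random.randrange(num_actions) if random.random() < eps else greedy_action
    eps = max(eps_min, eps * decay)   # anneal epsilon toward eps_min after every call
    return action

print([explore(0) for _ in range(10)])
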