Example no. 1
                    default=10,
                    help="The number of worse episodes to show")
args = parser.parse_args()

# Set seed for all randomness sources

utils.seed(args.seed)

# Generate environment

envs = []
for i in range(args.procs):
    env = gym.make(args.env)
    env.seed(args.seed + 10000 * i)
    envs.append(env)
env = ParallelEnv(envs)

# Define agent

model_dir = utils.get_model_dir(args.model)
agent = utils.Agent(args.env, env.observation_space, model_dir, args.argmax,
                    args.procs)
print("CUDA available: {}\n".format(torch.cuda.is_available()))

# Initialize logs

logs = {"num_frames_per_episode": [], "return_per_episode": []}

# Run the agent

start_time = time.time()
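
The script breaks off after `start_time = time.time()`. A minimal sketch of the evaluation loop that might follow, modeled on the loop in Example no. 4 below; `args.episodes` is an assumed command-line flag from the truncated argparse block, and the agent is assumed to expose the same `get_actions`, `analyze_feedbacks`, and `device` members as the `utils.Agent` used there.

# Hedged sketch of the evaluation loop (assumes an `--episodes` flag).
obss = env.reset()

log_done_counter = 0
log_episode_return = torch.zeros(args.procs, device=agent.device)
log_episode_num_frames = torch.zeros(args.procs, device=agent.device)

while log_done_counter < args.episodes:
    actions = agent.get_actions(obss)
    obss, rewards, dones, _ = env.step(actions)
    agent.analyze_feedbacks(rewards, dones)

    log_episode_return += torch.tensor(rewards, device=agent.device,
                                       dtype=torch.float)
    log_episode_num_frames += torch.ones(args.procs, device=agent.device)

    for i, done in enumerate(dones):
        if done:
            log_done_counter += 1
            logs["return_per_episode"].append(log_episode_return[i].item())
            logs["num_frames_per_episode"].append(
                log_episode_num_frames[i].item())

    mask = 1 - torch.tensor(dones, device=agent.device, dtype=torch.float)
    log_episode_return *= mask
    log_episode_num_frames *= mask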
Example no. 2
    def __init__(self, envs, acmodel, num_frames_per_proc, discount,
                 gae_lambda, entropy_coef, value_loss_coef, max_grad_norm,
                 recurrence, preprocess_obss, reshape_reward):
        """
        Initializes a `BaseAlgo` instance.

        Parameters
        ----------
        envs : list
            a list of environments that will be run in parallel
        acmodel : torch.nn.Module
            the model
        num_frames_per_proc : int
            the number of frames collected by every process for an update
        discount : float
            the discount for future rewards
        gae_lambda : float
            the lambda coefficient in the GAE formula
            ([Schulman et al., 2015](https://arxiv.org/abs/1506.02438))
        entropy_coef : float
            the weight of the entropy cost in the final objective
        value_loss_coef : float
            the weight of the value loss in the final objective
        max_grad_norm : float
            gradient will be clipped to be at most this value
        recurrence : int
            the number of steps the gradient is propagated back in time
        preprocess_obss : function
            a function that takes observations returned by the environment
            and converts them into the format that the model can handle
        reshape_reward : function
            a function that shapes the reward, takes an
            (observation, action, reward, done) tuple as an input
        """

        # Store parameters

        # self.env = ParallelEnvChunks(envs)
        self.env = ParallelEnv(envs)
        self.acmodel = acmodel
        self.acmodel.train()
        self.num_frames_per_proc = num_frames_per_proc
        self.discount = discount
        self.gae_lambda = gae_lambda
        self.entropy_coef = entropy_coef
        self.value_loss_coef = value_loss_coef
        self.max_grad_norm = max_grad_norm
        self.recurrence = recurrence
        self.preprocess_obss = preprocess_obss or default_preprocess_obss
        self.reshape_reward = reshape_reward

        # Store helper values

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.num_procs = len(envs)
        self.num_frames = self.num_frames_per_proc * self.num_procs

        # Control parameters

        assert self.acmodel.recurrent or self.recurrence == 1
        assert self.num_frames_per_proc % self.recurrence == 0

        # Initialize experience values

        shape = (self.num_frames_per_proc, self.num_procs)

        self.obs = self.env.reset()
        self.obss = [None] * (shape[0])
        if self.acmodel.recurrent:
            self.memory = torch.zeros(shape[1],
                                      self.acmodel.memory_size,
                                      device=self.device)
            self.memories = torch.zeros(*shape,
                                        self.acmodel.memory_size,
                                        device=self.device)
        self.mask = torch.ones(shape[1], device=self.device)
        self.masks = torch.zeros(*shape, device=self.device)
        self.steer_actions = torch.zeros(*shape,
                                         device=self.device,
                                         dtype=torch.int)
        self.acc_actions = torch.zeros(*shape,
                                       device=self.device,
                                       dtype=torch.int)
        self.values = torch.zeros(*shape, device=self.device)
        self.rewards = torch.zeros(*shape, device=self.device)
        self.advantages = torch.zeros(*shape, device=self.device)
        self.steer_log_probs = torch.zeros(*shape, device=self.device)
        self.acc_log_probs = torch.zeros(*shape, device=self.device)

        # Initialize log values

        self.log_episode_return = torch.zeros(self.num_procs,
                                              device=self.device)
        self.log_episode_reshaped_return = torch.zeros(self.num_procs,
                                                       device=self.device)
        self.log_episode_num_frames = torch.zeros(self.num_procs,
                                                  device=self.device)

        self.log_done_counter = 0
        self.log_return = [0] * self.num_procs
        self.log_reshaped_return = [0] * self.num_procs
        self.log_num_frames = [0] * self.num_procs
Example no. 3
class BaseAlgoV0(ABC):
    """The base class for RL algorithms."""
    def __init__(self, envs, acmodel, num_frames_per_proc, discount,
                 gae_lambda, entropy_coef, value_loss_coef, max_grad_norm,
                 recurrence, preprocess_obss, reshape_reward):
        """
        Initializes a `BaseAlgo` instance.

        Parameters
        ----------
        envs : list
            a list of environments that will be run in parallel
        acmodel : torch.nn.Module
            the model
        num_frames_per_proc : int
            the number of frames collected by every process for an update
        discount : float
            the discount for future rewards
        gae_lambda : float
            the lambda coefficient in the GAE formula
            ([Schulman et al., 2015](https://arxiv.org/abs/1506.02438))
        entropy_coef : float
            the weight of the entropy cost in the final objective
        value_loss_coef : float
            the weight of the value loss in the final objective
        max_grad_norm : float
            gradient will be clipped to be at most this value
        recurrence : int
            the number of steps the gradient is propagated back in time
        preprocess_obss : function
            a function that takes observations returned by the environment
            and converts them into the format that the model can handle
        reshape_reward : function
            a function that shapes the reward, takes an
            (observation, action, reward, done) tuple as an input
        """

        # Store parameters

        # self.env = ParallelEnvChunks(envs)
        self.env = ParallelEnv(envs)
        self.acmodel = acmodel
        self.acmodel.train()
        self.num_frames_per_proc = num_frames_per_proc
        self.discount = discount
        self.gae_lambda = gae_lambda
        self.entropy_coef = entropy_coef
        self.value_loss_coef = value_loss_coef
        self.max_grad_norm = max_grad_norm
        self.recurrence = recurrence
        self.preprocess_obss = preprocess_obss or default_preprocess_obss
        self.reshape_reward = reshape_reward

        # Store helper values

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.num_procs = len(envs)
        self.num_frames = self.num_frames_per_proc * self.num_procs

        # Control parameters

        assert self.acmodel.recurrent or self.recurrence == 1
        assert self.num_frames_per_proc % self.recurrence == 0
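        # recurrence > 1 only makes sense for a recurrent model, and the
        # rollout length must split evenly into sub-sequences of `recurrence`
        # steps so gradients can be propagated back over fixed-length windows.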

        # Initialize experience values

        shape = (self.num_frames_per_proc, self.num_procs)

        self.obs = self.env.reset()
        self.obss = [None] * (shape[0])
        if self.acmodel.recurrent:
            self.memory = torch.zeros(shape[1],
                                      self.acmodel.memory_size,
                                      device=self.device)
            self.memories = torch.zeros(*shape,
                                        self.acmodel.memory_size,
                                        device=self.device)
        self.mask = torch.ones(shape[1], device=self.device)
        self.masks = torch.zeros(*shape, device=self.device)
        self.steer_actions = torch.zeros(*shape,
                                         device=self.device,
                                         dtype=torch.int)
        self.acc_actions = torch.zeros(*shape,
                                       device=self.device,
                                       dtype=torch.int)
        self.values = torch.zeros(*shape, device=self.device)
        self.rewards = torch.zeros(*shape, device=self.device)
        self.advantages = torch.zeros(*shape, device=self.device)
        self.steer_log_probs = torch.zeros(*shape, device=self.device)
        self.acc_log_probs = torch.zeros(*shape, device=self.device)

        # Initialize log values

        self.log_episode_return = torch.zeros(self.num_procs,
                                              device=self.device)
        self.log_episode_reshaped_return = torch.zeros(self.num_procs,
                                                       device=self.device)
        self.log_episode_num_frames = torch.zeros(self.num_procs,
                                                  device=self.device)

        self.log_done_counter = 0
        self.log_return = [0] * self.num_procs
        self.log_reshaped_return = [0] * self.num_procs
        self.log_num_frames = [0] * self.num_procs

    def collect_experiences(self):
        """Collects rollouts and computes advantages.

        Runs several environments concurrently. The next actions are computed
        in a batch mode for all environments at the same time. The rollouts
        and advantages from all environments are concatenated together.

        Returns
        -------
        exps : DictList
            Contains actions, rewards, advantages, etc. as attributes.
            Each attribute, e.g. `exps.reward`, has shape
            (self.num_frames_per_proc * self.num_procs, ...). The k-th block
            of `self.num_frames_per_proc` consecutive frames contains
            data obtained from the k-th environment. Be careful not to mix
            data from different environments!
        logs : dict
            Useful stats about the collected rollouts: per-episode returns
            (raw and reshaped), per-episode frame counts, and the total
            number of frames collected.
        """

        for i in range(self.num_frames_per_proc):
            # Do one agent-environment interaction

            preprocessed_obs = self.preprocess_obss(self.obs,
                                                    device=self.device)
            with torch.no_grad():
                if self.acmodel.recurrent:
                    dist, value, memory = self.acmodel(
                        preprocessed_obs, self.memory * self.mask.unsqueeze(1))
                else:
                    dist, value = self.acmodel(preprocessed_obs)

            steer_dist, acc_dist = dist
            steer_action = steer_dist.sample()
            acc_action = acc_dist.sample()

            actions = list(
                zip(steer_action.cpu().numpy(),
                    acc_action.cpu().numpy()))
            obs, reward, done, _ = self.env.step(actions)

            # Update experiences values

            self.obss[i] = self.obs
            self.obs = obs
            if self.acmodel.recurrent:
                self.memories[i] = self.memory
                self.memory = memory
            self.masks[i] = self.mask
            self.mask = 1 - torch.tensor(
                done, device=self.device, dtype=torch.float)
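            # mask is 0 for processes whose episode just ended; it resets the
            # recurrent memory on the next step, stops value bootstrapping
            # across episodes, and zeroes the per-episode log accumulators.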
            self.steer_actions[i] = steer_action
            self.acc_actions[i] = acc_action
            self.values[i] = value
            if self.reshape_reward is not None:
                # actions[p] holds the (steer, acc) pair taken by process p
                self.rewards[i] = torch.tensor([
                    self.reshape_reward(obs_, action_, reward_, done_)
                    for obs_, action_, reward_, done_ in zip(
                        obs, actions, reward, done)
                ], device=self.device)
            else:
                self.rewards[i] = torch.tensor(reward, device=self.device)
            self.steer_log_probs[i] = steer_dist.log_prob(steer_action)
            self.acc_log_probs[i] = acc_dist.log_prob(acc_action)

            # Update log values
            self.log_episode_return += torch.tensor(reward,
                                                    device=self.device,
                                                    dtype=torch.float)
            self.log_episode_reshaped_return += self.rewards[i]
            self.log_episode_num_frames += torch.ones(self.num_procs,
                                                      device=self.device)

            for j, done_ in enumerate(done):
                if done_:
                    self.log_done_counter += 1
                    self.log_return.append(self.log_episode_return[j].item())
                    self.log_reshaped_return.append(
                        self.log_episode_reshaped_return[j].item())
                    self.log_num_frames.append(
                        self.log_episode_num_frames[j].item())

            self.log_episode_return *= self.mask
            self.log_episode_reshaped_return *= self.mask
            self.log_episode_num_frames *= self.mask

        # Add advantage and return to experiences

        preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
        with torch.no_grad():
            if self.acmodel.recurrent:
                _, next_value, _ = self.acmodel(
                    preprocessed_obs, self.memory * self.mask.unsqueeze(1))
            else:
                _, next_value = self.acmodel(preprocessed_obs)

        for i in reversed(range(self.num_frames_per_proc)):
            next_mask = self.masks[
                i + 1] if i < self.num_frames_per_proc - 1 else self.mask
            next_value = self.values[
                i + 1] if i < self.num_frames_per_proc - 1 else next_value
            next_advantage = self.advantages[
                i + 1] if i < self.num_frames_per_proc - 1 else 0

            delta = self.rewards[
                i] + self.discount * next_value * next_mask - self.values[i]
            self.advantages[
                i] = delta + self.discount * self.gae_lambda * next_advantage * next_mask
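
        # The loop above is the GAE recursion of Schulman et al. (2015),
        # https://arxiv.org/abs/1506.02438:
        #   delta_t = r_t + discount * V_{t+1} * mask_{t+1} - V_t
        #   A_t     = delta_t + discount * gae_lambda * mask_{t+1} * A_{t+1}
        # mask_{t+1} is 0 where the episode ended at step t, so neither the
        # bootstrapped value nor the accumulated advantage crosses episodes.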

        # Define experiences:
        #   the whole experience is the concatenation of the experience
        #   of each process.
        # In comments below:
        #   - T is self.num_frames_per_proc,
        #   - P is self.num_procs,
        #   - D is the dimensionality.
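        # For example, with T = 3 and P = 2, entry (t, p) of a T x P tensor
        # lands at flat index p * T + t after the transpose(0, 1) + reshape(-1)
        # below: all 3 frames of process 0 come first, then those of process 1.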

        exps = DictList()
        exps.obs = [
            self.obss[i][j] for j in range(self.num_procs)
            for i in range(self.num_frames_per_proc)
        ]
        if self.acmodel.recurrent:
            # T x P x D -> P x T x D -> (P * T) x D
            exps.memory = self.memories.transpose(0, 1).reshape(
                -1, *self.memories.shape[2:])
            # T x P -> P x T -> (P * T) x 1
            exps.mask = self.masks.transpose(0, 1).reshape(-1).unsqueeze(1)
        # for all tensors below, T x P -> P x T -> P * T
        exps.steer_action = self.steer_actions.transpose(0, 1).reshape(-1)
        exps.acc_action = self.acc_actions.transpose(0, 1).reshape(-1)
        exps.value = self.values.transpose(0, 1).reshape(-1)
        exps.reward = self.rewards.transpose(0, 1).reshape(-1)
        exps.advantage = self.advantages.transpose(0, 1).reshape(-1)
        exps.returnn = exps.value + exps.advantage
        exps.steer_log_prob = self.steer_log_probs.transpose(0, 1).reshape(-1)
        exps.acc_log_prob = self.acc_log_probs.transpose(0, 1).reshape(-1)

        # Preprocess experiences

        exps.obs = self.preprocess_obss(exps.obs, device=self.device)

        # Log some values

        keep = max(self.log_done_counter, self.num_procs)

        log = {
            "return_per_episode": self.log_return[-keep:],
            "reshaped_return_per_episode": self.log_reshaped_return[-keep:],
            "num_frames_per_episode": self.log_num_frames[-keep:],
            "num_frames": self.num_frames
        }

        self.log_done_counter = 0
        self.log_return = self.log_return[-self.num_procs:]
        self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:]
        self.log_num_frames = self.log_num_frames[-self.num_procs:]

        return exps, log

    @abstractmethod
    def update_parameters(self):
        pass

    def evaluate(self):
        pass
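
To make the intended use of the abstract base class concrete, here is a hypothetical minimal subclass in the style of an advantage actor-critic update. It is a sketch under assumptions: the `lr` argument, the Adam optimizer, and the single non-recurrent gradient step are illustrative choices, not part of the original code.

class A2CAlgoSketch(BaseAlgoV0):
    """Hypothetical subclass: one gradient step per batch of rollouts."""

    def __init__(self, envs, acmodel, num_frames_per_proc, discount, lr,
                 gae_lambda, entropy_coef, value_loss_coef, max_grad_norm,
                 recurrence, preprocess_obss, reshape_reward):
        super().__init__(envs, acmodel, num_frames_per_proc, discount,
                         gae_lambda, entropy_coef, value_loss_coef,
                         max_grad_norm, recurrence, preprocess_obss,
                         reshape_reward)
        self.optimizer = torch.optim.Adam(self.acmodel.parameters(), lr)

    def update_parameters(self):
        exps, logs = self.collect_experiences()

        # Re-evaluate the policy and value on the collected observations
        # (non-recurrent case only, to keep the sketch short).
        dist, value = self.acmodel(exps.obs)
        steer_dist, acc_dist = dist

        log_prob = (steer_dist.log_prob(exps.steer_action) +
                    acc_dist.log_prob(exps.acc_action))
        entropy = steer_dist.entropy().mean() + acc_dist.entropy().mean()

        policy_loss = -(log_prob * exps.advantage).mean()
        value_loss = (value - exps.returnn).pow(2).mean()
        loss = (policy_loss - self.entropy_coef * entropy
                + self.value_loss_coef * value_loss)

        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.acmodel.parameters(),
                                       self.max_grad_norm)
        self.optimizer.step()

        logs.update({"policy_loss": policy_loss.item(),
                     "value_loss": value_loss.item(),
                     "entropy": entropy.item()})
        return logs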
Example no. 4
def start(model, seed, episodes, size):
    env_name = "MiniGrid-DoorKey-" + str(size) + "x" + str(size) + "-v0"
    utils.seed(seed)
    procs = 10
    argmax = False
    all_data = np.zeros(shape=(size, 8))
    print("Evaluating storage/" + model)
    for _wall in range(2, size - 2):

        # Generate environment
        envs = []

        for i in range(procs):
            env = gym.make(env_name)
            env.setWallID(_wall)
            envs.append(env)
        env = ParallelEnv(envs)

        # Define agent

        save_dir = utils.get_save_dir(model)
        agent = utils.Agent(save_dir, env.observation_space, argmax, procs)
        # print("CUDA available: {}\n".format(torch.cuda.is_available()))

        # Initialize logs

        logs = {"num_frames_per_episode": [], "return_per_episode": []}

        # Run the agent

        start_time = time.time()

        obss = env.reset()

        log_done_counter = 0
        log_episode_return = torch.zeros(procs, device=agent.device)
        log_episode_num_frames = torch.zeros(procs, device=agent.device)

        while log_done_counter < episodes:
            actions = agent.get_actions(obss)
            obss, rewards, dones, _ = env.step(actions)
            agent.analyze_feedbacks(rewards, dones)

            log_episode_return += torch.tensor(rewards,
                                               device=agent.device,
                                               dtype=torch.float)
            log_episode_num_frames += torch.ones(procs, device=agent.device)

            for i, done in enumerate(dones):
                if done:
                    log_done_counter += 1
                    logs["return_per_episode"].append(
                        log_episode_return[i].item())
                    logs["num_frames_per_episode"].append(
                        log_episode_num_frames[i].item())

            mask = 1 - torch.tensor(
                dones, device=agent.device, dtype=torch.float)
            log_episode_return *= mask
            log_episode_num_frames *= mask

        end_time = time.time()

        # Print logs

        num_frames = sum(logs["num_frames_per_episode"])
        fps = num_frames / (end_time - start_time)
        duration = int(end_time - start_time)
        return_per_episode = utils.synthesize(logs["return_per_episode"])
        num_frames_per_episode = utils.synthesize(
            logs["num_frames_per_episode"])

        print(
            "Wall {:3d} | F {:6.0f} | FPS {:4.0f} | D {:3d} | R:x̄σmM {:.2f} {:.2f} {:.2f} {:.2f} | F:x̄σmM {:6.1f} {:6.1f} {:6.1f} {:6.1f}"
            .format(_wall, num_frames, fps, duration,
                    *return_per_episode.values(),
                    *num_frames_per_episode.values()))

        all_data[_wall, 0] = return_per_episode["mean"]
        all_data[_wall, 1] = return_per_episode["std"]
        all_data[_wall, 2] = return_per_episode["min"]
        all_data[_wall, 3] = return_per_episode["max"]

        all_data[_wall, 4] = num_frames_per_episode["mean"]
        all_data[_wall, 5] = num_frames_per_episode["std"]
        all_data[_wall, 6] = num_frames_per_episode["min"]
        all_data[_wall, 7] = num_frames_per_episode["max"]

    return all_data
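
A hedged usage sketch follows: the model name, seed, and episode count are purely illustrative, and np.save is just one way to keep the per-wall statistics around for later plotting.

if __name__ == "__main__":
    # Evaluate an assumed stored model on the 8x8 DoorKey grid, 100 episodes
    # per wall position, then save the (size, 8) statistics array.
    data = start(model="DoorKey-8x8-a2c", seed=1, episodes=100, size=8)
    np.save("doorkey_8x8_wall_stats.npy", data)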