Example #1
    def __init__(self,
                 env_name,
                 n_env,
                 acmodel,
                 demo_loc,
                 version,
                 es_method=2,
                 update_frequency=10,
                 transfer_ratio=0.15,
                 random_walk_length=1,
                 curr_method='one',
                 num_frames_per_proc=None,
                 discount=0.99,
                 lr=7e-4,
                 beta1=0.9,
                 beta2=0.999,
                 gae_lambda=0.95,
                 entropy_coef=0.01,
                 value_loss_coef=0.5,
                 max_grad_norm=0.5,
                 recurrence=4,
                 adam_eps=1e-5,
                 clip_eps=0.2,
                 epochs=4,
                 batch_size=256,
                 preprocess_obss=None,
                 reshape_reward=None,
                 aux_info=None):

        self.n_env = n_env
        self.env_name = env_name
        self.transfer_ratio = transfer_ratio
        self.random_walk_length = random_walk_length
        self.version = version
        self.update_frequency = update_frequency
        self.es_method = es_method
        super().__init__([gym.make(env_name) for _ in range(n_env)], acmodel,
                         num_frames_per_proc, discount, lr, beta1, beta2,
                         gae_lambda, entropy_coef, value_loss_coef,
                         max_grad_norm, recurrence, adam_eps, clip_eps, epochs,
                         batch_size, preprocess_obss, reshape_reward, aux_info)

        if version == "v1":
            self.good_start_states = self.read_good_start_states(
                env_name, demo_loc)
        elif version == "v2" or version == "v3":
            self.read_good_start_states_v2(env_name, demo_loc, curr_method)
        self.env = None
        self.env = RCParallelEnv(self.env_name, self.n_env, demo_loc,
                                 curr_method)
        self.obs = self.env.reset()

        self.update = 0
        self.curr_update = 1
        self.log_history = []
        self.es_max = -1
        self.es_pat = 0
        self.curr_done = False
        self.curr_really_done = False
Example #2
class BaseAlgo(ABC):
    """The base class for RL algorithms."""

    def __init__(self, envs, acmodel, num_frames_per_proc, discount, lr, gae_lambda, entropy_coef,
                 value_loss_coef, max_grad_norm, recurrence, preprocess_obss, reshape_reward, aux_info):
        """
        Initializes a `BaseAlgo` instance.

        Parameters:
        ----------
        envs : list
            a list of environments that will be run in parallel
        acmodel : torch.Module
            the model
        num_frames_per_proc : int
            the number of frames collected by every process for an update
        discount : float
            the discount for future rewards
        lr : float
            the learning rate for optimizers
        gae_lambda : float
            the lambda coefficient in the GAE formula
            ([Schulman et al., 2015](https://arxiv.org/abs/1506.02438))
        entropy_coef : float
            the weight of the entropy cost in the final objective
        value_loss_coef : float
            the weight of the value loss in the final objective
        max_grad_norm : float
            gradient will be clipped to be at most this value
        recurrence : int
            the number of steps the gradient is propagated back in time
        preprocess_obss : function
            a function that takes observations returned by the environment
            and converts them into the format that the model can handle
        reshape_reward : function
            a function that shapes the reward, takes an
            (observation, action, reward, done) tuple as an input
        aux_info : list
            a list of strings corresponding to the name of the extra information
            retrieved from the environment for supervised auxiliary losses

        """
        # Store parameters

        self.env = ParallelEnv(envs)
        self.acmodel = acmodel
        self.acmodel.train()
        self.num_frames_per_proc = num_frames_per_proc
        self.discount = discount
        self.lr = lr
        self.gae_lambda = gae_lambda
        self.entropy_coef = entropy_coef
        self.value_loss_coef = value_loss_coef
        self.max_grad_norm = max_grad_norm
        self.recurrence = recurrence
        self.preprocess_obss = preprocess_obss or default_preprocess_obss
        self.reshape_reward = reshape_reward
        self.aux_info = aux_info
        # Store helper values

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.num_procs = len(envs)
        self.num_frames = self.num_frames_per_proc * self.num_procs


        assert self.num_frames_per_proc % self.recurrence == 0
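        # The rollout length must be divisible by `recurrence` so the collected
        # frames can be split into equal-length sub-sequences for truncated
        # backpropagation through time.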

        # Initialize experience values

        shape = (self.num_frames_per_proc, self.num_procs)

        self.obs = self.env.reset()
        self.obss = [None]*(shape[0])

        self.memory = torch.zeros(shape[1], self.acmodel.memory_size, device=self.device)
        self.memories = torch.zeros(*shape, self.acmodel.memory_size, device=self.device)

        self.mask = torch.ones(shape[1], device=self.device)
        self.masks = torch.zeros(*shape, device=self.device)
        self.actions = torch.zeros(*shape, device=self.device, dtype=torch.int)
        self.values = torch.zeros(*shape, device=self.device)
        self.rewards = torch.zeros(*shape, device=self.device)
        self.advantages = torch.zeros(*shape, device=self.device)
        self.log_probs = torch.zeros(*shape, device=self.device)

        if self.aux_info:
            self.aux_info_collector = ExtraInfoCollector(self.aux_info, shape, self.device)

        # Initialize log values

        self.log_episode_return = torch.zeros(self.num_procs, device=self.device)
        self.log_episode_reshaped_return = torch.zeros(self.num_procs, device=self.device)
        self.log_episode_num_frames = torch.zeros(self.num_procs, device=self.device)

        self.log_done_counter = 0
        self.log_return = [0] * self.num_procs
        self.log_reshaped_return = [0] * self.num_procs
        self.log_num_frames = [0] * self.num_procs
        self.found_true = False

    def collect_experiences(self):
        """Collects rollouts and computes advantages.

        Runs several environments concurrently. The next actions are computed
        in a batch mode for all environments at the same time. The rollouts
        and advantages from all environments are concatenated together.

        Returns
        -------
        exps : DictList
            Contains actions, rewards, advantages, etc. as attributes.
            Each attribute, e.g. `exps.reward`, has a shape
            (self.num_frames_per_proc * num_envs, ...). The k-th block
            of consecutive `self.num_frames_per_proc` frames contains
            data obtained from the k-th environment. Be careful not to mix
            data from different environments!
        logs : dict
            Useful stats about the training process, including the average
            reward, policy loss, value loss, etc.

        """
        for i in range(self.num_frames_per_proc):
            # Do one agent-environment interaction
            preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)

            with torch.no_grad():
                model_results = self.acmodel(preprocessed_obs, self.memory * self.mask.unsqueeze(1))
                dist = model_results['dist']
                value = model_results['value']
                memory = model_results['memory']
                extra_predictions = model_results['extra_predictions']


            action = dist.sample()
            obs, reward, done, env_info = self.env.step(action.cpu().numpy())
            if self.aux_info:
                env_info = self.aux_info_collector.process(env_info)
                # env_info = self.process_aux_info(env_info)

            # Update experiences values

            self.obss[i] = self.obs
            self.obs = obs

            self.memories[i] = self.memory
            self.memory = memory

            self.masks[i] = self.mask


            self.mask = 1 - torch.tensor(done, device=self.device, dtype=torch.float)
            self.actions[i] = action
            self.values[i] = value
            if self.reshape_reward is not None:
                self.rewards[i] = torch.tensor([
                    self.reshape_reward(obs_, action_, reward_, done_)
                    for obs_, action_, reward_, done_ in zip(obs, action, reward, done)
                ], device=self.device)

            else:
                self.rewards[i] = torch.tensor(reward, device=self.device)
            self.log_probs[i] = dist.log_prob(action)

            if self.aux_info:
                self.aux_info_collector.fill_dictionaries(i, env_info, extra_predictions)

            # Update log values

            self.log_episode_return += torch.tensor(reward, device=self.device, dtype=torch.float)
            self.log_episode_reshaped_return += self.rewards[i]
            self.log_episode_num_frames += torch.ones(self.num_procs, device=self.device)

            for j, done_ in enumerate(done):
                if done_:
                    self.log_done_counter += 1
                    self.log_return.append(self.log_episode_return[j].item())
                    self.log_reshaped_return.append(self.log_episode_reshaped_return[j].item())
                    self.log_num_frames.append(self.log_episode_num_frames[j].item())

            self.log_episode_return *= self.mask
            self.log_episode_reshaped_return *= self.mask
            self.log_episode_num_frames *= self.mask

        # Add advantage and return to experiences

        preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
        with torch.no_grad():
            next_value = self.acmodel(preprocessed_obs, self.memory * self.mask.unsqueeze(1))['value']

        for i in reversed(range(self.num_frames_per_proc)):
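            # Each step of this backward pass applies the GAE recursion:
            #   delta_t = r_t + discount * V_{t+1} * mask_{t+1} - V_t
            #   A_t     = delta_t + discount * gae_lambda * A_{t+1} * mask_{t+1}
            # mask_{t+1} is 0 at an episode boundary, so neither the bootstrap
            # value nor the advantage leaks across episodes.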
            next_mask = self.masks[i+1] if i < self.num_frames_per_proc - 1 else self.mask
            next_value = self.values[i+1] if i < self.num_frames_per_proc - 1 else next_value
            next_advantage = self.advantages[i+1] if i < self.num_frames_per_proc - 1 else 0

            delta = self.rewards[i] + self.discount * next_value * next_mask - self.values[i]
            self.advantages[i] = delta + self.discount * self.gae_lambda * next_advantage * next_mask

        # Flatten the data correctly, making sure that
        # each episode's data is a continuous chunk

        exps = DictList()
        exps.obs = [self.obss[i][j]
                    for j in range(self.num_procs)
                    for i in range(self.num_frames_per_proc)]
        # In the comments below, T is self.num_frames_per_proc, P is self.num_procs,
        # and D is the dimensionality

        # T x P x D -> P x T x D -> (P * T) x D
        exps.memory = self.memories.transpose(0, 1).reshape(-1, *self.memories.shape[2:])
        # T x P -> P x T -> (P * T) x 1
        exps.mask = self.masks.transpose(0, 1).reshape(-1).unsqueeze(1)

        # for all tensors below, T x P -> P x T -> P * T
        exps.action = self.actions.transpose(0, 1).reshape(-1)
        exps.value = self.values.transpose(0, 1).reshape(-1)
        exps.reward = self.rewards.transpose(0, 1).reshape(-1)
        exps.advantage = self.advantages.transpose(0, 1).reshape(-1)
        exps.returnn = exps.value + exps.advantage
        exps.log_prob = self.log_probs.transpose(0, 1).reshape(-1)
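        # After this flattening, the frames of environment k form one contiguous
        # chunk, e.g. exps.reward[k * self.num_frames_per_proc:(k + 1) * self.num_frames_per_proc],
        # which is why the docstring warns against mixing data across environments.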

        if self.aux_info:
            exps = self.aux_info_collector.end_collection(exps)

        # Preprocess experiences

        exps.obs = self.preprocess_obss(exps.obs, device=self.device)

        # Log some values

        keep = max(self.log_done_counter, self.num_procs)

        log = {
            "return_per_episode": self.log_return[-keep:],
            "reshaped_return_per_episode": self.log_reshaped_return[-keep:],
            "num_frames_per_episode": self.log_num_frames[-keep:],
            "num_frames": self.num_frames,
            "episodes_done": self.log_done_counter,
        }

        self.log_done_counter = 0
        self.log_return = self.log_return[-self.num_procs:]
        self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:]
        self.log_num_frames = self.log_num_frames[-self.num_procs:]

        return exps, log

    @abstractmethod
    def update_parameters(self):
        pass
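
For reference, here is a minimal sketch (not from the original source) of how a concrete subclass could consume `collect_experiences`. It assumes the model interface used above (a dict with 'dist' and 'value') and takes a single plain policy-gradient step; the real subclasses (e.g. a PPO implementation) add clipped objectives, multiple epochs and recurrence handling on top of this.

class SimplePGAlgo(BaseAlgo):
    """Hypothetical subclass: one policy-gradient step per collected rollout."""

    def __init__(self, envs, acmodel, *args, **kwargs):
        super().__init__(envs, acmodel, *args, **kwargs)
        self.optimizer = torch.optim.Adam(self.acmodel.parameters(), self.lr)

    def update_parameters(self):
        exps, logs = self.collect_experiences()

        # Forward pass over the flattened rollout; exps.obs was already
        # preprocessed and exps.memory / exps.mask broadcast correctly.
        model_results = self.acmodel(exps.obs, exps.memory * exps.mask)
        dist, value = model_results['dist'], model_results['value']

        entropy = dist.entropy().mean()
        policy_loss = -(dist.log_prob(exps.action) * exps.advantage).mean()
        value_loss = (value - exps.returnn).pow(2).mean()
        loss = policy_loss - self.entropy_coef * entropy + self.value_loss_coef * value_loss

        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.acmodel.parameters(), self.max_grad_norm)
        self.optimizer.step()

        logs.update({"entropy": entropy.item(), "policy_loss": policy_loss.item(),
                     "value_loss": value_loss.item(), "loss": loss.item()})
        return logs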
Example #3
    def __init__(self, envs, acmodel, num_frames_per_proc, discount, lr, gae_lambda, entropy_coef,
                 value_loss_coef, max_grad_norm, recurrence, preprocess_obss, reshape_reward, aux_info):
        """
        Initializes a `BaseAlgo` instance.

        Parameters:
        ----------
        envs : list
            a list of environments that will be run in parallel
        acmodel : torch.Module
            the model
        num_frames_per_proc : int
            the number of frames collected by every process for an update
        discount : float
            the discount for future rewards
        lr : float
            the learning rate for optimizers
        gae_lambda : float
            the lambda coefficient in the GAE formula
            ([Schulman et al., 2015](https://arxiv.org/abs/1506.02438))
        entropy_coef : float
            the weight of the entropy cost in the final objective
        value_loss_coef : float
            the weight of the value loss in the final objective
        max_grad_norm : float
            gradient will be clipped to be at most this value
        recurrence : int
            the number of steps the gradient is propagated back in time
        preprocess_obss : function
            a function that takes observations returned by the environment
            and converts them into the format that the model can handle
        reshape_reward : function
            a function that shapes the reward, takes an
            (observation, action, reward, done) tuple as an input
        aux_info : list
            a list of strings corresponding to the name of the extra information
            retrieved from the environment for supervised auxiliary losses

        """
        # Store parameters

        self.env = ParallelEnv(envs)
        self.acmodel = acmodel
        self.acmodel.train()
        self.num_frames_per_proc = num_frames_per_proc
        self.discount = discount
        self.lr = lr
        self.gae_lambda = gae_lambda
        self.entropy_coef = entropy_coef
        self.value_loss_coef = value_loss_coef
        self.max_grad_norm = max_grad_norm
        self.recurrence = recurrence
        self.preprocess_obss = preprocess_obss or default_preprocess_obss
        self.reshape_reward = reshape_reward
        self.aux_info = aux_info
        # Store helper values

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.num_procs = len(envs)
        self.num_frames = self.num_frames_per_proc * self.num_procs


        assert self.num_frames_per_proc % self.recurrence == 0

        # Initialize experience values

        shape = (self.num_frames_per_proc, self.num_procs)

        self.obs = self.env.reset()
        self.obss = [None]*(shape[0])

        self.memory = torch.zeros(shape[1], self.acmodel.memory_size, device=self.device)
        self.memories = torch.zeros(*shape, self.acmodel.memory_size, device=self.device)

        self.mask = torch.ones(shape[1], device=self.device)
        self.masks = torch.zeros(*shape, device=self.device)
        self.actions = torch.zeros(*shape, device=self.device, dtype=torch.int)
        self.values = torch.zeros(*shape, device=self.device)
        self.rewards = torch.zeros(*shape, device=self.device)
        self.advantages = torch.zeros(*shape, device=self.device)
        self.log_probs = torch.zeros(*shape, device=self.device)

        if self.aux_info:
            self.aux_info_collector = ExtraInfoCollector(self.aux_info, shape, self.device)

        # Initialize log values

        self.log_episode_return = torch.zeros(self.num_procs, device=self.device)
        self.log_episode_reshaped_return = torch.zeros(self.num_procs, device=self.device)
        self.log_episode_num_frames = torch.zeros(self.num_procs, device=self.device)

        self.log_done_counter = 0
        self.log_return = [0] * self.num_procs
        self.log_reshaped_return = [0] * self.num_procs
        self.log_num_frames = [0] * self.num_procs
        self.found_true = False
Example #4
class BaseAlgo(ABC):
    """The base class for RL algorithms."""
    def __init__(self, envs0, envs1, acmodel0, acmodel1, num_frames_per_proc,
                 discount, lr, gae_lambda, entropy_coef, value_loss_coef,
                 max_grad_norm, recurrence, preprocess_obss, reshape_reward,
                 aux_info):
        """
        Initializes a `BaseAlgo` instance.

        Parameters:
        ----------
        envs : list
            a list of environments that will be run in parallel
        acmodel : torch.Module
            the model
        num_frames_per_proc : int
            the number of frames collected by every process for an update
        discount : float
            the discount for future rewards
        lr : float
            the learning rate for optimizers
        gae_lambda : float
            the lambda coefficient in the GAE formula
            ([Schulman et al., 2015](https://arxiv.org/abs/1506.02438))
        entropy_coef : float
            the weight of the entropy cost in the final objective
        value_loss_coef : float
            the weight of the value loss in the final objective
        max_grad_norm : float
            gradient will be clipped to be at most this value
        recurrence : int
            the number of steps the gradient is propagated back in time
        preprocess_obss : function
            a function that takes observations returned by the environment
            and converts them into the format that the model can handle
        reshape_reward : function
            a function that shapes the reward, takes an
            (observation, action, reward, done) tuple as an input
        aux_info : list
            a list of strings corresponding to the name of the extra information
            retrieved from the environment for supervised auxiliary losses

        """
        # Store parameters

        self.env0 = ParallelEnv(envs0)
        self.acmodel0 = acmodel0
        self.acmodel0.train()

        self.env1 = ParallelEnv(envs1)
        self.acmodel1 = acmodel1
        self.acmodel1.train()

        self.num_frames_per_proc = num_frames_per_proc
        self.discount = discount
        self.lr = lr
        self.gae_lambda = gae_lambda
        self.entropy_coef = entropy_coef
        self.value_loss_coef = value_loss_coef
        self.max_grad_norm = max_grad_norm
        self.recurrence = recurrence
        self.preprocess_obss = preprocess_obss or default_preprocess_obss
        self.reshape_reward = reshape_reward
        self.aux_info = aux_info

        # Store helper values

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.num_procs = len(envs0)
        self.num_frames = self.num_frames_per_proc * self.num_procs

        assert self.num_frames_per_proc % self.recurrence == 0

        # Initialize experience values

        shape = (self.num_frames_per_proc, self.num_procs)

        self.obs0 = self.env0.reset()
        self.obss0 = [None] * (shape[0])

        self.obs1 = self.env1.reset()
        self.obss1 = [None] * (shape[0])

        self.memory0 = torch.zeros(shape[1],
                                   self.acmodel0.memory_size,
                                   device=self.device)
        self.memories0 = torch.zeros(*shape,
                                     self.acmodel0.memory_size,
                                     device=self.device)

        self.memory1 = torch.zeros(shape[1],
                                   self.acmodel1.memory_size,
                                   device=self.device)
        self.memories1 = torch.zeros(*shape,
                                     self.acmodel1.memory_size,
                                     device=self.device)

        self.msg0 = torch.zeros(self.acmodel0.max_len_msg,
                                shape[1],
                                self.acmodel0.num_symbols,
                                device=self.device)
        self.msgs0 = torch.zeros(shape[0],
                                 self.acmodel0.max_len_msg,
                                 shape[1],
                                 self.acmodel0.num_symbols,
                                 device=self.device)

        self.msg1 = torch.zeros(self.acmodel1.max_len_msg,
                                shape[1],
                                self.acmodel1.num_symbols,
                                device=self.device)
        self.msgs1 = torch.zeros(shape[0],
                                 self.acmodel1.max_len_msg,
                                 shape[1],
                                 self.acmodel1.num_symbols,
                                 device=self.device)

        self.msgs_out0 = torch.zeros(shape[0],
                                     self.acmodel0.max_len_msg,
                                     shape[1],
                                     self.acmodel0.num_symbols,
                                     device=self.device)

        self.msgs_out1 = torch.zeros(shape[0],
                                     self.acmodel1.max_len_msg,
                                     shape[1],
                                     self.acmodel1.num_symbols,
                                     device=self.device)

        #self.rng_states0 = torch.zeros(*shape, *torch.get_rng_state().shape, dtype=torch.uint8)
        #if torch.cuda.is_available():
        #    self.cuda_rng_states0 = torch.zeros(*shape, *torch.cuda.get_rng_state().shape, dtype=torch.uint8)

        #self.rng_states1 = torch.zeros(*shape, *torch.get_rng_state().shape, dtype=torch.uint8)
        #if torch.cuda.is_available():
        #    self.cuda_rng_states1 = torch.zeros(*shape, *torch.cuda.get_rng_state().shape, dtype=torch.uint8)

        self.mask0 = torch.ones(shape[1], device=self.device)
        self.masks0 = torch.zeros(*shape, device=self.device)
        self.actions0 = torch.zeros(*shape,
                                    device=self.device,
                                    dtype=torch.int)
        self.values0 = torch.zeros(*shape, device=self.device)
        self.rewards0 = torch.zeros(*shape, device=self.device)
        self.advantages0 = torch.zeros(*shape, device=self.device)
        self.log_probs0 = torch.zeros(*shape, device=self.device)
        self.speaker_log_probs0 = torch.zeros(*shape, device=self.device)

        self.mask1 = torch.ones(shape[1], device=self.device)
        self.masks1 = torch.zeros(*shape, device=self.device)
        self.actions1 = torch.zeros(*shape,
                                    device=self.device,
                                    dtype=torch.int)
        self.values1 = torch.zeros(*shape, device=self.device)
        self.rewards1 = torch.zeros(*shape, device=self.device)
        self.advantages1 = torch.zeros(*shape, device=self.device)
        self.log_probs1 = torch.zeros(*shape, device=self.device)
        self.speaker_log_probs1 = torch.zeros(*shape, device=self.device)

        if self.aux_info:
            self.aux_info_collector0 = ExtraInfoCollector(
                self.aux_info, shape, self.device)
            self.aux_info_collector1 = ExtraInfoCollector(
                self.aux_info, shape, self.device)

        # Initialize log values

        self.log_episode_return0 = torch.zeros(self.num_procs,
                                               device=self.device)
        self.log_episode_reshaped_return0 = torch.zeros(self.num_procs,
                                                        device=self.device)

        self.log_episode_return1 = torch.zeros(self.num_procs,
                                               device=self.device)
        self.log_episode_reshaped_return1 = torch.zeros(self.num_procs,
                                                        device=self.device)

        self.log_episode_num_frames0 = torch.zeros(self.num_procs,
                                                   device=self.device)
        self.log_episode_num_frames1 = torch.zeros(self.num_procs,
                                                   device=self.device)

        self.log_done_counter0 = 0
        self.log_return0 = [0] * self.num_procs
        self.log_reshaped_return0 = [0] * self.num_procs
        self.log_num_frames0 = [0] * self.num_procs

        self.log_done_counter1 = 0
        self.log_return1 = [0] * self.num_procs
        self.log_reshaped_return1 = [0] * self.num_procs
        self.log_num_frames1 = [0] * self.num_procs

        self.been_done0 = torch.zeros(self.num_procs, device=self.device)
        self.been_done1 = torch.zeros(self.num_procs, device=self.device)

    def collect_experiences(self):
        """Collects rollouts and computes advantages.

        Runs several environments concurrently. The next actions are computed
        in a batch mode for all environments at the same time. The rollouts
        and advantages from all environments are concatenated together.

        Returns
        -------
        exps : DictList
            Contains actions, rewards, advantages, etc. as attributes.
            Each attribute, e.g. `exps.reward`, has a shape
            (self.num_frames_per_proc * num_envs, ...). The k-th block
            of consecutive `self.num_frames_per_proc` frames contains
            data obtained from the k-th environment. Be careful not to mix
            data from different environments!
        logs : dict
            Useful stats about the training process, including the average
            reward, policy loss, value loss, etc.

        """
        for i in range(self.num_frames_per_proc):
            # Do one agent-environment interaction

            preprocessed_obs0 = self.preprocess_obss(self.obs0,
                                                     device=self.device)

            preprocessed_obs1 = self.preprocess_obss(self.obs1,
                                                     device=self.device)

            with torch.no_grad():

                model_results0 = self.acmodel0(
                    preprocessed_obs1,
                    self.memory0 * self.mask0.unsqueeze(1))  ### NOTE

                dist0 = model_results0['dist']  ### NOTE
                value0 = model_results0['value']
                memory0 = model_results0['memory']
                msg0 = model_results0['message']
                dists_speaker0 = model_results0['dists_speaker']
                extra_predictions0 = model_results0['extra_predictions']
                #self.rng_states0[i] = model_results0['rng_states']
                #if torch.cuda.is_available():
                #    self.cuda_rng_states0[i] = model_results0['cuda_rng_states']

                preprocessed_obs0.instr *= 0
                preprocessed_obs0.image *= 0
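                # Agent 1 (the receiver) therefore gets a zeroed-out observation:
                # its policy can rely only on its own memory and the message msg0
                # produced by agent 0 above.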
                model_results1 = self.acmodel1(
                    preprocessed_obs0,
                    self.memory1 * self.mask1.unsqueeze(1),
                    msg=(msg0.transpose(0, 1) *
                         self.mask1.unsqueeze(1).unsqueeze(2)).transpose(
                             0, 1))  ### NOTE

                dist1 = model_results1['dist']
                value1 = model_results1['value']
                memory1 = model_results1['memory']
                msg1 = model_results1['message']
                dists_speaker1 = model_results1['dists_speaker']
                extra_predictions1 = model_results1['extra_predictions']
                #self.rng_states1[i] = model_results1['rng_states']
                #if torch.cuda.is_available():
                #    self.cuda_rng_states1[i] = model_results1['cuda_rng_states']

            #state = torch.get_rng_state()
            action0 = dist0.sample()

            #torch.set_rng_state(state)
            action1 = dist1.sample()

            obs0, reward0, done0, env_info0 = self.env0.step(
                action0.cpu().numpy())

            obs1, reward1, done1, env_info1 = self.env1.step(
                action1.cpu().numpy())

            # mask any rewards based on (previous) been_done
            rewardos0 = [0] * self.num_procs
            rewardos1 = [0] * self.num_procs
            for j in range(self.num_procs):
                rewardos0[j] = reward0[j] * (1 - self.been_done0[j].item())
                rewardos1[j] = reward1[j] * (1 - self.been_done1[j].item())

            reward0 = tuple(rewardos0)
            reward1 = tuple(rewardos1)

            #reward0 = tuple(0.5*r0 + 0.5*r1 for r0, r1 in zip(reward0, reward1)) ### NOTE
            #reward1 = reward0

            # reward sender agent (0) equally for success of receiver agent (1) ### NOTE
            reward0 = reward1

            self.been_done0 = (1 - (1 - self.been_done0) * (1 - torch.tensor(
                done0, device=self.device, dtype=torch.float)))
            self.been_done1 = (1 - (1 - self.been_done1) * (1 - torch.tensor(
                done1, device=self.device, dtype=torch.float)))
            both_done = self.been_done0 * self.been_done1

            # reset if receiver agent (1) is done ### NOTE
            both_done = self.been_done1

            obs0 = self.env0.sync_reset(both_done, obs0)
            obs1 = self.env1.sync_reset(both_done, obs1)

            if self.aux_info:
                env_info0 = self.aux_info_collector0.process(env_info0)
                # env_info0 = self.process_aux_info0(env_info0)

                env_info1 = self.aux_info_collector1.process(env_info1)
                # env_info1 = self.process_aux_info1(env_info1)

            # Update experiences values

            self.obss0[i] = self.obs0
            self.obs0 = obs0

            self.obss1[i] = self.obs1
            self.obs1 = obs1

            self.memories0[i] = self.memory0
            self.memory0 = memory0

            self.memories1[i] = self.memory1
            self.memory1 = memory1

            self.msgs0[i] = self.msg0
            self.msg0 = msg0

            self.msgs1[i] = self.msg1
            self.msg1 = msg1

            self.msgs_out0[i] = msg0

            self.msgs_out1[i] = msg1

            self.masks0[i] = self.mask0
            #self.mask0       = 1 - torch.tensor(done0, device=self.device, dtype=torch.float)
            self.mask0 = 1 - both_done
            self.actions0[i] = action0
            self.values0[i] = value0
            if self.reshape_reward is not None:
                self.rewards0[i] = torch.tensor([
                    self.reshape_reward(obs_, action_, reward_, done_)
                    for obs_, action_, reward_, done_ in zip(
                        obs0, action0, reward0, done0)
                ],
                                                device=self.device)
            else:
                self.rewards0[i] = torch.tensor(reward0, device=self.device)
            self.log_probs0[i] = dist0.log_prob(action0)
            self.speaker_log_probs0[i] = self.acmodel0.speaker_log_prob(
                dists_speaker0, msg0)

            self.masks1[i] = self.mask1
            #self.mask1       = 1 - torch.tensor(done1, device=self.device, dtype=torch.float)
            self.mask1 = 1 - both_done
            self.actions1[i] = action1
            self.values1[i] = value1
            if self.reshape_reward is not None:
                self.rewards1[i] = torch.tensor([
                    self.reshape_reward(obs_, action_, reward_, done_)
                    for obs_, action_, reward_, done_ in zip(
                        obs1, action1, reward1, done1)
                ],
                                                device=self.device)
            else:
                self.rewards1[i] = torch.tensor(reward1, device=self.device)
            self.log_probs1[i] = dist1.log_prob(action1)
            self.speaker_log_probs1[i] = self.acmodel1.speaker_log_prob(
                dists_speaker1, msg1)

            if self.aux_info:
                self.aux_info_collector0.fill_dictionaries(
                    i, env_info0, extra_predictions0)

                self.aux_info_collector1.fill_dictionaries(
                    i, env_info1, extra_predictions1)

            # Update log values

            self.log_episode_return0 += torch.tensor(reward0,
                                                     device=self.device,
                                                     dtype=torch.float)
            self.log_episode_reshaped_return0 += self.rewards0[i]

            self.log_episode_return1 += torch.tensor(reward1,
                                                     device=self.device,
                                                     dtype=torch.float)
            self.log_episode_reshaped_return1 += self.rewards1[i]

            self.log_episode_num_frames0 += torch.ones(self.num_procs,
                                                       device=self.device)
            self.log_episode_num_frames1 += torch.ones(self.num_procs,
                                                       device=self.device)

            #for i, done_ in enumerate(done0):
            for j in range(self.num_procs):
                #if done_:
                if both_done[j]:
                    self.log_done_counter0 += 1
                    self.log_return0.append(self.log_episode_return0[j].item())
                    self.log_reshaped_return0.append(
                        self.log_episode_reshaped_return0[j].item())
                    self.log_num_frames0.append(
                        self.log_episode_num_frames0[j].item())

                    #for i, done_ in enumerate(done1):
                    #if done_:
                    self.log_done_counter1 += 1
                    self.log_return1.append(self.log_episode_return1[j].item())
                    self.log_reshaped_return1.append(
                        self.log_episode_reshaped_return1[j].item())
                    self.log_num_frames1.append(
                        self.log_episode_num_frames1[j].item())

            # if both are done, reset both to not done
            self.been_done0 *= (1 - both_done)
            self.been_done1 *= (1 - both_done)

            self.log_episode_return0 *= self.mask0
            self.log_episode_reshaped_return0 *= self.mask0
            self.log_episode_num_frames0 *= self.mask0

            self.log_episode_return1 *= self.mask1
            self.log_episode_reshaped_return1 *= self.mask1
            self.log_episode_num_frames1 *= self.mask1

        # Add advantage and return to experiences

        preprocessed_obs0 = self.preprocess_obss(self.obs0, device=self.device)
        preprocessed_obs1 = self.preprocess_obss(self.obs1, device=self.device)

        with torch.no_grad():
            tmp = self.acmodel0(preprocessed_obs1,
                                self.memory0 *
                                self.mask0.unsqueeze(1))  ### NOTE
            next_value0 = tmp['value']

            preprocessed_obs0.instr *= 0
            preprocessed_obs0.image *= 0
            next_value1 = self.acmodel1(
                preprocessed_obs0,
                self.memory1 * self.mask1.unsqueeze(1),
                msg=(tmp['message'].transpose(0, 1) *
                     self.mask1.unsqueeze(1).unsqueeze(2)).transpose(
                         0, 1))['value']  ### NOTE

        for i in reversed(range(self.num_frames_per_proc)):
            next_mask0 = self.masks0[
                i + 1] if i < self.num_frames_per_proc - 1 else self.mask0
            next_value0 = self.values0[
                i + 1] if i < self.num_frames_per_proc - 1 else next_value0
            next_advantage0 = self.advantages0[
                i + 1] if i < self.num_frames_per_proc - 1 else 0

            next_mask1 = self.masks1[
                i + 1] if i < self.num_frames_per_proc - 1 else self.mask1
            next_value1 = self.values1[
                i + 1] if i < self.num_frames_per_proc - 1 else next_value1
            next_advantage1 = self.advantages1[
                i + 1] if i < self.num_frames_per_proc - 1 else 0

            delta0 = self.rewards0[
                i] + self.discount * next_value0 * next_mask0 - self.values0[i]
            self.advantages0[
                i] = delta0 + self.discount * self.gae_lambda * next_advantage0 * next_mask0

            delta1 = self.rewards1[
                i] + self.discount * next_value1 * next_mask1 - self.values1[i]
            self.advantages1[
                i] = delta1 + self.discount * self.gae_lambda * next_advantage1 * next_mask1

        # Flatten the data correctly, making sure that
        # each episode's data is a continuous chunk

        exps0 = DictList()
        exps0.obs = [
            self.obss0[i][j] for j in range(self.num_procs)
            for i in range(self.num_frames_per_proc)
        ]

        exps1 = DictList()
        exps1.obs = [
            self.obss1[i][j] for j in range(self.num_procs)
            for i in range(self.num_frames_per_proc)
        ]

        # In the comments below, T is self.num_frames_per_proc, P is self.num_procs,
        # and D is the dimensionality

        # T x P x D -> P x T x D -> (P * T) x D
        exps0.memory = self.memories0.transpose(0, 1).reshape(
            -1, *self.memories0.shape[2:])

        exps1.memory = self.memories1.transpose(0, 1).reshape(
            -1, *self.memories1.shape[2:])

        exps0.message = self.msgs0.transpose(1, 2).transpose(0, 1).reshape(
            -1, self.acmodel0.max_len_msg, self.acmodel0.num_symbols)

        exps1.message = self.msgs1.transpose(1, 2).transpose(0, 1).reshape(
            -1, self.acmodel1.max_len_msg, self.acmodel1.num_symbols)

        exps0.message_out = self.msgs_out0.transpose(1, 2).transpose(
            0, 1).reshape(-1, self.acmodel0.max_len_msg,
                          self.acmodel0.num_symbols)

        exps1.message_out = self.msgs_out1.transpose(1, 2).transpose(
            0, 1).reshape(-1, self.acmodel1.max_len_msg,
                          self.acmodel1.num_symbols)

        #exps0.rng_states = self.rng_states0.transpose(0, 1).reshape(-1, *self.rng_states0.shape[2:])
        #if torch.cuda.is_available():
        #    exps0.cuda_rng_states = self.cuda_rng_states0.transpose(0, 1).reshape(-1, *self.cuda_rng_states0.shape[2:])

        #exps1.rng_states = self.rng_states1.transpose(0, 1).reshape(-1, *self.rng_states1.shape[2:])
        #if torch.cuda.is_available():
        #    exps1.cuda_rng_states = self.cuda_rng_states1.transpose(0, 1).reshape(-1, *self.cuda_rng_states1.shape[2:])

        # T x P -> P x T -> (P * T) x 1
        exps0.mask = self.masks0.transpose(0, 1).reshape(-1).unsqueeze(1)

        exps1.mask = self.masks1.transpose(0, 1).reshape(-1).unsqueeze(1)

        # for all tensors below, T x P -> P x T -> P * T
        exps0.action = self.actions0.transpose(0, 1).reshape(-1)
        exps0.value = self.values0.transpose(0, 1).reshape(-1)
        exps0.reward = self.rewards0.transpose(0, 1).reshape(-1)
        exps0.advantage = self.advantages0.transpose(0, 1).reshape(-1)
        exps0.returnn = exps0.value + exps0.advantage
        exps0.log_prob = self.log_probs0.transpose(0, 1).reshape(-1)
        exps0.speaker_log_prob = self.speaker_log_probs0.transpose(
            0, 1).reshape(-1)

        exps1.action = self.actions1.transpose(0, 1).reshape(-1)
        exps1.value = self.values1.transpose(0, 1).reshape(-1)
        exps1.reward = self.rewards1.transpose(0, 1).reshape(-1)
        exps1.advantage = self.advantages1.transpose(0, 1).reshape(-1)
        exps1.returnn = exps1.value + exps1.advantage
        exps1.log_prob = self.log_probs1.transpose(0, 1).reshape(-1)
        exps1.speaker_log_prob = self.speaker_log_probs1.transpose(
            0, 1).reshape(-1)

        if self.aux_info:
            exps0 = self.aux_info_collector0.end_collection(exps0)

            exps1 = self.aux_info_collector1.end_collection(exps1)

        # Preprocess experiences

        exps0.obs = self.preprocess_obss(exps0.obs, device=self.device)

        exps1.obs = self.preprocess_obss(exps1.obs, device=self.device)

        # Log some values

        keep0 = max(self.log_done_counter0, self.num_procs)

        keep1 = max(self.log_done_counter1, self.num_procs)

        log0 = {
            "return_per_episode": self.log_return0[-keep0:],
            "reshaped_return_per_episode": self.log_reshaped_return0[-keep0:],
            "num_frames_per_episode": self.log_num_frames0[-keep0:],
            "num_frames": self.num_frames,
            "episodes_done": self.log_done_counter0,
        }

        log1 = {
            "return_per_episode": self.log_return1[-keep1:],
            "reshaped_return_per_episode": self.log_reshaped_return1[-keep1:],
            "num_frames_per_episode": self.log_num_frames1[-keep1:],
            "num_frames": self.num_frames,
            "episodes_done": self.log_done_counter1,
        }

        self.log_done_counter0 = 0
        self.log_return0 = self.log_return0[-self.num_procs:]
        self.log_reshaped_return0 = self.log_reshaped_return0[-self.num_procs:]
        self.log_num_frames0 = self.log_num_frames0[-self.num_procs:]

        self.log_done_counter1 = 0
        self.log_return1 = self.log_return1[-self.num_procs:]
        self.log_reshaped_return1 = self.log_reshaped_return1[-self.num_procs:]
        self.log_num_frames1 = self.log_num_frames1[-self.num_procs:]

        return exps0, log0, exps1, log1

    @abstractmethod
    def update_parameters(self):
        pass
Example #5
def train_model():
    # Create command line argument parser
    parser = init_argparser()
    opt = parser.parse_args()

    # Start logger first
    init_logging(opt.log_level)

    #  validate chosen options
    opt = validate_options(parser, opt)

    # Prepare logging and environment
    envs = []
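    # Give every parallel environment its own deterministic seed, offset from the
    # base seed, so the processes collect different episodes.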
    for i in range(opt.num_processes):
        env = gym.make(opt.env_name)
        env.seed(100 * opt.seed + i)
        envs.append(env)
    from babyai.rl.utils import ParallelEnv
    p_envs = ParallelEnv(envs)

    # Create model name
    model_name = get_model_name(opt)

    # Observation preprocessor
    obss_preprocessor = ObssPreprocessor(model_name,
                                         envs[0].observation_space,
                                         load_vocab_from=opt.vocab_file,
                                         segment_level=opt.segment_level)
    obss_preprocessor.vocab.save()

    def reshape_reward(_0, _1, reward, _2):
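        # BaseAlgo calls this hook with (obs, action, reward, done); only the raw
        # reward is used here, scaled by opt.reward_scale.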
        return opt.reward_scale * reward

    algo = 'ppo'
    if opt.resume:
        if opt.reasoning:
            if opt.diag_targets == 18:
                model = machine.util.RLCheckpoint.load_partial_model(
                    opt.load_checkpoint)
            else:
                model = machine.util.RLCheckpoint.load_partial_model(
                    opt.load_checkpoint,
                    diag_targets=opt.diag_targets,
                    drop_diag=opt.drop_diag)
            model.detach = opt.detach_hidden
        else:
            model = machine.util.RLCheckpoint.load_model(opt.load_checkpoint)
            model.train()
    else:
        if opt.reasoning:
            if opt.min_model:
                model = MinModel(obss_preprocessor.obs_space,
                                 envs[0].action_space, opt.image_dim,
                                 opt.memory_dim, opt.instr_dim)
            else:
                model = IACModel(obss_preprocessor.obs_space,
                                 envs[0].action_space,
                                 opt.image_dim,
                                 opt.memory_dim,
                                 opt.instr_dim,
                                 not opt.no_instr,
                                 opt.instr_arch,
                                 not opt.no_mem,
                                 opt.arch,
                                 opt.diag_targets,
                                 detach=opt.detach_hidden)

        else:
            model = ACModel(obss_preprocessor.obs_space, envs[0].action_space,
                            opt.image_dim, opt.memory_dim, opt.instr_dim,
                            not opt.no_instr, opt.instr_arch, not opt.no_mem,
                            opt.arch)
    model.train()
    if torch.cuda.is_available():
        model.cuda()

    trainer = ReinforcementTrainer(p_envs, opt, model, model_name,
                                   obss_preprocessor, reshape_reward, algo,
                                   opt.reasoning)

    # Start training
    trainer.train()
Example #6
    def update_parameters(self):
        logs = super().update_parameters()
        '''logs = {
            "entropy":0,"value":0,"policy_loss":0,"value_loss":0,"grad_norm":0,"loss":0,"return_per_episode": [0],"reshaped_return_per_episode": [0],"num_frames_per_episode": [0],"num_frames": 0,"episodes_done": 0
        }'''

        self.update += 1

        if self.version == "v1":
            if self.update % self.update_frequency == 0 and self.update // self.update_frequency < 15:
                self.good_start_states = self.update_good_start_states(
                    self.good_start_states, self.random_walk_length,
                    self.transfer_ratio)
                self.env.update_good_start_states()
                for state in self.good_start_states[-3:]:
                    s1 = copy.copy(state)
                    s1.render()
                    input()

        elif self.version == "v2":
            logger = logging.getLogger(__name__)
            if self.update % self.update_frequency == 0 and self.update // self.update_frequency < self.curriculum_length:
                """self.env.print()
                print(sum([state.count for state in self.env.good_start_states])/len(self.env.good_start_states))"""
                self.env.update_good_start_states()
                logger.info('Start state Update Number {}/{}'.format(
                    self.update // self.update_frequency,
                    self.curriculum_length))

            if self.update % self.update_frequency == 0 and self.update // self.update_frequency == self.curriculum_length:
                logger.info('Start State Updates Done')
                self.env = ParallelEnv(
                    [gym.make(self.env_name) for _ in range(self.n_env)])

        elif self.version == "v3":
            if self.update % self.update_frequency == 0 and not self.curr_really_done:
                success_rate = np.mean(
                    [1 if r > 0 else 0 for r in logs['return_per_episode']])
                self.log_history.append(success_rate)
                logger = logging.getLogger(__name__)

                min_delta = 0.025
                patience = 1
                if self.es_method == 1:
                    bound = 0.9
                elif self.es_method == 2:
                    bound = 0.7 + (self.curr_update /
                                   self.curriculum_length) * (0.99 - 0.7)
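                # es_method 1 uses a fixed success-rate threshold of 0.9, while
                # es_method 2 raises the threshold linearly from 0.7 towards 0.99
                # as the curriculum progresses.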

                if not self.curr_done:
                    #if self.early_stopping_check(patience+(self.curr_update),min_delta):
                    if self.early_stopping_check(self.es_method, bound):
                        self.curr_update += 1
                        self.log_history = []
                        self.curr_done = self.env.update_good_start_states()
                        logger.info('Start state Update Number {}'.format(
                            self.curr_update))

                else:
                    if self.early_stopping_check(self.es_method, bound):
                        self.curr_update += 1
                        self.log_history = []
                        logger.info('Start State Updates Done')

                        self.env = ParallelEnv([
                            gym.make(self.env_name) for _ in range(self.n_env)
                        ])
                        self.curr_really_done = True

        #self.obs = self.env.reset()

        return logs
Example #7
class RCPPOAlgo(PPOAlgo):
    """
    Applies reverse curriculum learning (https://arxiv.org/pdf/1707.05300.pdf)
    to Proximal Policy Optimization.
    """
    def __init__(self,
                 env_name,
                 n_env,
                 acmodel,
                 demo_loc,
                 version,
                 es_method=2,
                 update_frequency=10,
                 transfer_ratio=0.15,
                 random_walk_length=1,
                 curr_method='one',
                 num_frames_per_proc=None,
                 discount=0.99,
                 lr=7e-4,
                 beta1=0.9,
                 beta2=0.999,
                 gae_lambda=0.95,
                 entropy_coef=0.01,
                 value_loss_coef=0.5,
                 max_grad_norm=0.5,
                 recurrence=4,
                 adam_eps=1e-5,
                 clip_eps=0.2,
                 epochs=4,
                 batch_size=256,
                 preprocess_obss=None,
                 reshape_reward=None,
                 aux_info=None):

        self.n_env = n_env
        self.env_name = env_name
        self.transfer_ratio = transfer_ratio
        self.random_walk_length = random_walk_length
        self.version = version
        self.update_frequency = update_frequency
        self.es_method = es_method
        super().__init__([gym.make(env_name) for _ in range(n_env)], acmodel,
                         num_frames_per_proc, discount, lr, beta1, beta2,
                         gae_lambda, entropy_coef, value_loss_coef,
                         max_grad_norm, recurrence, adam_eps, clip_eps, epochs,
                         batch_size, preprocess_obss, reshape_reward, aux_info)

        if version == "v1":
            self.good_start_states = self.read_good_start_states(
                env_name, demo_loc)
        elif version == "v2" or version == "v3":
            self.read_good_start_states_v2(env_name, demo_loc, curr_method)
        self.env = None
        self.env = RCParallelEnv(self.env_name, self.n_env, demo_loc,
                                 curr_method)
        self.obs = self.env.reset()

        self.update = 0
        self.curr_update = 1
        self.log_history = []
        self.es_max = -1
        self.es_pat = 0
        self.curr_done = False
        self.curr_really_done = False

    def early_stopping_check(self, method, bound):
        '''
        if len(self.log_history) < patience:
            return False
        else:
            for i in range(patience-1):
                if self.log_history[-1-i]-self.log_history[-2-i] >= min_delta:
                    return False
            return True
        '''
        '''
        if len(self.log_history) ==0 :
            return False
        else:
            for i in range(patience):
                if self.log_history[-1-i] >= 0.9:
                    continue
                else:
                    return False
            return True
        '''
        return self.log_history[-1] >= bound
        '''
        if self.log_history[-1] - self.es_max > min_delta:
            self.es_max = self.log_history[-1]
            self.es_pat = 0
            self.best_weights = self.acmodel.state_dict()
            ans = False
            no = 0
        else:
            self.es_pat += 1
            if self.es_pat >= patience:
                self.es_max = -1
                self.es_pat = 0
                self.acmodel.load_state_dict(self.best_weights)
                ans = True
                no = 1
            else:
                ans = False
                no = 1
        #print(ans,no,self.es_pat,patience)
        return ans
        '''

    def update_parameters(self):
        logs = super().update_parameters()
        '''logs = {
            "entropy":0,"value":0,"policy_loss":0,"value_loss":0,"grad_norm":0,"loss":0,"return_per_episode": [0],"reshaped_return_per_episode": [0],"num_frames_per_episode": [0],"num_frames": 0,"episodes_done": 0
        }'''

        self.update += 1

        if self.version == "v1":
            if self.update % self.update_frequency == 0 and self.update // self.update_frequency < 15:
                self.good_start_states = self.update_good_start_states(
                    self.good_start_states, self.random_walk_length,
                    self.transfer_ratio)
                self.env.update_good_start_states()
                for state in self.good_start_states[-3:]:
                    s1 = copy.copy(state)
                    s1.render()
                    input()

        elif self.version == "v2":
            logger = logging.getLogger(__name__)
            if self.update % self.update_frequency == 0 and self.update // self.update_frequency < self.curriculum_length:
                """self.env.print()
                print(sum([state.count for state in self.env.good_start_states])/len(self.env.good_start_states))"""
                self.env.update_good_start_states()
                logger.info('Start state Update Number {}/{}'.format(
                    self.update // self.update_frequency,
                    self.curriculum_length))

            if self.update % self.update_frequency == 0 and self.update // self.update_frequency == self.curriculum_length:
                logger.info('Start State Updates Done')
                self.env = ParallelEnv(
                    [gym.make(self.env_name) for _ in range(self.n_env)])

        elif self.version == "v3":
            if self.update % self.update_frequency == 0 and not self.curr_really_done:
                success_rate = np.mean(
                    [1 if r > 0 else 0 for r in logs['return_per_episode']])
                self.log_history.append(success_rate)
                logger = logging.getLogger(__name__)

                min_delta = 0.025
                patience = 1
                if self.es_method == 1:
                    bound = 0.9
                elif self.es_method == 2:
                    bound = 0.7 + (self.curr_update /
                                   self.curriculum_length) * (0.99 - 0.7)

                if not self.curr_done:
                    #if self.early_stopping_check(patience+(self.curr_update),min_delta):
                    if self.early_stopping_check(self.es_method, bound):
                        self.curr_update += 1
                        self.log_history = []
                        self.curr_done = self.env.update_good_start_states()
                        logger.info('Start state Update Number {}'.format(
                            self.curr_update))

                else:
                    if self.early_stopping_check(self.es_method, bound):
                        self.curr_update += 1
                        self.log_history = []
                        logger.info('Start State Updates Done')

                        self.env = ParallelEnv([
                            gym.make(self.env_name) for _ in range(self.n_env)
                        ])
                        self.curr_really_done = True

        #self.obs = self.env.reset()

        return logs

    def update_good_start_states(self, good_start_states, random_walk_length,
                                 transfer_ratio):
        new_starts = []
        #new_starts.extend(copy.deepcopy(self.good_start_states))

        for state in good_start_states:
            s1 = state
            for i in range(random_walk_length):
                s1 = copy.deepcopy(s1)
                action = s1.action_space.sample()
                s1.step(action)
                s1.count += 1
                s1.step_count = 0
            new_starts.append(s1)
        # Alternative (disabled): batch the random walks through a ParallelEnv.
        # n_threads = self.n_env
        # n_threads = 64
        # for start in range(0, len(self.good_start_states), n_threads):
        #     end = min(start + n_threads, len(self.good_start_states))
        #     good_start_states = ParallelEnv(self.good_start_states[start:end])
        #     for i in range(n_explore):
        #         action = [good_start_states.action_space.sample() for _ in range(len(good_start_states.envs))]
        #         good_start_states.step(action)
        #         new_starts.extend(copy.deepcopy(good_start_states.envs))

        n_total = len(good_start_states)
        n_old = int(transfer_ratio * n_total)
        good_start_states = random.sample(good_start_states, n_old)
        good_start_states.extend(random.sample(new_starts, n_total - n_old))

        return good_start_states

    def read_good_start_states(self, env_name, demo_loc):
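        # Rebuild near-goal start states by replaying each demo's actions (all but
        # the last one) in a freshly seeded environment; at most 500 states are kept.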
        demos = babyai.utils.demos.load_demos(demo_loc)

        seed = 0
        start_states = []

        for i, demo in enumerate(demos):
            actions = demo[3]

            env = gym.make(env_name)

            babyai.utils.seed(seed)

            env.seed(seed + i)
            env.reset()
            for j in range(len(actions) - 1):
                _, _, done, _ = env.step(actions[j].value)
            env.step_count = 0
            env.count = 1
            start_states.append(env)

        return start_states[:500]

    def read_good_start_states_v2(self, env_name, demo_loc, curr_method):
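        # Derive the curriculum schedule from the demos: 'log' gives
        # floor(log2(max_len)) + 1 stages, otherwise ceil(max_len / combining_factor),
        # where max_len is the length of the longest demo minus one.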
        demos = babyai.utils.demos.load_demos(demo_loc)

        seed = 0
        max_len = max([len(demo[3]) for demo in demos]) - 1
        self.pos = 0
        if curr_method == 'log':
            self.curriculum_length = math.floor(math.log2(max_len)) + 1
        else:
            combining_factor = int(curr_method)
            self.curriculum_length = math.ceil(max_len / combining_factor)

        return
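        # NOTE: the early return above leaves the per-step start-state
        # precomputation below unreachable.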

        self.start_states = [[] for _ in range(max_len)]

        for i, demo in enumerate(demos):
            actions = demo[3]

            env = gym.make(env_name)
            env.seed(seed + i)
            env.reset()
            env.count = len(actions)

            n_steps = len(actions) - 1
            for j in range(max_len - 1, n_steps - 1, -1):
                self.start_states[j].append(copy.deepcopy(env))

            for j in range(n_steps):
                _, _, done, _ = env.step(actions[j].value)
                env.count -= 1
                env.step_count = 0
                self.start_states[n_steps - j - 1].append(copy.deepcopy(env))

    def update_good_start_states_v2(self):
        self.pos += 1
        new_starts = self.start_states[self.pos]

        n_total = len(self.good_start_states)
        n_old = int(self.transfer_ratio * n_total)
        good_start_states = random.sample(self.good_start_states, n_old)
        good_start_states.extend(random.sample(new_starts, n_total - n_old))

        return good_start_states
Example #8
0
class BaseAlgo(ABC):
    """The base class for RL algorithms."""

    def __init__(self, envs, acmodel, num_frames_per_proc, discount, lr, gae_lambda, entropy_coef,
                 value_loss_coef, max_grad_norm, recurrence, preprocess_obss, reshape_reward, aux_info, reward_fn):
        """
        Initializes a `BaseAlgo` instance.

        Parameters:
        ----------
        envs : list
            a list of environments that will be run in parallel
        acmodel : torch.Module
            the model
        num_frames_per_proc : int
            the number of frames collected by every process for an update
        discount : float
            the discount for future rewards
        lr : float
            the learning rate for optimizers
        gae_lambda : float
            the lambda coefficient in the GAE formula
            ([Schulman et al., 2015](https://arxiv.org/abs/1506.02438))
        entropy_coef : float
            the weight of the entropy cost in the final objective
        value_loss_coef : float
            the weight of the value loss in the final objective
        max_grad_norm : float
            gradient will be clipped to be at most this value
        recurrence : int
            the number of steps the gradient is propagated back in time
        preprocess_obss : function
            a function that takes observations returned by the environment
            and converts them into the format that the model can handle
        reshape_reward : function
            a function that shapes the reward, takes an
            (observation, action, reward, done) tuple as an input
        aux_info : list
            a list of strings corresponding to the name of the extra information
            retrieved from the environment for supervised auxiliary losses
        reward_fn: str
            [babyai, cpv, both] -- The reward function to use to train the RL agent. 
        """
        # Store parameters

        self.env = ParallelEnv(envs)
        self.acmodel = acmodel
        self.acmodel.train()
        self.num_frames_per_proc = num_frames_per_proc
        self.discount = discount
        self.lr = lr
        self.gae_lambda = gae_lambda
        self.entropy_coef = entropy_coef
        self.value_loss_coef = value_loss_coef
        self.max_grad_norm = max_grad_norm
        self.recurrence = recurrence
        self.preprocess_obss = preprocess_obss or default_preprocess_obss
        self.reshape_reward = reshape_reward
        self.aux_info = aux_info
        self.reward_fn = reward_fn

        # Store helpers values

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.num_procs = len(envs)
        self.num_frames = self.num_frames_per_proc * self.num_procs

        assert self.num_frames_per_proc % self.recurrence == 0
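        # The rollout is later split into sub-sequences of length `recurrence` for
        # truncated backpropagation through time, so it must divide evenly.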

        # Initialize experience values

        shape = (self.num_frames_per_proc, self.num_procs)

        self.obs = self.env.reset()
        self.obss = [None]*(shape[0])

        self.memory = torch.zeros(shape[1], self.acmodel.memory_size, device=self.device)
        self.memories = torch.zeros(*shape, self.acmodel.memory_size, device=self.device)

        self.mask = torch.ones(shape[1], device=self.device)
        self.masks = torch.zeros(*shape, device=self.device)
        self.actions = torch.zeros(*shape, device=self.device, dtype=torch.int)
        self.values = torch.zeros(*shape, device=self.device)
        self.rewards = torch.zeros(*shape, device=self.device)
        self.advantages = torch.zeros(*shape, device=self.device)
        self.log_probs = torch.zeros(*shape, device=self.device)

        if self.aux_info:
            self.aux_info_collector = ExtraInfoCollector(self.aux_info, shape, self.device)

        # Initialize log values

        self.log_episode_return = torch.zeros(self.num_procs, device=self.device)
        self.log_episode_reshaped_return = torch.zeros(self.num_procs, device=self.device)
        self.log_episode_num_frames = torch.zeros(self.num_procs, device=self.device)

        self.log_done_counter = 0
        self.log_return = [0] * self.num_procs
        self.log_reshaped_return = [0] * self.num_procs
        self.log_num_frames = [0] * self.num_procs

        # Store reward model
        if self.reward_fn == 'cpv' or self.reward_fn == 'both':
            self.reward_model = CPV(primed_model='babyai/rl/algos/models/cpv_model.pth')
    
            # Keep track of observations and mission so that we can compute cpv-based reward. 
            self.reset_cpv_buffer()

        self.all_rewards = [] # For calculating the std and mean of rewards

    def reset_cpv_buffer(self): 
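        # (Re)initialize the rolling buffer consumed by the CPV reward model:
        # per-step observations and missions, plus the previous reward per process.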
        
        self.cpv_buffer = {
                'obs': [],
                'mission': [],
                'prev_reward': numpy.zeros((self.num_procs,))
        }

    def collect_experiences(self):
        """Collects rollouts and computes advantages.

        Runs several environments concurrently. The next actions are computed
        in a batch mode for all environments at the same time. The rollouts
        and advantages from all environments are concatenated together.

        Returns
        -------
        exps : DictList
            Contains actions, rewards, advantages etc as attributes.
            Each attribute, e.g. `exps.reward` has a shape
            (self.num_frames_per_proc * num_envs, ...). k-th block
            of consecutive `self.num_frames_per_proc` frames contains
            data obtained from the k-th environment. Be careful not to mix
            data from different environments!
        logs : dict
            Useful stats about the training process, including the average
            reward, policy loss, value loss, etc.
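
        Example
        -------
        A minimal usage sketch (here `algo` stands for an instance of a concrete
        subclass of `BaseAlgo`)::

            exps, logs = algo.collect_experiences()
            returns = logs["return_per_episode"]
            mean_return = sum(returns) / max(len(returns), 1)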

        """

        # Reset cpv buffer if needed. 
        if self.reward_fn == 'cpv' or self.reward_fn == 'both':
            self.reset_cpv_buffer()

        start = time.time()

        for i in range(self.num_frames_per_proc):
            # Do one agent-environment interaction

            preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
            with torch.no_grad():
                model_results = self.acmodel(preprocessed_obs, self.memory * self.mask.unsqueeze(1))
                dist = model_results['dist']
                value = model_results['value']
                memory = model_results['memory']
                extra_predictions = model_results['extra_predictions']

            action = dist.sample()

            # Take a step in env and process reward if not using default reward function. 
            obs, old_reward, done, env_info = self.env.step(action.cpu().numpy())

            if self.reward_fn == 'cpv' or self.reward_fn == 'both': 

                reward = old_reward # TODO Do we even need this if-else block here anymore?
                # Disabled: per-step CPV reward computation (for 'cpv', rewards are
                # recomputed from the full trajectory after the rollout loop).
                # unnormalized_reward = self.reward_model.calculate_reward(self.cpv_buffer, self.obs)
                #
                # if self.aux_info:
                #     env_info = self.aux_info_collector.process(env_info)
                #     env_info = self.process_aux_info(env_info)
                #
                # std = numpy.std(self.all_rewards) if self.all_rewards != [] else numpy.std(unnormalized_reward)
                # mean = numpy.mean(self.all_rewards) if self.all_rewards != [] else numpy.mean(unnormalized_reward)
                # reward = numpy.clip([(r - mean) / std for r in unnormalized_reward], 0, 1)
                # self.all_rewards.extend(unnormalized_reward)
                # if len(self.all_rewards) > 1000:
                #     self.all_rewards[-1000:]

            elif self.reward_fn == 'babyai': 
                reward = old_reward

                if self.aux_info: 
                    env_info = self.aux_info_collector.process(env_info)
                    #env_info = self.process_aux_info(env_info)

            # Update experiences values

            self.obss[i] = self.obs
            self.obs = obs

            self.memories[i] = self.memory
            self.memory = memory

            self.masks[i] = self.mask
            self.mask = 1 - torch.tensor(done, device=self.device, dtype=torch.float)
            self.actions[i] = action
            self.values[i] = value
            if self.reshape_reward is not None:
                self.rewards[i] = torch.tensor([
                    self.reshape_reward(obs_, action_, reward_, done_)
                    for obs_, action_, reward_, done_ in zip(obs, action, reward, done)
                ], device=self.device)
            else:
                self.rewards[i] = torch.tensor(reward, device=self.device)
            self.log_probs[i] = dist.log_prob(action)

            if self.aux_info:
                self.aux_info_collector.fill_dictionaries(i, env_info, extra_predictions)

            # Update log values
            self.log_episode_return += torch.tensor(reward, device=self.device, dtype=torch.float)
            self.log_episode_reshaped_return += self.rewards[i]
            self.log_episode_num_frames += torch.ones(self.num_procs, device=self.device)

            for proc_idx, done_ in enumerate(done):
                if done_:
                    self.log_done_counter += 1
                    self.log_return.append(self.log_episode_return[proc_idx].item())
                    self.log_reshaped_return.append(self.log_episode_reshaped_return[proc_idx].item())
                    self.log_num_frames.append(self.log_episode_num_frames[proc_idx].item())

            self.log_episode_return *= self.mask
            self.log_episode_reshaped_return *= self.mask
            self.log_episode_num_frames *= self.mask

        # If CPV, recompute reward based on trajectory. 
        if self.reward_fn == 'cpv': 

            # Make single run through CPV model to compute all rewards at once. 
            self.rewards = self.reward_model.calculate_reward(self.obss).permute(1,0)

            # TODO normalize rewards? 
            std, mean = torch.std_mean(self.rewards, dim=1)
            std = std.view(-1, 1).expand_as(self.rewards)
            mean = mean.view(-1, 1).expand_as(self.rewards)
            self.rewards = torch.clamp((self.rewards - mean) / std, 0.0, 1.0)

        # Add advantage and return to experiences
        preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
        with torch.no_grad():
            next_value = self.acmodel(preprocessed_obs, self.memory * self.mask.unsqueeze(1))['value']
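        # Generalized Advantage Estimation (Schulman et al., 2015), computed
        # backwards over the rollout:
        #   delta_t = r_t + discount * V(s_{t+1}) * mask_t - V(s_t)
        #   A_t     = delta_t + discount * gae_lambda * A_{t+1} * mask_t
        # where mask_t zeroes the bootstrap term across episode boundaries.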

        for i in reversed(range(self.num_frames_per_proc)):
            next_mask = self.masks[i+1] if i < self.num_frames_per_proc - 1 else self.mask
            next_value = self.values[i+1] if i < self.num_frames_per_proc - 1 else next_value
            next_advantage = self.advantages[i+1] if i < self.num_frames_per_proc - 1 else 0

            # print("Reward")
            # print(self.rewards[i])
            # # print("Discount")
            # # print(self.discount[i])
            # print("Values")
            # print(self.values[i])

            delta = self.rewards[i] + self.discount * next_value * next_mask - self.values[i]

            self.advantages[i] = delta + self.discount * self.gae_lambda * next_advantage * next_mask

        # Flatten the data correctly, making sure that
        # each episode's data is a continuous chunk

        exps = DictList()
        exps.obs = [self.obss[i][j]
                    for j in range(self.num_procs)
                    for i in range(self.num_frames_per_proc)]
        # In the comments below, T is self.num_frames_per_proc, P is self.num_procs,
        # and D is the dimensionality.

        # T x P x D -> P x T x D -> (P * T) x D
        exps.memory = self.memories.transpose(0, 1).reshape(-1, *self.memories.shape[2:])
        # T x P -> P x T -> (P * T) x 1
        exps.mask = self.masks.transpose(0, 1).reshape(-1).unsqueeze(1)

        # for all tensors below, T x P -> P x T -> P * T
        exps.action = self.actions.transpose(0, 1).reshape(-1)
        exps.value = self.values.transpose(0, 1).reshape(-1)
        exps.reward = self.rewards.transpose(0, 1).reshape(-1)
        exps.advantage = self.advantages.transpose(0, 1).reshape(-1)
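        # `returnn` is the return target V(s_t) + A_t (the extra "n" avoids the
        # Python keyword `return`).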
        exps.returnn = exps.value + exps.advantage
        exps.log_prob = self.log_probs.transpose(0, 1).reshape(-1)

        if self.aux_info:
            exps = self.aux_info_collector.end_collection(exps)

        # Preprocess experiences

        exps.obs = self.preprocess_obss(exps.obs, device=self.device)

        # Log some values

        keep = max(self.log_done_counter, self.num_procs)
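        # Report at least one (possibly stale) episode per process so the
        # per-episode statistics below are well-defined even when few episodes
        # finished during this rollout.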

        log = {
            "return_per_episode": self.log_return[-keep:],
            "reshaped_return_per_episode": self.log_reshaped_return[-keep:],
            "num_frames_per_episode": self.log_num_frames[-keep:],
            "num_frames": self.num_frames,
            "episodes_done": self.log_done_counter,
        }

        self.log_done_counter = 0
        self.log_return = self.log_return[-self.num_procs:]
        self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:]
        self.log_num_frames = self.log_num_frames[-self.num_procs:]

        return exps, log

    @abstractmethod
    def update_parameters(self):
        pass