Example #1
 def run(self):
     mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
     mb_states = self.states
     for n in range(self.nsteps):
         actions, values, states, _ = self.model.step(self.obs, self.states, self.dones)
         mb_obs.append(np.copy(self.obs))
         mb_actions.append(actions)
         mb_values.append(values)
         mb_dones.append(self.dones)
         obs, rewards, dones, _ = self.env.step(actions)
         if self.render:
             self.env.render()
         if self.record:
             frame = obs[0,:,:,:3]
             self.recording.append(frame)
         self.states = states
         self.dones = dones
         for n, done in enumerate(dones):
             if done:
                 self.obs[n] = self.obs[n]*0
         self.obs = obs
         mb_rewards.append(rewards)
     mb_dones.append(self.dones)
     #batch of steps to batch of rollouts
     mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
     mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
     mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
     mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
     mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
     mb_masks = mb_dones[:, :-1]
     mb_dones = mb_dones[:, 1:]
     last_values = self.model.value(self.obs, self.states, self.dones).tolist()
     #discount/bootstrap off value fn
     for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
         rewards = rewards.tolist()
         dones = dones.tolist()
         # Iterate rewards step-by-step and add to final scores if done
         for i in range(self.nsteps):
             # Add reward to episode reward
             self.episode_rewards[n] += rewards[i]
             if dones[i] == 1:
                 # Add final result to episode rewards
                 self.final_rewards.append(self.episode_rewards[n])
                 # Reset local episode reward
                 self.episode_rewards[n] = 0
                 #Save current game as a video
                 if self.record:
                     self.makevideo()
         # Discount rewards
         if dones[-1] == 0:
             rewards = discount_with_dones(rewards+[value], dones+[0], self.gamma)[:-1]
         else:
             rewards = discount_with_dones(rewards, dones, self.gamma)
         mb_rewards[n] = rewards
     mb_rewards = mb_rewards.flatten()
     mb_actions = mb_actions.flatten()
     mb_values = mb_values.flatten()
     mb_masks = mb_masks.flatten()
     return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
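These runners all call a helper named discount_with_dones(rewards, dones, gamma) (as in OpenAI Baselines' A2C utilities), which computes discounted n-step returns while resetting the running sum at episode boundaries. As a point of reference, a minimal sketch of that helper, matching the call pattern used in these examples (not necessarily the exact upstream source):

    def discount_with_dones(rewards, dones, gamma):
        # Walk the trajectory backwards; a done flag of 1 zeroes the running
        # return so rewards never leak across episode boundaries.
        discounted = []
        ret = 0.0
        for reward, done in zip(rewards[::-1], dones[::-1]):
            ret = reward + gamma * ret * (1.0 - done)
            discounted.append(ret)
        return discounted[::-1]

The recurring idiom discount_with_dones(rewards + [value], dones + [0], gamma)[:-1] appends the critic's estimate of the last observation as a bootstrap value when the rollout ends mid-episode, then drops that extra entry so only the returns for the collected steps remain.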
Example #2
    def run(self):
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
        mb_states = self.states
        for n in range(self.nsteps):
            actions, values, states, _ = self.model.step(self.obs,
                                                         S=self.states,
                                                         M=self.dones)
            mb_obs.append(np.copy(self.obs))
            mb_actions.append(actions)
            mb_values.append(values)
            mb_dones.append(self.dones)
            obs, rewards, dones, _ = self.env.step(actions)
            # TODO: surrogate reward
            self.states = states
            self.dones = dones
            for n, done in enumerate(dones):
                if done:
                    self.obs[n] = self.obs[n] * 0
            self.obs = obs
            mb_rewards.append(rewards)
        mb_dones.append(self.dones)
        #batch of steps to batch of rollouts

        mb_obs = np.asarray(mb_obs, dtype=self.ob_dtype).swapaxes(
            1, 0).reshape(self.batch_ob_shape)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_actions = np.asarray(
            mb_actions,
            dtype=self.model.train_model.action.dtype.name).swapaxes(1, 0)
        mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
        mb_masks = mb_dones[:, :-1]
        mb_dones = mb_dones[:, 1:]

        if self.gamma > 0.0:
            #discount/bootstrap off value fn
            last_values = self.model.value(self.obs,
                                           S=self.states,
                                           M=self.dones).tolist()
            for n, (rewards, dones,
                    value) in enumerate(zip(mb_rewards, mb_dones,
                                            last_values)):
                rewards = rewards.tolist()
                dones = dones.tolist()
                if dones[-1] == 0:
                    rewards = discount_with_dones(rewards + [value],
                                                  dones + [0], self.gamma)[:-1]
                else:
                    rewards = discount_with_dones(rewards, dones, self.gamma)

                mb_rewards[n] = rewards

        mb_actions = mb_actions.reshape(self.batch_action_shape)

        mb_rewards = mb_rewards.flatten()
        mb_values = mb_values.flatten()
        mb_masks = mb_masks.flatten()
        return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
Example #3
    def run( self ):
        # We initialize the lists that will contain the mb of experiences
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [], [], [], [], []
        mb_states   = self.states
        epinfos     = []

        for n in range(self.nsteps):
            # Given observations, take action and value (V(s))
            # We already have self.obs because Runner superclass run self.obs[:] = env.reset() on init
            actions, values, states, _ = self.model.step( self.obs, S=self.states, M=self.dones )

            # Append the experiences
            mb_obs.append(np.copy(self.obs))
            mb_actions.append(actions)
            mb_values.append(values)
            mb_dones.append(self.dones)

            # Take actions in env and look the results
            obs, rewards, dones, infos = self.env.step(actions)
            for info in infos:
                maybeepinfo = info.get('episode')
                if maybeepinfo: epinfos.append(maybeepinfo)
            self.states = states
            self.dones  = dones
            self.obs    = obs
            mb_rewards.append(rewards)

        mb_dones.append(self.dones)

        # Batch of steps to batch of rollouts
        mb_obs      = np.asarray(mb_obs, dtype=self.ob_dtype).swapaxes(1, 0).reshape(self.batch_ob_shape)
        mb_rewards  = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_actions  = np.asarray(mb_actions, dtype=self.model.train_model.action.dtype.name).swapaxes(1, 0)
        mb_values   = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
        mb_dones    = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
        mb_masks    = mb_dones[:, :-1]
        mb_dones    = mb_dones[:, 1:]


        if self.gamma > 0.0:
            # Discount/bootstrap off value fn
            last_values = self.model.value(self.obs, S=self.states, M=self.dones).tolist()
            for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
                rewards = rewards.tolist()
                dones = dones.tolist()
                if dones[-1] == 0:
                    rewards = discount_with_dones(rewards+[value], dones+[0], self.gamma)[:-1]
                else:
                    rewards = discount_with_dones(rewards, dones, self.gamma)

                mb_rewards[n] = rewards

        mb_actions  = mb_actions.reshape(self.batch_action_shape)

        mb_rewards  = mb_rewards.flatten()
        mb_values   = mb_values.flatten()
        mb_masks    = mb_masks.flatten()
        return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values, epinfos
Example #4
 def run(self):
     mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_raw_rewards = [], [], [], [], [], []
     mb_states = self.states
     for n in range(self.nsteps):
         actions, values, states, _ = self.model.step(
             self.obs, self.states, self.dones)
         mb_obs.append(np.copy(self.obs))
         mb_actions.append(actions)
         mb_values.append(values)
         mb_dones.append(self.dones)
         #print('actions:', actions)
         #obs_all, raw_rewards, dones, _ = self.env.step(actions)
         obs_all, raw_rewards, dones, _ = self.env.step(actions)
         obs = [obs_index['image'] for obs_index in obs_all]
         obs = np.asarray(obs)
         rewards = raw_rewards
         self.states = states
         self.dones = dones
         if hasattr(self.model, 'sil'):
             self.model.sil.step(self.obs, actions, raw_rewards, dones)
         for n, done in enumerate(dones):
             if done:
                 self.obs[n] = self.obs[n] * 0
         self.obs = obs
         mb_rewards.append(rewards)
         mb_raw_rewards.append(raw_rewards)
     mb_dones.append(self.dones)
     # batch of steps to batch of rollouts
     mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(
         1, 0).reshape(self.batch_ob_shape)
     mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
     mb_raw_rewards = np.asarray(
         mb_raw_rewards, dtype=np.float32).swapaxes(1, 0)
     mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
     mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
     mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
     mb_masks = mb_dones[:, :-1]
     mb_dones = mb_dones[:, 1:]
     last_values = self.model.value(
         self.obs, self.states, self.dones).tolist()
     # discount/bootstrap off value fn
     for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
         rewards = rewards.tolist()
         dones = dones.tolist()
         if dones[-1] == 0:
             rewards = discount_with_dones(
                 rewards + [value], dones + [0], self.gamma)[:-1]
         else:
             rewards = discount_with_dones(rewards, dones, self.gamma)
         mb_rewards[n] = rewards
     mb_rewards = mb_rewards.flatten()
     mb_raw_rewards = mb_raw_rewards.flatten()
     #print('mb_rewards:', mb_rewards.shape)
     #print('mb_raw_rewards:', mb_raw_rewards.shape)
     mb_actions = mb_actions.flatten()
     mb_values = mb_values.flatten()
     mb_masks = mb_masks.flatten()
     return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values, mb_raw_rewards
Example #5
    def run(self):
        # We initialize the lists that will contain the mb of experiences
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
        mb_states = self.states
        epinfos = []
        for n in range(self.nsteps):
            # Given observations, take action and value (V(s))
            # We already have self.obs because Runner superclass run self.obs[:] = env.reset() on init
            actions, values, states, _ = self.model.step(self.obs, S=self.states, M=self.dones)

            # Append the experiences
            mb_obs.append(np.copy(self.obs))
            mb_actions.append(actions)
            mb_values.append(values)
            mb_dones.append(self.dones)

            # Take actions in env and look the results
            obs, rewards, dones, infos = self.env.step(actions)
            for info in infos:
                maybeepinfo = info.get('episode')
                if maybeepinfo: epinfos.append(maybeepinfo)
            self.states = states
            self.dones = dones
            self.obs = obs
            mb_rewards.append(rewards)
        mb_dones.append(self.dones)

        # Batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=self.ob_dtype).swapaxes(1, 0).reshape(self.batch_ob_shape)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_actions = np.asarray(mb_actions, dtype=self.model.train_model.action.dtype.name).swapaxes(1, 0)
        mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
        mb_masks = mb_dones[:, :-1]
        mb_dones = mb_dones[:, 1:]


        if self.gamma > 0.0:
            # Discount/bootstrap off value fn
            last_values = self.model.value(self.obs, S=self.states, M=self.dones).tolist()
            for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
                rewards = rewards.tolist()
                dones = dones.tolist()
                if dones[-1] == 0:
                    rewards = discount_with_dones(rewards+[value], dones+[0], self.gamma)[:-1]
                else:
                    rewards = discount_with_dones(rewards, dones, self.gamma)

                mb_rewards[n] = rewards

        mb_actions = mb_actions.reshape(self.batch_action_shape)

        mb_rewards = mb_rewards.flatten()
        mb_values = mb_values.flatten()
        mb_masks = mb_masks.flatten()
        return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values, epinfos
Example #6
    def run(self):
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
        mb_states = self.states

        epinfos = []

        for n in range(self.nsteps):
            actions, values, states, _ = self.model.step(
                self.obs, self.states, self.dones)
            mb_obs.append(np.copy(self.obs))
            mb_actions.append(actions)
            mb_values.append(values)
            mb_dones.append(self.dones)
            obs, rewards, dones, infos = self.env.step(actions)

            for info in infos:
                maybeepinfo = info.get('episode')
                if maybeepinfo: epinfos.append(maybeepinfo)

            self.states = states
            self.dones = dones
            for n, done in enumerate(dones):
                if done:
                    self.obs[n] = self.obs[n] * 0
            self.obs = obs
            mb_rewards.append(rewards)
        mb_dones.append(self.dones)
        #batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(
            self.batch_ob_shape)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
        mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
        mb_masks = mb_dones[:, :-1]
        mb_dones = mb_dones[:, 1:]
        last_values = self.model.value(self.obs, self.states,
                                       self.dones).tolist()
        #discount/bootstrap off value fn
        for n, (rewards, dones,
                value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
            rewards = rewards.tolist()
            dones = dones.tolist()
            if dones[-1] == 0:
                rewards = discount_with_dones(rewards + [value], dones + [0],
                                              self.gamma)[:-1]
            else:
                rewards = discount_with_dones(rewards, dones, self.gamma)
            mb_rewards[n] = rewards
        mb_rewards = mb_rewards.flatten()
        mb_actions = mb_actions.flatten()
        mb_values = mb_values.flatten()
        mb_masks = mb_masks.flatten()
        return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values, epinfos
Example #7
    def run(self):
        """
        Run a learning step of the model

        :return: ([float], [float], [float], [bool], [float], [float])
                 observations, states, rewards, masks, actions, values
        """
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [], [], [], [], []
        mb_states = self.states
        for _ in range(self.n_steps):
            actions, values, states, _ = self.model.step(
                self.obs, self.states, self.dones)
            mb_obs.append(np.copy(self.obs))
            mb_actions.append(actions)
            mb_values.append(values)
            mb_dones.append(self.dones)
            obs, rewards, dones, _ = self.env.step(actions)
            self.states = states
            self.dones = dones
            for n, done in enumerate(dones):
                if done:
                    self.obs[n] = self.obs[n] * 0
            self.obs = obs
            mb_rewards.append(rewards)
        mb_dones.append(self.dones)
        # batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(
            self.batch_ob_shape)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
        mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
        mb_masks = mb_dones[:, :-1]
        mb_dones = mb_dones[:, 1:]
        last_values = self.model.value(self.obs, self.states,
                                       self.dones).tolist()
        # discount/bootstrap off value fn
        for n, (rewards, dones,
                value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
            rewards = rewards.tolist()
            dones = dones.tolist()
            if dones[-1] == 0:
                rewards = discount_with_dones(rewards + [value], dones + [0],
                                              self.gamma)[:-1]
            else:
                rewards = discount_with_dones(rewards, dones, self.gamma)
            mb_rewards[n] = rewards
        mb_rewards = mb_rewards.flatten()
        mb_actions = mb_actions.flatten()
        mb_values = mb_values.flatten()
        mb_masks = mb_masks.flatten()
        return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
Example #8
 def run(self):
     mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
     mb_states = self.states
     for n in range(self.nsteps):
         actions, values, states = self.model.step(self.obs, self.states,
                                                   self.dones)
         mb_obs.append(np.copy(self.obs))
         mb_actions.append(actions)
         mb_values.append(values)
         mb_dones.append(self.dones)
         obs, rewards, dones, _ = self.env.step(actions)
         self.states = states
         self.dones = dones
         for n, done in enumerate(dones):
             if done:
                 self.obs[n] = self.obs[n] * 0
         self.update_obs(obs)
         mb_rewards.append(rewards)
     mb_dones.append(self.dones)
     #batch of steps to batch of rollouts
     mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(
         self.batch_ob_shape)
     mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
     mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
     mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
     mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
     mb_masks = mb_dones[:, :-1]
     mb_dones = mb_dones[:, 1:]
     last_values = self.model.value(self.obs, self.states,
                                    self.dones).tolist()
     #discount/bootstrap off value fn
     for n, (rewards, dones,
             value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
         rewards = rewards.tolist()
         dones = dones.tolist()
         if dones[-1] == 0:
             rewards = discount_with_dones(rewards + [value], dones + [0],
                                           self.gamma)[:-1]
         else:
             rewards = discount_with_dones(rewards, dones, self.gamma)
             episode_values = mb_values[:, n]
             episode_rewards = mb_rewards[:, n]
             mean_value_error = np.absolute(
                 np.subtract(episode_values, episode_rewards))
             self.env.report(mean_value_error)
         mb_rewards[n] = rewards
     mb_rewards = mb_rewards.flatten()
     mb_actions = mb_actions.flatten()
     mb_values = mb_values.flatten()
     mb_masks = mb_masks.flatten()
     return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
Example #9
 def run(self, verbose=False):
     """
     Get batchwise data for training
     """
     mb_obs, mb_rewards, mb_actions, mb_values, mb_dones,mb_states = [],[],[],[],[],[]
     for n in range(self.p.N_STEPS):
         actions, values, _ = self.model.step(self.obs, self.states)
         mb_obs.append(np.copy(self.obs))
         mb_states.append(np.copy(self.states))
         mb_actions.append(actions)
         mb_values.append(values)
         mb_dones.append(self.dones)
         obs, rewards, dones, states = self.env.step(actions,
                                                     verbose=verbose)
         self.dones = dones
         for n, done in enumerate(dones):
             if done:
                 self.obs[n] = self.obs[n] * 0
         self.obs = obs
         self.states = states
         mb_rewards.append(rewards)
     mb_dones.append(self.dones)
     #batch of steps to batch of rollouts
     mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(
         self.p.BATCH_OBS_SHAPE)
     mb_states = np.asarray(mb_states, dtype=np.float).swapaxes(
         1, 0).reshape(self.p.BATCH_STATE_SHAPE)
     mb_rewards = np.asarray(mb_rewards, dtype=np.float).swapaxes(1, 0)
     mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
     mb_values = np.asarray(mb_values, dtype=np.float).swapaxes(1, 0)
     mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
     mb_masks = mb_dones[:, :-1]
     mb_dones = mb_dones[:, 1:]
     last_values = self.model.value(self.obs, self.states,
                                    self.dones).tolist()
     #discount/bootstrap off value fn
     for n, (rewards, dones,
             value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
         rewards = rewards.tolist()
         dones = np.array(dones).astype('int32').tolist()
         if dones[-1] == 0:
             rewards = discount_with_dones(rewards + [value], dones + [0],
                                           self.p.GAMMA_DF)[:-1]
         else:
             rewards = discount_with_dones(rewards, dones, self.p.GAMMA_DF)
         mb_rewards[n] = rewards
     mb_rewards = mb_rewards.flatten()
     mb_actions = mb_actions.flatten()
     mb_values = mb_values.flatten()
     mb_masks = mb_masks.flatten()
     return mb_obs, mb_rewards, mb_masks, mb_actions, mb_values, mb_states
Example #10
    def discount_reward(self, gamma, last_values):
        if gamma > 0.0:
            # Discount/bootstrap off value fn
            for n, (rewards, dones,
                    value) in enumerate(zip(self.r, self.dones, last_values)):
                rewards = rewards.tolist()
                dones = dones.tolist()
                if dones[-1] == 0:
                    rewards = discount_with_dones(rewards + [value],
                                                  dones + [0], gamma)[:-1]
                else:
                    rewards = discount_with_dones(rewards, dones, gamma)

                self.r[n] = rewards
Example #11
 def run(self):
     mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
     mb_states = self.states
     for n in range(self.nsteps):
         actions, values, states = self.model.step(self.stochastic,
                                                   self.obs, self.states,
                                                   self.dones)
         mb_obs.append(np.copy(self.obs))
         mb_actions.append(actions)
         mb_values.append(values)
         mb_dones.append(self.dones)
         obs, rewards, dones, _ = self.env.step(actions)
         self.states = states
         self.dones = dones
         for i, done in enumerate(dones):
             if done:
                 self.obs[i] = 0
         self.update_obs(obs)
         mb_rewards.append(rewards)
     mb_dones.append(self.dones)
     #batch of steps to batch of rollouts
     mb_obs = np.asarray(mb_obs, dtype=self.ob_dtype).swapaxes(1, 0)
     mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
     mb_actions = np.asarray(mb_actions, dtype=self.ac_dtype).swapaxes(1, 0)
     mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
     mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
     mb_masks = mb_dones[:, :-1]
     mb_dones = mb_dones[:, 1:]
     last_values = self.model.value(self.obs, self.states,
                                    self.dones).tolist()
     raw_rewards = mb_rewards.flatten()
     #discount/bootstrap off value fn
     for n, (rewards, dones,
             value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
         rewards = rewards.tolist()
         dones = dones.tolist()
         if dones[-1] == 0:
             rewards = discount_with_dones(rewards + [value], dones + [0],
                                           self.gamma)[:-1]
         else:
             rewards = discount_with_dones(rewards, dones, self.gamma)
         mb_rewards[n] = rewards
     mb_obs = mb_obs.reshape([-1] + list(mb_obs.shape[2:]))
     mb_returns = mb_rewards.flatten()  # because it contains returns
     mb_actions = np.reshape(mb_actions, [-1] + list(mb_actions.shape[2:]))
     mb_values = mb_values.flatten()
     mb_masks = mb_masks.flatten()
     return mb_obs, mb_states, raw_rewards, mb_returns, mb_masks, mb_actions, mb_values
Example #12
    def add_episode(self, trajectory):
        obs = []
        actions = []
        rewards = []
        dones = []

        if self.stack > 1:
            ob_shape = list(trajectory[0][0].shape)
            nc = ob_shape[-1]
            ob_shape[-1] = nc * self.stack
            stacked_ob = np.zeros(ob_shape, dtype=trajectory[0][0].dtype)
        for (ob, action, reward) in trajectory:
            if ob is not None:
                x = self.fn_obs(ob) if self.fn_obs is not None else ob
                if self.stack > 1:
                    stacked_ob = np.roll(stacked_ob, shift=-nc, axis=2)
                    stacked_ob[:, :, -nc:] = x
                    obs.append(stacked_ob)
                else:
                    obs.append(x)
            else:
                obs.append(None)
            actions.append(action)
            rewards.append(
                self.fn_reward(reward)
            ) if self.fn_reward is not None else rewards.append(reward)
            dones.append(False)
        dones[len(dones) - 1] = True
        returns = discount_with_dones(rewards, dones, self.gamma)
        for (ob, action, R) in list(zip(obs, actions, returns)):
            self.buffer.add(ob, action, R)
Example #13
 def run(self):
     mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
     mb_states = self.states
     for n in range(self.nsteps):  #run tmax steps
         actions, values, states = self.model.step(self.obs, self.states,
                                                   self.dones)
         mb_obs.append(np.copy(self.obs))
         mb_actions.append(actions)
         mb_values.append(values)
         mb_dones.append(self.dones)
         obs, rewards, dones, _ = self.env.step(
             actions)  #Next obs after taking the step
         self.states = states
         self.dones = dones
         for n, done in enumerate(dones):
             if done:
                 self.obs[n] = self.obs[n] * 0  #reset
         self.update_obs(obs)  #Roll the obs. keeps latest 4 frames
         mb_rewards.append(rewards)
     mb_dones.append(self.dones)
     #batch of steps to batch of rollouts
     mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(
         self.batch_ob_shape)
     mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
     mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
     mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
     mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
     mb_masks = mb_dones[:, :-1]
     mb_dones = mb_dones[:, 1:]
     last_values = self.model.value(self.obs, self.states,
                                    self.dones).tolist()
     #discount/bootstrap off value fn
     for n, (rewards, dones,
             value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
         rewards = rewards.tolist()
         dones = dones.tolist()
         if dones[-1] == 0:
             rewards = discount_with_dones(rewards + [value], dones + [0],
                                           self.gamma)[:-1]
         else:
             rewards = discount_with_dones(rewards, dones, self.gamma)
         mb_rewards[n] = rewards
     mb_rewards = mb_rewards.flatten()
     mb_actions = mb_actions.flatten()
     mb_values = mb_values.flatten()
     mb_masks = mb_masks.flatten()
     return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
Example #14
 def run(self):
     mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
     mb_states = self.states
     for n in range(self.nsteps):
         actions, values, states = self.model.step(self.obs, self.states, self.dones)
         mb_obs.append(np.copy(self.obs))
         mb_actions.append(actions)
         mb_values.append(values)
         mb_dones.append(self.dones)
         obs, rewards, dones, _ = self.env.step(actions)
         self.states = states
         self.dones = dones
         for n, done in enumerate(dones):
             if done:
                 self.obs[n] = self.obs[n]*0
         self.update_obs(obs)
         mb_rewards.append(rewards)
     mb_dones.append(self.dones)
     #batch of steps to batch of rollouts
     mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
     mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
     mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
     mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
     mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
     mb_masks = mb_dones[:, :-1]
     mb_dones = mb_dones[:, 1:]
     last_values = self.model.value(self.obs, self.states, self.dones).tolist()
     #discount/bootstrap off value fn
     for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
         rewards = rewards.tolist()
         dones = dones.tolist()
         if dones[-1] == 0:
             rewards = discount_with_dones(rewards+[value], dones+[0], self.gamma)[:-1]
         else:
             rewards = discount_with_dones(rewards, dones, self.gamma)
         mb_rewards[n] = rewards
     mb_rewards = mb_rewards.flatten()
     mb_actions = mb_actions.flatten()
     mb_values = mb_values.flatten()
     mb_masks = mb_masks.flatten()
     return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
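Example #14 is the plain A2C runner; its six return values map one-to-one onto the training step. A typical call site, sketched after the Baselines A2C learn loop (names assumed, not part of the example above):

    # one update of an A2C training loop (hypothetical call site)
    obs, states, rewards, masks, actions, values = runner.run()
    policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)

Here rewards already holds the discounted/bootstrapped returns computed above, and masks carries the previous-step done flags used to reset any recurrent policy state.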
Example #15
    def run(self):
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [],[],[],[],[],[]
        mb_batchactions = []
        mb_states = self.states
        epinfos = []
        for _ in range(self.nsteps):
            actions, values, self.states, neglogpacs, batchactions = self.model.step(
                self.obs, self.states, self.dones)
            mb_obs.append(self.obs.copy())
            mb_actions.append(actions)
            mb_values.append(values)
            mb_neglogpacs.append(neglogpacs)
            mb_dones.append(self.dones)
            mb_batchactions.append(batchactions)
            self.obs[:], rewards, self.dones, infos = self.env.step(actions)
            for info in infos:
                maybeepinfo = info.get('episode')

                if maybeepinfo: epinfos.append(maybeepinfo)
            mb_rewards.append(rewards)
        #batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_actions = np.asarray(mb_actions)
        mb_values = np.asarray(mb_values, dtype=np.float32)
        mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
        mb_dones = np.asarray(mb_dones, dtype=np.bool)
        mb_batchactions = np.asarray(mb_batchactions, dtype=np.float32)
        last_values = self.model.value(self.obs, self.states, self.dones)
        #discount/bootstrap off value fn
        if self.gae is True:
            mb_returns = np.zeros_like(mb_rewards)
            mb_advs = np.zeros_like(mb_rewards)
            lastgaelam = 0
            for t in reversed(range(self.nsteps)):
                if t == self.nsteps - 1:
                    nextnonterminal = 1.0 - self.dones
                    nextvalues = last_values
                else:
                    nextnonterminal = 1.0 - mb_dones[t + 1]
                    nextvalues = mb_values[t + 1]
                delta = mb_rewards[
                    t] + self.gamma * nextvalues * nextnonterminal - mb_values[
                        t]
                mb_advs[
                    t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam
            mb_returns = mb_advs + mb_values
        else:
            mb_returns = discount_with_dones(mb_rewards, mb_dones, self.gamma)
            mb_returns = np.array(mb_returns)
        return (*map(sf01,
                     (mb_obs, mb_returns, mb_dones, mb_actions, mb_values,
                      mb_neglogpacs, mb_batchactions)), mb_states, epinfos)
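Example #15 is the only runner here that can swap the discount_with_dones return for Generalized Advantage Estimation. The inline GAE loop above can be read as the following standalone sketch (hypothetical helper name, same recursion):

    import numpy as np

    def gae_returns(rewards, values, dones, last_values, last_dones, gamma, lam):
        # rewards, values, dones have shape (nsteps, nenv); last_* describe the
        # state after the final step of the rollout.
        nsteps = rewards.shape[0]
        advs = np.zeros_like(rewards)
        lastgaelam = 0.0
        for t in reversed(range(nsteps)):
            if t == nsteps - 1:
                nextnonterminal = 1.0 - last_dones
                nextvalues = last_values
            else:
                nextnonterminal = 1.0 - dones[t + 1]
                nextvalues = values[t + 1]
            # TD residual: delta_t = r_t + gamma * V(s_{t+1}) * (1 - done) - V(s_t)
            delta = rewards[t] + gamma * nextvalues * nextnonterminal - values[t]
            advs[t] = lastgaelam = delta + gamma * lam * nextnonterminal * lastgaelam
        return advs + values  # returns = advantages + value baseline

Note that Example #15 keeps the rollout time-major and only reorders axes with sf01 at the very end, whereas the other runners transpose to (nenv, nsteps) with swapaxes(1, 0) before discounting.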
Example #16
def learn_hierarchical(env, batch_size, total_epoches, gamma, g_ob_size,
                       g_act_size, g_latents, g_lr, g_ac, l_ob_size,
                       l_act_size, l_latents, l_lr, l_ac):
    """
    Learn a hierarchical model
    """

    model = Hierarchical(g_ob_size, g_act_size, g_latents, g_lr, g_ac,
                         l_ob_size, l_act_size, l_latents, l_lr, l_ac)

    mb_obs, mb_acts, mb_rews, mb_vals, mb_dones = [], [], [], [], []

    tqdm.write('training hierarchical model')

    for ep in tqdm(range(total_epoches)):
        obs = env.reset(model.local_model,
                        model.predictor)  # initial observation
        done = False

        while not done:
            action = model.step(obs)  # RL choose action based on observation
            value = model.value(obs)
            action = int(action)
            value = int(value)
            mb_obs.append(obs)
            mb_acts.append(action)
            mb_vals.append(value)

            obs_, reward, done, info = env.step(action)
            mb_rews.append(reward)
            mb_dones.append(done)

            obs = obs_

            if ep % batch_size == 0:
                mb_rews = discount_with_dones(mb_rews, mb_dones, gamma)
                model.global_model.train(np.array(mb_obs), np.array(mb_acts),
                                         np.array(mb_rews), np.array(mb_vals))
                mb_obs, mb_acts, mb_rews, mb_vals, mb_dones = [], [], [], [], []

    return model
Example #17
    def run(self):
        # curiosity = True
        # curiosity = False

        # We initialize the lists that will contain the mb of experiences
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_next_states = [],[],[],[],[],[]
        mb_states = self.states
        icm_testing_rewards = []
        for n in range(self.nsteps):
            # Given observations, take action and value (V(s))
            # We already have self.obs because Runner superclass run self.obs[:] = env.reset() on init
            actions, values, states, _ = self.model.step(self.obs,
                                                         S=self.states,
                                                         M=self.dones)

            # Append the experiences
            mb_obs.append(np.copy(self.obs))
            mb_actions.append(actions)
            mb_values.append(values)
            mb_dones.append(self.dones)

            if self.curiosity == True:
                icm_states = self.obs

            # Take actions in env and look the results
            obs, rewards, dones, _ = self.env.step(actions)
            # print("received Rewards from step function ")

            # print("received Rewards ",rewards)
            if self.curiosity == True:
                icm_next_states = obs

                icm_rewards = self.icm.calculate_intrinsic_reward(
                    icm_states, icm_next_states, actions)
                # print("shape of icm rewards ",np.shape(icm_rewards))
                icm_testing_rewards.append(icm_rewards)
                # icm_rewards = [icm_rewards] * len(rewards)

                # icm_rewards = icm_rewards * 2
                # print("intrinsic Reward : ",icm_rewards)

                # icm_rewards = np.clip(icm_rewards,-constants['REWARD_CLIP'], constants['REWARD_CLIP'])

                # print("icm _ rewards : ",icm_rewards)

                # rewards = icm_rewards  + rewards
                # print("Rewards icm {} , commulative reward {} ".format(icm_rewards , rewards))

                # rewards = np.clip(rewards,-constants['REWARD_CLIP'], +constants['REWARD_CLIP'])
                # print("icm rewards ", rewards)

                # print("calculated rewards ",rewards)

            mb_next_states.append(np.copy(obs))
            self.states = states
            self.dones = dones
            for n, done in enumerate(dones):
                if done:
                    self.obs[n] = self.obs[n] * 0
            self.obs = obs
            mb_rewards.append(rewards)
        mb_dones.append(self.dones)

        # Batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=self.ob_dtype).swapaxes(
            1, 0).reshape(self.batch_ob_shape)
        mb_next_states = np.asarray(mb_next_states,
                                    dtype=self.ob_dtype).swapaxes(
                                        1, 0).reshape(self.batch_ob_shape)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        # > testing mean std of rewards
        if self.curiosity:
            icm_testing_rewards = np.asarray(icm_testing_rewards,
                                             dtype=np.float32).swapaxes(1, 0)
            # print("Icm rewards" ,icm_testing_rewards)
        # > testing mean std of rewards
        mb_actions = np.asarray(
            mb_actions,
            dtype=self.model.train_model.action.dtype.name).swapaxes(1, 0)
        mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
        mb_masks = mb_dones[:, :-1]
        mb_dones = mb_dones[:, 1:]

        # > passing reward to reward forward filter

        # print("Merged things obs {} rewards {} actions {} dones {}".
        # format(np.shape(mb_obs) , np.shape(mb_rewards) , np.shape(mb_actions) , np.shape(mb_dones)))

        # >
        # rffs = np.array([self.rff.update(rew) for rew in mb_rewards.T])

        if self.curiosity == True:
            rffs = np.array(
                [self.rff.update(rew) for rew in icm_testing_rewards.T])
            rffs_mean, rffs_std, rffs_count = mpi_moments(rffs.ravel())
            self.rff_rms.update_from_moments(rffs_mean, rffs_std**2,
                                             rffs_count)
            rews = icm_testing_rewards / np.sqrt(self.rff_rms.var)

            # mb_rewards = rews

            mb_rewards = mb_rewards + rews

            # now clipping the reward (-1,1)

            # mb_rewards = np.clip(mb_rewards,-constants['REWARD_CLIP'], constants['REWARD_CLIP'])
            # print(mb_rewards)
            # print(" shape of normalized reward ", np.shape(rews))

            # icm_testing_rewards = (icm_testing_rewards >  rffs_mean).astype(np.float32)
            # np.place(icm_testing_rewards, icm_testing_rewards > 0, 0.2)

        # np.interp(icm_testing_rewards.ravel() , (rffs_mean+)  , ())

        # icm_testing_rewards = icm_testing_rewards.ravel()
        # print("\n\nIcm Rewards : ",icm_testing_rewards)

        # print(" icm testing rewards ")
        # print("icm testing reward : mean {} , std {} , division {} ".format(rffs_mean , rffs_std , ((rffs_mean + rffs_std)/2 ) ) )

        # print("ICM testing rewards " , icm_testing_rewards)

        # icm_testing_rewards[icm_testing_rewards > rffs_mean] = 0.5
        # icm_testing_rewards[icm_testing_rewards < rffs_mean] = 0
        # icm_testing_rewards[icm_testing_rewards < rffs_mean] = 0
        # print("icm rewards ", icm_testing_rewards)

        # mb_rewards = icm_testing_rewards + mb_rewards

        # print( mb_rewards)
        # mb_rewards = mb_rewards[mb_rewards > 1]
        # mb_rewards = [1 if mb_rewards[mb_rewards >1 ] else 1]
        # mb_rewards[mb_rewards > 1] = 1
        # mask = mb_rewards[((icm_testing_rewards + mb_rewards ) % 2) == 0]

        # print("Mask ",mask)
        # mb_rewards[mask == 0] = 1

        # print("Mb reward ",mb_rewards )

        # print("Icm Rewards : ",icm_testing_rewards)
        # self.rff_rms.update_from_moments(rffs_mean, rffs_std ** 2, rffs_count)
        # rews = mb_rewards / np.sqrt(self.rff_rms.var)
        # >

        # print("update : rffs_mean {} , rffs_std {} , rffs_count {} ".format(
        # np.shape(rffs_mean),np.shape(rffs_std),np.shape(rffs_count)))

        # print(" update :  final rews {} rff_rms.var {} ".format(
        # rews , np.shape(self.rff_rms.var)))

        # print(">> the shape of rffs testing ", np.shape(rffs))

        # mb_rewards_copy = mb_rewards

        if self.curiosity == True:
            if self.gamma > 0.0:
                # Discount/bootstrap off value fn
                last_values = self.model.value(self.obs,
                                               S=self.states,
                                               M=self.dones).tolist()
                for n, (rewards, dones, value) in enumerate(
                        zip(mb_rewards, mb_dones, last_values)):
                    rewards = rewards.tolist()
                    dones = dones.tolist()
                    # if dones[-1] == 0:
                    rewards = discount_with_dones(rewards + [value],
                                                  dones + [0], self.gamma)[:-1]
                    # else:
                    # rewards = discount_with_dones(rewards, dones, self.gamma)

                    mb_rewards[n] = rewards
        else:
            # print(" Before discount_with_dones ")
            # print("Rewards " , mb_rewards)

            # print("Before rewards and values ")
            # print("Reward {} values {} ".format(mb_rewards , mb_values))
            if self.gamma > 0.0:
                # Discount/bootstrap off value fn
                last_values = self.model.value(self.obs,
                                               S=self.states,
                                               M=self.dones).tolist()
                for n, (rewards, dones, value) in enumerate(
                        zip(mb_rewards, mb_dones, last_values)):
                    rewards = rewards.tolist()
                    dones = dones.tolist()
                    if dones[-1] == 0:
                        rewards = discount_with_dones(rewards + [value],
                                                      dones + [0],
                                                      self.gamma)[:-1]
                    else:
                        rewards = discount_with_dones(rewards, dones,
                                                      self.gamma)

                    mb_rewards[n] = rewards

        # print(" After discount_with_dones ")
        # print("Orgnal discounterd Rewards " , np.shape(mb_rewards))

        # rffs_mean, rffs_std, rffs_count = mpi_moments(mb_rewards.ravel())
        # self.rff_rms.update_from_moments(rffs_mean, rffs_std ** 2, rffs_count)
        # mb_rewards = mb_rewards_copy / np.sqrt(self.rff_rms.var)

        mb_actions = mb_actions.reshape(self.batch_action_shape)

        mb_rewards = mb_rewards.flatten()
        mb_values = mb_values.flatten()
        mb_masks = mb_masks.flatten()

        if self.curiosity == True:
            mb_rews_icm = rews.flatten()

        # mb_new_updated_reward = mb_rews_icm + mb_rewards

        # print("New udpated rewards ",mb_new_updated_reward)

        # rffs_mean, rffs_std, rffs_count = mpi_moments(mb_new_updated_reward.ravel())
        # self.rff_rms.update_from_moments(rffs_mean, rffs_std ** 2, rffs_count)
        # rews = mb_new_updated_reward / np.sqrt(self.rff_rms.var)

        # print("After normalized",rews)

        # mb_new_rew = rews.flatten()

        # print("Flatten rewards and values ")
        # print("Reward {} ".format(mb_rewards ))

        # print("Merged things after obs {} rewards {} actions {} masks {}".
        # format(np.shape(mb_obs) , np.shape(mb_rewards) , np.shape(mb_actions) , np.shape(mb_masks)))

        return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values, mb_next_states  # , mb_rews_icm, mb_new_updated_reward #, mb_new_rew
Example #18
    def run(self):
        mb_obs, mb_options, mb_rewards, mb_actions, mb_values, mb_dones, mb_costs = [],[],[],[],[],[],[]

        for n in range(self.nsteps):
            actions, values = self.model.step(self.obs, self.options)

            mb_obs.append(np.copy(self.obs))
            mb_options.append(np.copy(self.options))
            mb_actions.append(actions)
            mb_values.append(values)
            mb_dones.append(self.dones)

            obs, rewards, dones, _ = self.env.step(actions)

            self.dones = dones

            for n, done in enumerate(dones):
                if done:
                    self.obs[n] = self.obs[n] * 0

            self.update_obs(obs)

            # Update current option
            self.options, costs = self.model.update_options(
                self.obs, self.options, self.option_eps, self.delib_cost)
            mb_costs.append(costs)

            mb_rewards.append(rewards)

        mb_dones.append(self.dones)

        #batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(
            self.batch_ob_shape)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
        mb_options = np.asarray(mb_options, dtype=np.int32).swapaxes(1, 0)
        mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
        mb_costs = np.asarray(mb_costs, dtype=np.float32).swapaxes(1, 0)

        mb_dones = mb_dones[:, 1:]
        last_values = self.model.value(self.obs).tolist()

        #discount/bootstrap off value fn
        for n, (rewards, dones,
                value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
            rewards = rewards.tolist()
            dones = dones.tolist()

            if dones[-1] == 0:
                rewards = discount_with_dones(rewards + [value[0]],
                                              dones + [0], self.gamma)[:-1]
            else:
                rewards = discount_with_dones(rewards, dones, self.gamma)

            mb_rewards[n] = rewards

        mb_rewards = mb_rewards.flatten()
        mb_actions = mb_actions.flatten()
        mb_options = mb_options.flatten()
        mb_values = mb_values.flatten()
        mb_costs = mb_costs.flatten()

        return mb_obs, mb_options, mb_rewards, mb_actions, mb_values, mb_costs
Example #19
  def run(self):
    mb_obs, mb_td_targets, mb_base_actions, \
    mb_xy0, mb_xy1, \
    mb_values, mb_dones \
      = [], [], [], [], [], [], []

    mb_states = self.states
    for n in range(self.nsteps):
      # pi, pi2, x1, y1, x2, y2, v0
      pi1, pi_xy0, pi_xy1, values, states = self.model.step(
          self.obs, self.states, self.dones)

      pi1_noise = np.random.random_sample((self.nenv, 3)) * 0.3

      base_actions = np.argmax(
          pi1 * self.base_act_mask + pi1_noise, axis=1)
      xy0 = np.argmax(pi_xy0, axis=1)

      x0 = (xy0 % 32).astype(int)
      y0 = (xy0 / 32).astype(int)

      xy1 = np.argmax(pi_xy1, axis=1)
      x1 = (xy1 % 32).astype(int)
      y1 = (xy1 / 32).astype(int)

      # Scripted Agent Hacking

      for env_num in range(self.nenv):
        if env_num >= self.nscripts:  # only for scripted agents
          continue

        ob = self.obs[env_num, :, :, :]
        player_relative = ob[:, :, -1]

        self.group_list[env_num] = common.update_group_list2(
            self.control_groups[env_num])

        if len(self.action_queue[env_num]) == 0:

          self.action_queue[env_num], self.group_id[env_num], self.dest_per_marine[env_num], self.xy_per_marine[env_num] = \
            common.solve_tsp(player_relative,
                             self.selected[env_num][0],
                             self.group_list[env_num],
                             self.group_id[env_num],
                             self.dest_per_marine[env_num],
                             self.xy_per_marine[env_num])

        base_actions[env_num] = 0
        x0[env_num] = 0
        y0[env_num] = 0
        x1[env_num] = 0
        y1[env_num] = 0

        if len(self.action_queue[env_num]) > 0:
          action = self.action_queue[env_num].pop(0)
          base_actions[env_num] = action.get("base_action", 0)

          x0[env_num] = action.get("x0", 0)
          y0[env_num] = action.get("y0", 0)
          xy0[env_num] = y0[env_num] * 32 + x0[env_num]

          x1[env_num] = action.get("x1", 0)
          y1[env_num] = action.get("y1", 0)
          xy1[env_num] = y1[env_num] * 32 + x1[env_num]

      base_actions = self.valid_base_action(base_actions)
      new_base_actions = self.trans_base_actions(base_actions)

      base_action_spec = self.env.action_spec(new_base_actions)
      # print("base_actions:", base_actions)
      actions = self.construct_action(
          base_actions,
          base_action_spec,
          x0,
          y0,
          x1,
          y1
      )

      mb_obs.append(np.copy(self.obs))
      mb_base_actions.append(base_actions)

      mb_xy0.append(xy0)
      mb_xy1.append(xy1)
      mb_values.append(values)
      mb_dones.append(self.dones)

      #print("final acitons : ", actions)
      obs, rewards, dones,\
      available_actions, army_counts,\
      control_groups, selected, xy_per_marine\
      = self.env.step(
          actions=actions)
      self.army_counts = army_counts
      self.control_groups = control_groups
      self.selected = selected
      for env_num, data in enumerate(xy_per_marine):
        self.xy_per_marine[env_num] = data
      self.update_available(available_actions)

      self.states = states
      self.dones = dones
      mean_100ep_reward_a2c = 0
      for n, done in enumerate(dones):
        self.total_reward[n] += float(rewards[n])
        if done:
          self.obs[n] = self.obs[n] * 0
          self.episodes += 1
          num_episodes = self.episodes
          self.episode_rewards.append(self.total_reward[n])

          model = self.model
          mean_100ep_reward = round(
              np.mean(self.episode_rewards[-101:]), 1)
          if (n < self.nscripts):  # scripted agents
            self.episode_rewards_script.append(
                self.total_reward[n])
            mean_100ep_reward_script = round(
                np.mean(self.episode_rewards_script[-101:]), 1)
            nsml.report(
                reward_script=self.total_reward[n],
                mean_reward_script=mean_100ep_reward_script,
                reward=self.total_reward[n],
                mean_100ep_reward=mean_100ep_reward,
                episodes=self.episodes,
                step=self.episodes,
                scope=locals()
            )
          else:
            self.episode_rewards_a2c.append(self.total_reward[n])
            mean_100ep_reward_a2c = round(
                np.mean(self.episode_rewards_a2c[-101:]), 1)
            nsml.report(
                reward_a2c=self.total_reward[n],
                mean_reward_a2c=mean_100ep_reward_a2c,
                reward=self.total_reward[n],
                mean_100ep_reward=mean_100ep_reward,
                episodes=self.episodes,
                step=self.episodes,
                scope=locals()
            )
            print("mean_100ep_reward_a2c", mean_100ep_reward_a2c)

          if self.callback is not None:
            self.callback(locals(), globals())
          self.total_reward[n] = 0
          self.group_list[n] = []


      self.update_obs(obs)
      mb_td_targets.append(rewards)
    mb_dones.append(self.dones)
    #batch of steps to batch of rollouts
    mb_obs = np.asarray(
        mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(
            self.batch_ob_shape)
    mb_td_targets = np.asarray(mb_td_targets, dtype=np.float32).swapaxes(1, 0)
    mb_base_actions = np.asarray(
        mb_base_actions, dtype=np.int32).swapaxes(1, 0)

    mb_xy0 = np.asarray(mb_xy0, dtype=np.int32).swapaxes(1, 0)
    mb_xy1 = np.asarray(mb_xy1, dtype=np.int32).swapaxes(1, 0)

    mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
    mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
    mb_masks = mb_dones[:, :-1]
    mb_dones = mb_dones[:, 1:]
    last_values = self.model.value(self.obs, self.states,
                                   self.dones).tolist()
    #discount/bootstrap off value fn
    for n, (rewards, dones, value) in enumerate(
        zip(mb_td_targets, mb_dones, last_values)):
      rewards = rewards.tolist()
      dones = dones.tolist()
      if dones[-1] == 0:
        rewards = discount_with_dones(rewards + [value], dones + [0],
                                      self.gamma)[:-1]
      else:
        rewards = discount_with_dones(rewards, dones, self.gamma)
      mb_td_targets[n] = rewards
    mb_td_targets = mb_td_targets.flatten()
    mb_base_actions = mb_base_actions.flatten()
    mb_xy0 = mb_xy0.flatten()
    mb_xy1 = mb_xy1.flatten()

    mb_values = mb_values.flatten()
    mb_masks = mb_masks.flatten()
    return mb_obs, mb_states, mb_td_targets, mb_masks, \
           mb_base_actions, mb_xy0, mb_xy1, mb_values
Example #20
####################### modify the dataframe dimension ########################
    
    if (sample_time-1) % 10 == 0:
        IsPlot = True
    else:
        IsPlot = False
    
    if (sample_time % train_freq == 0):
        states = np.vstack(states)
        actions_idx = np.vstack(actions_idx)
        actions = np.array(actions)
        
        rewards_tmp = rewards.copy()
        last_value = expected_sarsa(model,last_state,K,C,action_low,action_high,False,random_choose,num=100)
        rewards_tmp.append(last_value)
        Q_target = discount_with_dones(rewards_tmp, dones+[last_done], gamma)
        Q_target = np.float32(np.vstack(Q_target))[:-1]
        
        R_buffer_sample = replay_buffer.sample(np.min([minibatch,timestep]))
        next_states_sampled = np.squeeze(R_buffer_sample[3], axis=1)
        dones_sampled = R_buffer_sample[4]
        reward_sampled = R_buffer_sample[2]
        
        last_v = [expected_sarsa(model,np.reshape(state_tmp,(1,-1)),K,C,action_low,action_high,True,random_choose,num=100) for state_tmp in next_states_sampled]
        last_v = np.vstack(last_v)
        Q_target_hist = reward_sampled + last_v * (1-dones_sampled) * gamma
        
        states_sampled1 = np.squeeze(R_buffer_sample[0], axis=1)
        states_sampled2 = states
        states_sampled = np.concatenate((states_sampled1,states_sampled2), axis = 0)
        actions_sampled1 = R_buffer_sample[1]
Example #21
 def run(self):
     mb_obs, mb_r_ex, mb_r_in, mb_ac, mb_v_ex, mb_v_mix, mb_dones = [],[],[],[],[],[],[]
     mb_policy_states = []
     ep_info, ep_r_ex, ep_r_in, ep_len = [], [], [], []
     for n in range(self.nsteps):
         mb_policy_states.append(self.policy_states)
         ac, v_ex, v_mix, policy_states, _ = self.model.step(
             self.obs, self.policy_states, self.dones)
         mb_obs.append(np.copy(self.obs))
         mb_ac.append(ac)
         mb_v_ex.append(v_ex)
         mb_v_mix.append(v_mix)
         mb_dones.append(self.dones)
         obs, r_ex, dones, infos = self.env.step(ac)
         r_in = self.model.intrinsic_reward(self.obs, ac)
         mb_r_ex.append(r_ex)
         mb_r_in.append(r_in)
         self.policy_states = policy_states
         self.dones = dones
         self.ep_r_ex += r_ex
         self.ep_r_in += r_in
         self.ep_len += 1
         for info in infos:
             maybeepinfo = info.get('episode')
             if maybeepinfo:
                 ep_info.append(maybeepinfo)
         for n, done in enumerate(dones):
             if done:
                 self.obs[n] = self.obs[n] * 0
                 ep_r_ex.append(self.ep_r_ex[n])
                 ep_r_in.append(self.ep_r_in[n])
                 ep_len.append(self.ep_len[n])
                 self.ep_r_ex[n], self.ep_r_in[n], self.ep_len[n] = 0, 0, 0
         self.obs = obs
     mb_dones.append(self.dones)
     #batch of steps to batch of rollouts
     mb_obs = np.asarray(mb_obs, dtype=np.float32).swapaxes(1, 0).reshape(
         self.batch_ob_shape)
     mb_r_ex = np.asarray(mb_r_ex, dtype=np.float32).swapaxes(1, 0)
     mb_r_in = np.asarray(mb_r_in, dtype=np.float32).swapaxes(1, 0)
     mb_r_mix = self.r_ex_coef * mb_r_ex + self.r_in_coef * mb_r_in
     mb_ac = np.asarray(mb_ac, dtype=np.int32).swapaxes(1, 0)
     mb_v_ex = np.asarray(mb_v_ex, dtype=np.float32).swapaxes(1, 0)
     mb_v_mix = np.asarray(mb_v_mix, dtype=np.float32).swapaxes(1, 0)
     mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
     mb_masks = mb_dones[:, :-1]
     mb_dones = mb_dones[:, 1:]
     last_v_ex, last_v_mix = self.model.value(self.obs, self.policy_states,
                                              self.dones)
     last_v_ex, last_v_mix = last_v_ex.tolist(), last_v_mix.tolist()
     #discount/bootstrap off value fn
     mb_ret_ex, mb_ret_mix = np.zeros(mb_r_ex.shape), np.zeros(
         mb_r_mix.shape)
     for n, (r_ex, r_mix, dones, v_ex, v_mix) in enumerate(
             zip(mb_r_ex, mb_r_mix, mb_dones, last_v_ex, last_v_mix)):
         r_ex, r_mix = r_ex.tolist(), r_mix.tolist()
         dones = dones.tolist()
         if dones[-1] == 0:
             ret_ex = discount_with_dones(r_ex + [v_ex], dones + [0],
                                          self.gamma)[:-1]
             ret_mix = discount_with_dones(r_mix + [v_mix], dones + [0],
                                           self.gamma)[:-1]
         else:
             ret_ex = discount_with_dones(r_ex, dones, self.gamma)
             ret_mix = discount_with_dones(r_mix, dones, self.gamma)
         mb_ret_ex[n], mb_ret_mix[n] = ret_ex, ret_mix
     mb_r_ex = mb_r_ex.flatten()
     mb_r_in = mb_r_in.flatten()
     mb_ret_ex = mb_ret_ex.flatten()
     mb_ret_mix = mb_ret_mix.flatten()
     mb_ac = mb_ac.flatten()
     mb_v_ex = mb_v_ex.flatten()
     mb_v_mix = mb_v_mix.flatten()
     mb_masks = mb_masks.flatten()
     mb_dones = mb_dones.flatten()
     return mb_obs, mb_ac, mb_policy_states, mb_r_in, mb_r_ex, mb_ret_ex, mb_ret_mix,\
            mb_v_ex, mb_v_mix, last_v_ex, last_v_mix, mb_masks, mb_dones,\
            ep_info, ep_r_ex, ep_r_in, ep_len
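What distinguishes the runner above is the mixed reward stream, r_mix = r_ex_coef * r_ex + r_in_coef * r_in, which is then discounted exactly like the extrinsic rewards. A small sketch of that step for a single environment, reusing the discount_with_dones helper sketched earlier (the default coefficients are placeholders, not the repository's values):

import numpy as np

def mixed_returns(r_ex, r_in, dones, last_v_mix, gamma,
                  r_ex_coef=1.0, r_in_coef=0.01):
    # Blend extrinsic and intrinsic rewards, then bootstrap off the mixed value.
    r_mix = (r_ex_coef * np.asarray(r_ex) + r_in_coef * np.asarray(r_in)).tolist()
    dones = list(dones)
    if dones[-1] == 0:
        return discount_with_dones(r_mix + [last_v_mix], dones + [0], gamma)[:-1]
    return discount_with_dones(r_mix, dones, gamma)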
Example no. 22
0
    def run(self):
        self.tot_rewards = []
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
        mb_rs, mb_rr = [], []
        for n in range(self.nsteps):
            actions, values, rs, rr = self.model.act(self.obs)

            actions = np.array(actions)
            values = np.array(values)
            mb_rs.append(rs)
            mb_rr.append(rr)

            mb_obs.append(np.copy(self.obs))
            mb_actions.append(actions)
            mb_values.append(values)
            mb_dones.append(self.dones)
            obs, rewards, dones, _ = self.env.step(actions)

            self.ep_rewards += rewards

            self.dones = dones
            for n, done in enumerate(dones):
                if done:
                    self.tot_rewards.append(self.ep_rewards[n])
                    self.ep_rewards[n] = 0
                    self.obs[n] = self.obs[n] * 0
            self.obs = obs
            mb_rewards.append(rewards)
        mb_dones.append(self.dones)

        #batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(
            self.batch_ob_shape)
        mb_rs = np.asarray(mb_rs, dtype=np.uint8).swapaxes(1, 0).reshape(
            self.batch_rs_shape)
        mb_rr = np.asarray(mb_rr, dtype=np.uint8).swapaxes(1, 0).reshape(
            self.batch_rr_shape)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
        mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
        mb_masks = mb_dones[:, :-1]
        mb_dones = mb_dones[:, 1:]
        last_values = []
        for i in range(self.nenv):
            last_values.append(
                self.model.value(np.expand_dims(self.obs[i], axis=0)).tolist())
        #discount/bootstrap off value fn
        for n, (rewards, dones,
                value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
            rewards = rewards.tolist()
            dones = dones.tolist()
            if dones[-1] == 0:
                rewards = discount_with_dones(rewards + [value], dones + [0],
                                              self.gamma)[:-1]
            else:
                rewards = discount_with_dones(rewards, dones, self.gamma)
            mb_rewards[n] = rewards
        mb_rewards = mb_rewards.flatten()
        mb_actions = mb_actions.flatten()
        mb_values = mb_values.flatten()
        mb_masks = mb_masks.flatten()

        ep_reward_means = np.mean(
            self.tot_rewards) if len(self.tot_rewards) > 0 else None
        return mb_obs, mb_rs, mb_rr, mb_rewards, mb_masks, mb_actions, mb_values, ep_reward_means
Example no. 23
0
    def run(self):
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_decoded = [],[],[],[],[],[]
        mb_states = self.states
        if self.render:
            plt.ion()  ## Note this correction
            fig = plt.figure(figsize=(5,4))
            axes = []
            axes.append(fig.add_subplot(3,4,1))
            axes.append(fig.add_subplot(3,4,2))
            axes.append(fig.add_subplot(3,4,3))
            axes.append(fig.add_subplot(3,4,4))

            axes.append(fig.add_subplot(3,1,2))

            axes.append(fig.add_subplot(3,4,9))
            axes.append(fig.add_subplot(3,4,10))
            axes.append(fig.add_subplot(3,4,11))
            axes.append(fig.add_subplot(3,4,12))
        for n in range(self.nsteps):
            actions, values, decoded, encoded, states = self.model.step(self.obs, self.states, self.dones)
            if self.render:
                obs = self.obs[0]
                obs = np.swapaxes(obs, 0, 2)
                obs = np.swapaxes(obs, 1, 2)
                imgs = decoded[0]
                imgs = np.swapaxes(imgs, 0, 2)
                imgs = np.swapaxes(imgs, 1, 2)
                for i in range(len(imgs)):
                    ob = obs[i]
                    axes[i].clear()
                    axes[i].set_yticklabels([])
                    axes[i].imshow(ob, cmap='gray', interpolation='nearest', aspect='equal')
                    img = imgs[i]
                    axes[i+5].clear()
                    axes[i+5].set_yticklabels([])
                    axes[i+5].imshow(img, cmap='gray', interpolation='nearest', aspect='equal')
                axes[4].clear()
                axes[4].imshow(encoded[0].reshape(8,98), interpolation='nearest', aspect='equal')
                plt.show()
                plt.pause(0.000001)  # Note this correction
            mb_obs.append(np.copy(self.obs))
            mb_decoded.append(decoded)
            mb_actions.append(actions)
            mb_values.append(values)
            mb_dones.append(self.dones)
            obs, rewards, dones, _ = self.env.step(actions)
            self.states = states
            self.dones = dones
            for n, done in enumerate(dones):
                if done:
                    self.obs[n] = self.obs[n]*0
            self.update_obs(obs)
            mb_rewards.append(rewards)
        mb_dones.append(self.dones)
        #batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
        mb_decoded = np.asarray(mb_decoded, dtype=np.float32).swapaxes(1, 0)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
        mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
        mb_masks = mb_dones[:, :-1]
        mb_dones = mb_dones[:, 1:]
        last_values = self.model.value(self.obs, self.states, self.dones).tolist()
        #discount/bootstrap off value fn
        for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
            rewards = rewards.tolist()
            dones = dones.tolist()
            if dones[-1] == 0:
                rewards = discount_with_dones(rewards+[value], dones+[0], self.gamma)[:-1]
            else:
                rewards = discount_with_dones(rewards, dones, self.gamma)
            mb_rewards[n] = rewards
        mb_rewards = mb_rewards.flatten()
        mb_actions = mb_actions.flatten()
        mb_values = mb_values.flatten()
        mb_decoded = mb_decoded.flatten()
        mb_masks = mb_masks.flatten()
        return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values, mb_decoded
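The rendering branch above relies on matplotlib's interactive mode: plt.ion() keeps figures non-blocking and plt.pause() flushes the GUI event loop so the panels refresh every step. Stripped to its essentials (with a random frame standing in for the observation and the decoder output):

import numpy as np
import matplotlib.pyplot as plt

plt.ion()                             # interactive, non-blocking figures
fig, ax = plt.subplots()
for step in range(20):
    frame = np.random.rand(84, 84)    # stand-in for an observation / decoded image
    ax.clear()
    ax.imshow(frame, cmap='gray', interpolation='nearest')
    plt.pause(0.001)                  # yield to the event loop and redraw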
Example no. 24
0
    def run(self):
        # reset env
        self.obs = np.zeros(self.obs.shape)
        obs = self.env.reset()
        self.update_obs(obs)

        # run env until all threads finish
        episode_over = [-1 for i in range(self.nenv)]
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_u1, mb_u2 = [], [], [], [], [], [], []
        mb_states = self.states
        step = 0
        while not all([e >= 0 for e in episode_over]):
            actions, u1, u2, values, states = self.model.step(self.obs, self.states, self.dones)
            mb_obs.append(np.copy(self.obs))
            mb_actions.append(actions)
            mb_values.append(values)
            mb_dones.append(self.dones)
            mb_u1.append(u1)
            mb_u2.append(u2)
            obs, rewards, dones, _ = self.env.step(actions)
            self.states = states
            self.dones = dones
            for n, done in enumerate(dones):
                if done:
                    self.obs[n] = self.obs[n] * 0
                    if episode_over[n] == -1:
                        episode_over[n] = step
            self.update_obs(obs)
            mb_rewards.append(rewards)
            step += 1

        mb_dones.append(self.dones)
        # batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs).swapaxes(1, 0)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
        mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
        mb_masks = mb_dones[:, :-1]
        mb_dones = mb_dones[:, 1:]
        mb_u1 = np.asarray(mb_u1, dtype=np.float32).swapaxes(1, 0)
        mb_u2 = np.asarray(mb_u2, dtype=np.float32).swapaxes(1, 0)
        # discount/bootstrap off value fn
        _obs, _rewards, _actions, _values, _masks, _u1, _u2 = [], [], [], [], [], [], []
        for n, (obs, rewards, actions, values, dones, masks, u1, u2) in enumerate(zip(mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_masks, mb_u1, mb_u2)):
            # pull out data
            rewards = rewards.tolist()
            self.rewards.append(sum(rewards))
            actions = actions.tolist()
            values = values.tolist()
            dones = dones.tolist()
            masks = masks.tolist()
            u1, u2 = u1.tolist(), u2.tolist()
            # get length of this episode
            episode_length = episode_over[n]+1
            # crop out only played experience
            obs = obs[:episode_length]
            rewards = rewards[:episode_length]
            actions = actions[:episode_length]
            values = values[:episode_length]
            dones = dones[:episode_length]
            u1 = u1[:episode_length]
            u2 = u2[:episode_length]
            assert dones[-1] == True
            masks = masks[:episode_length]
            # discount the rewards
            rewards = discount_with_dones(rewards, dones, self.gamma)
            _obs.extend(obs)
            _rewards.extend(rewards)
            _actions.extend(actions)
            _values.extend(values)
            _masks.extend(masks)
            _u1.extend(u1)
            _u2.extend(u2)
        self.rewards = self.rewards[-100:]
        # make numpy
        mb_obs = np.asarray(_obs)
        mb_rewards = np.asarray(_rewards)
        mb_actions = np.asarray(_actions)
        mb_values = np.asarray(_values)
        mb_masks = np.asarray(_masks)
        mb_u1 = np.asarray(_u1)
        mb_u2 = np.asarray(_u2)
        self._num_rollouts += 1
        self._num_steps += len(rewards) * 4 # FRAME STACK
        ave_r = np.mean(self.rewards)
        #print("Episode {}, Ave R {}".format(self._num_rollouts, ave_r))
        logger.record_tabular("ave_r", ave_r)
        logger.record_tabular("last_r", self.rewards[-1])
        logger.record_tabular("num_rollouts", self._num_rollouts)
        logger.record_tabular("l", len(rewards) * 4)
        #logger.dump_tabular()
        END = False
        #print(self._num_steps, len(rewards))
        #if self._num_steps > 5000000:
        if np.mean(self.rewards) >= 195.:#195.:
            #if self._num_rollouts > 1000:
            logger.record_tabular("finished_in", self._num_rollouts)
            logger.record_tabular("total_steps", self._num_steps)
            logger.dump_tabular()
            END = True
        return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values, mb_u1, mb_u2, END
Example no. 25
0
    def run(self):
        if hasattr(self.model, 'step_env_random'):
            step_env_random = self.model.step_env_random
            sample_normal_op = tf.truncated_normal(shape=[7, 7, 1])
            new_normal_placeholder = tf.placeholder(
                shape=[self.nenvs, 7, 7, 1], dtype=tf.float32)
            assign_normal_op = tf.assign(ref=step_env_random,
                                         value=new_normal_placeholder)


        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
        mb_states = self.states
        dropout_strength_tup = (self.model.DROPOUT_STRENGTH,
                                get_dropout_strength(
                                    self.hparams,
                                    self.model.lr.n + self.nbatch))

        for n in range(self.nsteps):
            actions, values, states = self.model.step(
                self.obs,
                self.states,
                self.dones,
                _dropout_strength=dropout_strength_tup)

            if hasattr(self.model, 'target_model'):
                values = self.model.target_model.value(
                    self.obs,
                    self.states,
                    self.dones,
                    _dropout_strength=dropout_strength_tup).tolist()

            mb_obs.append(np.copy(self.obs))
            mb_actions.append(actions)
            mb_values.append(values)
            mb_dones.append(self.dones)
            obs, rewards, dones, _ = self.env.step(actions)
            self.states = states
            self.dones = dones
            for n, done in enumerate(dones):
                if done:
                    self.obs[n] = self.obs[n] * 0

                    if hasattr(self.model, 'step_env_random'):
                        [cur_rand, new_rand] = self.model.sess.run(
                            [step_env_random, sample_normal_op])
                        cur_rand[n] = new_rand
                        self.model.sess.run(
                            assign_normal_op,
                            feed_dict={new_normal_placeholder: cur_rand})

            self.obs = obs
            mb_rewards.append(rewards)
        mb_dones.append(self.dones)
        #batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(
            self.batch_ob_shape)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
        mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
        mb_masks = mb_dones[:, :-1]
        mb_dones = mb_dones[:, 1:]

        if hasattr(self.model, 'target_model'):
            last_values = self.model.target_model.value(
                self.obs,
                self.states,
                self.dones,
                _dropout_strength=dropout_strength_tup).tolist()
        else:
            last_values = self.model.value(
                self.obs,
                self.states,
                self.dones,
                _dropout_strength=dropout_strength_tup).tolist()
        #discount/bootstrap off value fn
        for n, (rewards, dones,
                value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
            rewards = rewards.tolist()
            dones = dones.tolist()
            if dones[-1] == 0:
                rewards = discount_with_dones(rewards + [value], dones + [0],
                                              self.gamma)[:-1]
            else:
                rewards = discount_with_dones(rewards, dones, self.gamma)
            mb_rewards[n] = rewards
        mb_rewards = mb_rewards.flatten()
        mb_actions = mb_actions.flatten()
        mb_values = mb_values.flatten()
        mb_masks = mb_masks.flatten()
        return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
Example no. 26
0
    def run(self):
        # We initialize the lists that will contain the mb of experiences
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
        mb_states = self.states
        stepdict = {"S": self.states, "M": self.dones}
        if self.dropoutpi != 1.0:
            stepdict["dropoutpi_keep_prob"] = 1.0
        if self.dropoutvf != 1.0:
            stepdict["dropoutvf_keep_prob"] = 1.0
        if self.isbnpitrainmode is not None:
            stepdict["isbnpitrainmode"] = False
        if self.isbnvftrainmode is not None:
            stepdict["isbnvftrainmode"] = False
        for n in range(self.nsteps):
            # Given observations, take action and value (V(s))

            actions, values, states, _ = self.model.step(self.obs, **stepdict)

            # Append the experiences
            mb_obs.append(np.copy(self.obs))
            mb_actions.append(actions)
            mb_values.append(values)
            mb_dones.append(self.dones)

            # Take actions in env and look the results
            obs, rewards, dones, _ = self.env.step(actions)
            self.states = states
            self.dones = dones
            self.obs = obs
            mb_rewards.append(rewards)
        mb_dones.append(self.dones)
        # Batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=self.ob_dtype).swapaxes(
            1, 0).reshape(self.batch_ob_shape)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_actions = np.asarray(
            mb_actions,
            dtype=self.model.train_model.action.dtype.name).swapaxes(1, 0)
        mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
        mb_masks = mb_dones[:, :-1]
        mb_dones = mb_dones[:, 1:]

        if self.gamma > 0.0:
            # Discount/bootstrap off value fn
            last_values = self.model.value(self.obs, **stepdict).tolist()

            for n, (rewards, dones,
                    value) in enumerate(zip(mb_rewards, mb_dones,
                                            last_values)):
                rewards = rewards.tolist()
                dones = dones.tolist()
                if dones[-1] == 0:
                    rewards = discount_with_dones(rewards + [value],
                                                  dones + [0], self.gamma)[:-1]
                else:
                    rewards = discount_with_dones(rewards, dones, self.gamma)

                mb_rewards[n] = rewards

        mb_actions = mb_actions.reshape(self.batch_action_shape)

        mb_rewards = mb_rewards.flatten()
        mb_values = mb_values.flatten()
        mb_masks = mb_masks.flatten()
        return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
Example no. 27
0
    def run(self):
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_chosen_probs = [],[],[],[],[],[]  # Minibatch = mb
        mb_states = self.states
        for n in range(self.nsteps):
            actions, values, aprobs, states = self.model.step(
                self.obs, self.states, self.dones)
            mb_chosen_probs.append(aprobs[range(len(actions)), actions])
            mb_obs.append(np.copy(self.obs))
            mb_actions.append(actions)
            mb_values.append(values)
            mb_dones.append(self.dones)
            obs, rewards, dones, _ = self.env.step(actions)
            self.states = states
            self.dones = dones
            for n, done in enumerate(dones):
                if done:
                    self.obs[n] = self.obs[n] * 0
            self.update_obs(obs)
            mb_rewards.append(rewards)

        mb_dones.append(self.dones)
        #batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(
            self.batch_ob_shape)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_chosen_probs = np.asarray(mb_chosen_probs,
                                     dtype=np.float32).swapaxes(1, 0)
        mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
        mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
        mb_masks = mb_dones[:, :-1]
        mb_dones = mb_dones[:, 1:]
        last_values = self.model.value(self.obs, self.states,
                                       self.dones).tolist()
        #discount/bootstrap off value fn
        for n, (rewards, dones,
                value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
            rewards = rewards.tolist()
            dones = dones.tolist()
            if dones[-1] == 0:
                rewards = discount_with_dones(rewards + [value], dones + [0],
                                              self.gamma)[:-1]
            else:
                rewards = discount_with_dones(rewards, dones, self.gamma)
            mb_rewards[n] = rewards
        mb_rewards = mb_rewards.flatten()
        mb_chosen_probs = mb_chosen_probs.flatten()
        mb_actions = mb_actions.flatten()
        mb_values = mb_values.flatten()
        mb_masks = mb_masks.flatten()

        mb_advs = mb_rewards - mb_values
        corrected_advs = []
        if 'MIS_ADV' in os.environ:
            for adv_i, adv in enumerate(mb_advs):
                if adv < 0:
                    adv_scale = 1. + mb_chosen_probs[adv_i] / (
                        1. - mb_chosen_probs[adv_i])
                    # adv_scale = 1.0  # Disable with minimal code change
                    # adv_scale = min(adv_scale, 5.0)
                    corrected_advs.append(adv * adv_scale)
                else:
                    corrected_advs.append(adv)
            mb_advs = np.array(corrected_advs)

        return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values, mb_advs
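The MIS_ADV branch above rescales only the negative advantages by 1 + p / (1 - p), where p is the probability the policy assigned to the chosen action; algebraically that factor is 1 / (1 - p), so rare actions are barely touched while near-certain actions that turned out badly are penalised much harder. A vectorised sketch of the same correction:

import numpy as np

def scale_negative_advantages(advs, chosen_probs):
    # adv_scale = 1 + p / (1 - p) = 1 / (1 - p), applied to negative advantages only.
    advs = np.asarray(advs, dtype=np.float32)
    chosen_probs = np.asarray(chosen_probs, dtype=np.float32)
    scale = 1.0 / (1.0 - chosen_probs)
    return np.where(advs < 0, advs * scale, advs)

# e.g. an advantage of -1 with p = 0.9 becomes -10, while p = 0.1 gives roughly -1.11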
Example no. 28
0
  def run(self):
    mb_obs, mb_td_targets, mb_base_actions, \
    mb_xy0, mb_xy1, \
    mb_values, mb_dones \
      = [], [], [], [], [], [], []

    mb_states = self.states
    for n in range(self.nsteps):
      # pi, pi2, x1, y1, x2, y2, v0
      pi1, pi_xy0, pi_xy1, values, states = self.model.step(
          self.obs, self.states, self.dones)

      pi1_noise = np.random.random_sample((self.nenv, 3)) * 0.3

      base_actions = np.argmax(
          pi1 * self.base_act_mask + pi1_noise, axis=1)
      xy0 = np.argmax(pi_xy0, axis=1)

      x0 = (xy0 % 32).astype(int)
      y0 = (xy0 / 32).astype(int)

      xy1 = np.argmax(pi_xy1, axis=1)
      x1 = (xy1 % 32).astype(int)
      y1 = (xy1 / 32).astype(int)

      # Scripted Agent Hacking

      for env_num in range(self.nenv):
        if env_num >= self.nscripts:  # only for scripted agents
          continue

        ob = self.obs[env_num, :, :, :]
        player_relative = ob[:, :, -1]

        self.group_list[env_num] = common.update_group_list2(
            self.control_groups[env_num])

        if len(self.action_queue[env_num]) == 0:

          self.action_queue[env_num], self.group_id[env_num], self.dest_per_marine[env_num], self.xy_per_marine[env_num] = \
            common.solve_tsp(player_relative,
                             self.selected[env_num][0],
                             self.group_list[env_num],
                             self.group_id[env_num],
                             self.dest_per_marine[env_num],
                             self.xy_per_marine[env_num])

        base_actions[env_num] = 0
        x0[env_num] = 0
        y0[env_num] = 0
        x1[env_num] = 0
        y1[env_num] = 0

        if len(self.action_queue[env_num]) > 0:
          action = self.action_queue[env_num].pop(0)
          base_actions[env_num] = action.get("base_action", 0)

          x0[env_num] = action.get("x0", 0)
          y0[env_num] = action.get("y0", 0)
          xy0[env_num] = y0[env_num] * 32 + x0[env_num]

          x1[env_num] = action.get("x1", 0)
          y1[env_num] = action.get("y1", 0)
          xy1[env_num] = y1[env_num] * 32 + x1[env_num]

      base_actions = self.valid_base_action(base_actions)
      new_base_actions = self.trans_base_actions(base_actions)

      base_action_spec = self.env.action_spec(new_base_actions)
      # print("base_actions:", base_actions)
      actions = self.construct_action(
          base_actions,
          base_action_spec,
          x0,
          y0,
          x1,
          y1
      )

      mb_obs.append(np.copy(self.obs))
      mb_base_actions.append(base_actions)

      mb_xy0.append(xy0)
      mb_xy1.append(xy1)
      mb_values.append(values)
      mb_dones.append(self.dones)

      #print("final acitons : ", actions)
      obs, rewards, dones,\
      available_actions, army_counts,\
      control_groups, selected, xy_per_marine\
      = self.env.step(
          actions=actions)
      self.army_counts = army_counts
      self.control_groups = control_groups
      self.selected = selected
      for env_num, data in enumerate(xy_per_marine):
        self.xy_per_marine[env_num] = data
      self.update_available(available_actions)

      self.states = states
      self.dones = dones
      mean_100ep_reward_a2c = 0
      for n, done in enumerate(dones):
        self.total_reward[n] += float(rewards[n])
        if done:
          self.obs[n] = self.obs[n] * 0
          self.episodes += 1
          num_episodes = self.episodes
          self.episode_rewards.append(self.total_reward[n])

          model = self.model
          mean_100ep_reward = round(
              np.mean(self.episode_rewards[-101:]), 1)
          if (n < self.nscripts):  # scripted agents
            self.episode_rewards_script.append(
                self.total_reward[n])
            mean_100ep_reward_script = round(
                np.mean(self.episode_rewards_script[-101:]), 1)
            nsml.report(
                reward_script=self.total_reward[n],
                mean_reward_script=mean_100ep_reward_script,
                reward=self.total_reward[n],
                mean_100ep_reward=mean_100ep_reward,
                episodes=self.episodes,
                step=self.episodes,
                scope=locals()
            )
          else:
            self.episode_rewards_a2c.append(self.total_reward[n])
            mean_100ep_reward_a2c = round(
                np.mean(self.episode_rewards_a2c[-101:]), 1)
            nsml.report(
                reward_a2c=self.total_reward[n],
                mean_reward_a2c=mean_100ep_reward_a2c,
                reward=self.total_reward[n],
                mean_100ep_reward=mean_100ep_reward,
                episodes=self.episodes,
                step=self.episodes,
                scope=locals()
            )
            print("mean_100ep_reward_a2c", mean_100ep_reward_a2c)

          if self.callback is not None:
            self.callback(locals(), globals())
          self.total_reward[n] = 0
          self.group_list[n] = []


      self.update_obs(obs)
      mb_td_targets.append(rewards)
    mb_dones.append(self.dones)
    #batch of steps to batch of rollouts
    mb_obs = np.asarray(
        mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(
            self.batch_ob_shape)
    mb_td_targets = np.asarray(mb_td_targets, dtype=np.float32).swapaxes(1, 0)
    mb_base_actions = np.asarray(
        mb_base_actions, dtype=np.int32).swapaxes(1, 0)

    mb_xy0 = np.asarray(mb_xy0, dtype=np.int32).swapaxes(1, 0)
    mb_xy1 = np.asarray(mb_xy1, dtype=np.int32).swapaxes(1, 0)

    mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
    mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
    mb_masks = mb_dones[:, :-1]
    mb_dones = mb_dones[:, 1:]
    last_values = self.model.value(self.obs, self.states,
                                   self.dones).tolist()
    #discount/bootstrap off value fn
    for n, (rewards, dones, value) in enumerate(
        zip(mb_td_targets, mb_dones, last_values)):
      rewards = rewards.tolist()
      dones = dones.tolist()
      if dones[-1] == 0:
        rewards = discount_with_dones(rewards + [value], dones + [0],
                                      self.gamma)[:-1]
      else:
        rewards = discount_with_dones(rewards, dones, self.gamma)
      mb_td_targets[n] = rewards
    mb_td_targets = mb_td_targets.flatten()
    mb_base_actions = mb_base_actions.flatten()
    mb_xy0 = mb_xy0.flatten()
    mb_xy1 = mb_xy1.flatten()

    mb_values = mb_values.flatten()
    mb_masks = mb_masks.flatten()
    return mb_obs, mb_states, mb_td_targets, mb_masks, \
           mb_base_actions, mb_xy0, mb_xy1, mb_values
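Both spatial heads in the runner above emit a single index into a 32x32 screen, which is unpacked with x = xy % 32 and y = xy // 32 (the snippet's (xy / 32).astype(int) is equivalent for non-negative indices) and re-packed as xy = y * 32 + x when the scripted agent overrides an action. A tiny round-trip check:

import numpy as np

xy = np.array([0, 31, 32, 1023])       # flat indices into a 32x32 grid
x, y = xy % 32, xy // 32               # x: [0, 31, 0, 31], y: [0, 0, 1, 31]
assert np.array_equal(y * 32 + x, xy)  # re-packing recovers the flat index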
Example no. 29
0
    def run(self, update):
        # We initialize the lists that will contain the mb of experiences
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_pos, mb_nm, mb_nm_xy = [],[],[],[],[],[],[],[]
        epinfos = []
        for n in range(self.nsteps):
            # Given observations, take action and value (V(s))
            # We already have self.obs because Runner superclass run self.obs[:] = env.reset() on init

            # Prepare nm_xy
            for i in range(self.neural_map.shape[0]):
                if self.use_extended_write_op:
                    self.neural_map_xy[i, 0, :] = self.neural_map[
                        i,
                        int(self.pos[i, 1] // self.pos_y_divisor),
                        int(self.pos[i, 0] // self.pos_x_divisor), :]

                    if self.pos[i, 2] == 0:
                        self.neural_map_xy[i, 1, :] = self.neural_map[
                            i,
                            int(self.pos[i, 1] // self.pos_y_divisor) + 1,
                            int(self.pos[i, 0] // self.pos_x_divisor), :]
                    elif self.pos[i, 2] == 1:
                        self.neural_map_xy[i, 1, :] = self.neural_map[
                            i,
                            int(self.pos[i, 1] // self.pos_y_divisor),
                            int(self.pos[i, 0] // self.pos_x_divisor) + 1, :]
                    elif self.pos[i, 2] == 2:
                        self.neural_map_xy[i, 1, :] = self.neural_map[
                            i,
                            int(self.pos[i, 1] // self.pos_y_divisor) - 1,
                            int(self.pos[i, 0] // self.pos_x_divisor), :]
                    elif self.pos[i, 2] == 3:
                        self.neural_map_xy[i, 1, :] = self.neural_map[
                            i,
                            int(self.pos[i, 1] // self.pos_y_divisor),
                            int(self.pos[i, 0] // self.pos_x_divisor) - 1, :]

                else:
                    self.neural_map_xy[i, :] = self.neural_map[
                        i,
                        int(self.pos[i, 1] // self.pos_y_divisor),
                        int(self.pos[i, 0] // self.pos_x_divisor), :]

            actions, values, write_vector, _ = self.model.step(
                self.obs, S=self.neural_map, M=self.neural_map_xy)

            # Append the experiences
            mb_obs.append(np.copy(self.obs))
            mb_actions.append(actions)
            mb_values.append(values)
            mb_dones.append(self.dones)

            mb_pos.append(self.pos.copy())
            mb_nm.append(self.neural_map.copy())
            mb_nm_xy.append(self.neural_map_xy.copy())

            # Update neural map with write vector
            for i in range(self.neural_map.shape[0]):
                if self.use_extended_write_op:
                    self.neural_map[i,
                                    int(self.pos[i, 1] // self.pos_y_divisor),
                                    int(self.pos[i, 0] //
                                        self.pos_x_divisor), :] = write_vector[
                                            i, 0, :]

                    if self.pos[i, 2] == 0:
                        self.neural_map[
                            i,
                            int(self.pos[i, 1] // self.pos_y_divisor) + 1,
                            int(self.pos[i, 0] //
                                self.pos_x_divisor), :] = write_vector[i, 1, :]
                    elif self.pos[i, 2] == 1:
                        self.neural_map[i,
                                        int(self.pos[i, 1] //
                                            self.pos_y_divisor),
                                        int(self.pos[i, 0] //
                                            self.pos_x_divisor) +
                                        1, :] = write_vector[i, 1, :]
                    elif self.pos[i, 2] == 2:
                        self.neural_map[
                            i,
                            int(self.pos[i, 1] // self.pos_y_divisor) - 1,
                            int(self.pos[i, 0] //
                                self.pos_x_divisor), :] = write_vector[i, 1, :]
                    elif self.pos[i, 2] == 3:
                        self.neural_map[i,
                                        int(self.pos[i, 1] //
                                            self.pos_y_divisor),
                                        int(self.pos[i, 0] //
                                            self.pos_x_divisor) -
                                        1, :] = write_vector[i, 1, :]

                else:
                    self.neural_map[i,
                                    int(self.pos[i, 1] // self.pos_y_divisor),
                                    int(self.pos[i, 0] //
                                        self.pos_x_divisor), :] = write_vector[
                                            i, :]

            # Take actions in env and look the results
            # Infos contains a ton of useful informations
            tmp, rewards, dones, infos = self.env.step(actions)
            obs = tmp[:, :-3]
            self.pos = tmp[:, -3:]

            #if (update > 120000) and (update <= 125000):
            step_offset = (update - 1) * self.nsteps
            #ex.log_scalar('neural_map', mb_nm[-1].tolist(), step_offset+n)
            #ex.log_scalar('obs', mb_obs[-1].tolist(), step_offset+n)
            #ex.log_scalar('pos', mb_pos[-1].tolist(), step_offset+n)
            #ex.log_scalar('action', actions.tolist(), step_offset+n)
            #ex.log_scalar('reward', rewards.tolist(), step_offset+n)
            #ex.log_scalar('done', dones.tolist(), step_offset+n)

            #if 'goal_positions' in infos[0]:
            #ex.log_scalar('ep_goal_positions', infos[0]['goal_positions'], step_offset+n)

            if 'episode' in infos[0]:
                ex.log_scalar('ep_length', infos[0]['episode']['l'],
                              step_offset + n)
                ex.log_scalar('ep_reward', infos[0]['episode']['r'],
                              step_offset + n)

            for info in infos:
                maybeepinfo = info.get('episode')
                if maybeepinfo: epinfos.append(maybeepinfo)

            self.dones = dones
            self.obs = obs
            mb_rewards.append(rewards)
        mb_dones.append(self.dones)

        # Batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=self.ob_dtype).swapaxes(
            1, 0).reshape(self.batch_ob_shape)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_actions = np.asarray(
            mb_actions,
            dtype=self.model.train_model.action.dtype.name).swapaxes(1, 0)
        mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
        mb_masks = mb_dones[:, :-1]
        mb_dones = mb_dones[:, 1:]
        mb_pos = np.asarray(mb_pos, dtype=self.pos.dtype)
        mb_pos = mb_pos.swapaxes(0,
                                 1).reshape(mb_pos.shape[0] * mb_pos.shape[1],
                                            mb_pos.shape[2])
        mb_nm = np.asarray(mb_nm, dtype=self.neural_map.dtype)
        mb_nm = mb_nm.swapaxes(0, 1).reshape(mb_nm.shape[0] * mb_nm.shape[1],
                                             *mb_nm.shape[2:])
        mb_nm_xy = np.asarray(mb_nm_xy, dtype=self.neural_map_xy.dtype)
        mb_nm_xy = mb_nm_xy.swapaxes(0, 1).reshape(
            mb_nm_xy.shape[0] * mb_nm_xy.shape[1], *mb_nm_xy.shape[2:])
        if self.gamma > 0.0:
            # Discount/bootstrap off value fn

            # Prepare nm_xy
            for i in range(self.neural_map.shape[0]):
                if self.use_extended_write_op:
                    self.neural_map_xy[i, 0, :] = self.neural_map[
                        i,
                        int(self.pos[i, 1] // self.pos_y_divisor),
                        int(self.pos[i, 0] // self.pos_x_divisor), :]

                    if self.pos[i, 2] == 0:
                        self.neural_map_xy[i, 1, :] = self.neural_map[
                            i,
                            int(self.pos[i, 1] // self.pos_y_divisor) + 1,
                            int(self.pos[i, 0] // self.pos_x_divisor), :]
                    elif self.pos[i, 2] == 1:
                        self.neural_map_xy[i, 1, :] = self.neural_map[
                            i,
                            int(self.pos[i, 1] // self.pos_y_divisor),
                            int(self.pos[i, 0] // self.pos_x_divisor) + 1, :]
                    elif self.pos[i, 2] == 2:
                        self.neural_map_xy[i, 1, :] = self.neural_map[
                            i,
                            int(self.pos[i, 1] // self.pos_y_divisor) - 1,
                            int(self.pos[i, 0] // self.pos_x_divisor), :]
                    elif self.pos[i, 2] == 3:
                        self.neural_map_xy[i, 1, :] = self.neural_map[
                            i,
                            int(self.pos[i, 1] // self.pos_y_divisor),
                            int(self.pos[i, 0] // self.pos_x_divisor) - 1, :]

                else:
                    self.neural_map_xy[i, :] = self.neural_map[
                        i,
                        int(self.pos[i, 1] // self.pos_y_divisor),
                        int(self.pos[i, 0] // self.pos_x_divisor), :]

            last_values = self.model.value(self.obs,
                                           S=self.neural_map,
                                           M=self.neural_map_xy).tolist()

            for n, (rewards, dones,
                    value) in enumerate(zip(mb_rewards, mb_dones,
                                            last_values)):
                rewards = rewards.tolist()
                dones = dones.tolist()
                if dones[-1] == 0:
                    rewards = discount_with_dones(rewards + [value],
                                                  dones + [0], self.gamma)[:-1]
                else:
                    rewards = discount_with_dones(rewards, dones, self.gamma)

                mb_rewards[n] = rewards

        mb_actions = mb_actions.reshape(self.batch_action_shape)

        mb_rewards = mb_rewards.flatten()
        mb_values = mb_values.flatten()
        mb_masks = mb_masks.flatten()

        return mb_obs, mb_nm, mb_rewards, mb_nm_xy, mb_actions, mb_values, mb_pos, epinfos
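The long if/elif ladders above read and write the neural-map cell under the agent plus one neighbouring cell selected by its heading (self.pos[i, 2]). The same bookkeeping can be written with a heading-to-offset table; a compact sketch of that idea (a simplification for illustration, not the repository's code):

# heading -> (row offset, column offset) of the second cell used by the extended op
HEADING_OFFSETS = {0: (1, 0), 1: (0, 1), 2: (-1, 0), 3: (0, -1)}

def map_cells(pos, y_div, x_div):
    # pos = (x, y, heading); returns the cell under the agent and its facing neighbour.
    row, col = int(pos[1] // y_div), int(pos[0] // x_div)
    d_row, d_col = HEADING_OFFSETS[int(pos[2])]
    return (row, col), (row + d_row, col + d_col)

# map_cells((10.0, 4.0, 1), y_div=2.0, x_div=2.0) -> ((2, 5), (2, 6))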
Example no. 30
0
 def run(self):
     mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
     mb_states = self.states
     for n in range(self.nsteps):
         actions, values, states = self.model.step(
             self.obs, self.states, self.dones
         )  # Comments by Fei: step_model (nstep = 1)! nenv, nenv, nenv * 2nlstm
         mb_obs.append(
             np.copy(self.obs)
         )  # Comments by Fei: finally will be nsteps * nenv * nh * nw * (nc*nstack)
         mb_actions.append(
             actions)  # Comments by Fei: finally will be nsteps * nenv
         mb_values.append(
             values)  # Comments by Fei: finally will be nsteps * nenv
         mb_dones.append(self.dones)
         obs, rewards, dones, _ = self.env.step(
             actions)  # Comments by Fei: nenv * nh * nw * 1, nenv, nenv
         self.states = states
         self.dones = dones
         for n, done in enumerate(dones):
             if done:
                 self.obs[n] = self.obs[n] * 0
         self.update_obs(obs)
         mb_rewards.append(
             rewards)  # Comments by Fei: finally will be nsteps * nenv
     mb_dones.append(
         self.dones)  # Comments by Fei: finally will be (nsteps+1) * nenv
     #batch of steps to batch of rollouts
     mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(
         self.batch_ob_shape
     )  # Comments by Fei: (nenv*nsteps, nh, nw, nc*nstack)
     mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(
         1, 0)  # Comments by Fei: nenv * nsteps
     mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(
         1, 0)  # Comments by Fei: nenv * nsteps
     mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(
         1, 0)  # Comments by Fei: nenv * nsteps
     mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(
         1, 0)  # Comments by Fei: nenv * (nsteps+1)
     mb_masks = mb_dones[:, :
                         -1]  # Comments by Fei: masks is nenv * nsteps (missing the last done)
     mb_dones = mb_dones[:,
                         1:]  # Comments by Fei: dones is nenv * nsteps (missing the first done)
     last_values = self.model.value(
         self.obs, self.states, self.dones).tolist(
         )  # Comments by Fei: step_model (nstep = 1)! nenv vector
     #discount/bootstrap off value fn
     for n, (rewards, dones, value) in enumerate(
             zip(mb_rewards, mb_dones,
                 last_values)):  # Comments by Fei: nenv | nsteps, nsteps, 1
         rewards = rewards.tolist()
         dones = dones.tolist()
         if dones[-1] == 0:
             rewards = discount_with_dones(rewards + [value], dones + [0],
                                           self.gamma)[:-1]
         else:
             rewards = discount_with_dones(rewards, dones, self.gamma)
         mb_rewards[n] = rewards
     mb_rewards = mb_rewards.flatten()  # Comments by Fei: nbatch vector now
     mb_actions = mb_actions.flatten()  # Comments by Fei: nbatch vector now
     mb_values = mb_values.flatten()  # Comments by Fei: nbatch vector now
     mb_masks = mb_masks.flatten()  # Comments by Fei: nbatch vector now
     return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
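As the inline comments spell out, every runner collects arrays shaped (nsteps, nenv), swaps the first two axes so each row holds one environment's trajectory, and finally flattens to an nbatch = nenv * nsteps vector (observations are reshaped to batch_ob_shape instead). A toy check of that shape transform:

import numpy as np

nsteps, nenv = 5, 3
mb_rewards = np.arange(nsteps * nenv, dtype=np.float32).reshape(nsteps, nenv)
per_env = mb_rewards.swapaxes(1, 0)   # (nenv, nsteps): row n is environment n's rollout
flat = per_env.flatten()              # nbatch = nenv * nsteps values, grouped by environment
print(per_env.shape, flat.shape)      # (3, 5) (15,)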
Example no. 31
0
    def run(
            self,
            *,
            # EPOpt specific - could go in __init__ but epsilon is callable
            paths,
            epsilon):
        """Instead of doing a trajectory of nsteps (ie, "horizon"), do a
        sample N "paths" and then return the bottom epsilon-percentile
        """
        # FIXME(cpacker): currently only works with single-threading
        assert (self.env.num_envs == 1)

        # Store all N trajectories sampled then return data of bottom-epsilon
        # lists -> lists of lists
        n_mb_obs = [[] for _ in range(paths)]
        n_mb_rewards = [[] for _ in range(paths)]
        n_mb_actions = [[] for _ in range(paths)]
        n_mb_values = [[] for _ in range(paths)]
        n_mb_dones = [[] for _ in range(paths)]
        num_episodes = 0
        mb_states = self.states
        for N in range(paths):

            mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = n_mb_obs[
                N], n_mb_rewards[N], n_mb_actions[N], n_mb_values[
                    N], n_mb_dones[N]
            for _ in range(self.env.venv.envs[0].spec.max_episode_steps):
                actions, values, states, _ = self.model.step(
                    self.obs, self.states, self.dones)
                mb_obs.append(np.copy(self.obs))
                mb_actions.append(actions)
                mb_values.append(values)
                mb_dones.append(self.dones)
                obs, rewards, dones, _ = self.env.step(actions)
                self.states = states
                self.dones = dones
                for i, done in enumerate(dones):
                    if done:
                        self.obs[i] = self.obs[i] * 0
                self.obs = obs
                mb_rewards.append(rewards)
                # We only want to do one episode
                if self.dones:
                    break
            mb_dones.append(self.dones)

        # Compute the worst epsilon paths and concatenate them
        episode_returns = [sum(r) for r in n_mb_rewards]
        cutoff = np.percentile(episode_returns, 100 * epsilon)

        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
        for N in range(paths):
            #if n_mb_rewards[N] <= cutoff:
            if episode_returns[N] <= cutoff:
                # only count the episodes that are returned
                num_episodes += 1
                # "cache" values to keep track of final ones
                next_obs = n_mb_obs[N]
                next_rewards = n_mb_rewards[N]
                next_actions = n_mb_actions[N]
                next_values = n_mb_values[N]
                next_dones = n_mb_dones[N]
                # concatenate
                mb_obs.extend(next_obs)
                mb_rewards.extend(next_rewards)
                mb_actions.extend(next_actions)
                mb_values.extend(next_values)
                # when constructing mb_dones, only append
                # next_dones[:-1] except for the last episode
                mb_dones.extend(next_dones[:-1])
        mb_dones.append(next_dones[-1])

        # batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=np.float32).swapaxes(1, 0).squeeze()
        #mb_obs = np.asarray(mb_obs, dtype=np.float32).swapaxes(1,0).reshape(self.batch_ob_shape)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        if self.discrete:
            mb_actions = np.asarray(mb_actions, dtype=np.int).swapaxes(1, 0)
        else:
            mb_actions = np.asarray(mb_actions,
                                    dtype=np.float32).swapaxes(1, 0)
        mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
        mb_masks = mb_dones[:, :-1]
        mb_dones = mb_dones[:, 1:]

        # We can't just use self.obs etc, because the last of the N paths
        # may not be included in the update
        last_values = self.model.value(self.obs, self.states,
                                       self.dones).tolist()
        # last_values = self.model.value(next_obs[-1], n_last_states[-1], next_dones[-1]).tolist()

        # discount/bootstrap off value fn
        for n, (rewards, dones,
                value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
            rewards = rewards.tolist()
            dones = dones.tolist()
            if dones[-1] == 0:
                rewards = discount_with_dones(rewards + [value], dones + [0],
                                              self.gamma)[:-1]
            else:
                rewards = discount_with_dones(rewards, dones, self.gamma)
            mb_rewards[n] = rewards
        mb_rewards = mb_rewards.flatten()
        if self.discrete:
            mb_actions = mb_actions.reshape(mb_rewards.shape)
        else:
            mb_actions = mb_actions.reshape(
                (mb_rewards.shape[0], self.ac_space.shape[0]))
        mb_values = mb_values.flatten()
        mb_masks = mb_masks.flatten()
        return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values, num_episodes
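The EPOpt-specific piece of the runner above is the selection step: after sampling paths full episodes, only those whose total return falls at or below the epsilon-percentile are kept, so the update is driven by the worst-case trajectories. The selection itself reduces to a couple of NumPy calls:

import numpy as np

def bottom_epsilon_indices(episode_returns, epsilon):
    # Indices of the episodes in the bottom epsilon fraction by total return.
    returns = np.asarray(episode_returns, dtype=np.float32)
    cutoff = np.percentile(returns, 100 * epsilon)
    return np.flatnonzero(returns <= cutoff)

# e.g. returns [10, 3, 7, 1] with epsilon = 0.5 keeps episodes 1 and 3 (returns 3 and 1)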
Example no. 32
0
 def run(self):
     mb_obs, prev_rewards, mb_rewards, prev_actions, mb_actions, mb_values, mb_dones, mb_masks = [], [], [], [], [], [], [], []
     mb_states = self.states
     for n in range(self.nsteps):
         actions, values, states, _ = self.model.step(
             self.obs, self.states, self.actions, self.rewards, self.dones,
             self.masks)
         mb_obs.append(np.copy(self.obs))
         prev_actions.append(self.actions)
         mb_actions.append(actions)
         mb_values.append(values)
         prev_rewards.append(self.rewards)
         mb_masks.append(self.masks)
         mb_dones.append(self.dones)
         # if end_of_trial, if episode gets done in the next step, we need to reset environment parameters
         end_of_trial = [
             self.episode_in_trial[i] == (self.episodes_per_trial - 1)
             for i in range(self.nenv)
         ]
         obs, rewards, dones, _ = self.env.step(actions, end_of_trial)
         self.actions = actions
         self.states = states
         self.dones = dones
         self.masks = [False for _ in range(self.nenv)]
         self.rewards = rewards
         self.obs[:] = obs
         mb_rewards.append(rewards)
         for i, done in enumerate(self.dones):
             if done:
                 self.episode_in_trial[
                     i] += 1  # episode finished in the current step
                 self.episode_in_trial[i] %= self.episodes_per_trial
                 if self.episode_in_trial[i] == 0:
                     self.masks[i] = True
                     self.rewards[i] = 0.0
                     self.dones[i] = False
                     if self.discrete:
                         self.actions[i] = -1
                     else:
                         self.actions[i] = np.zeros(
                             (self.ac_space.shape[0]), dtype=np.float32)
     mb_masks.append(self.masks)  # nsteps+1 records
     # batch of steps to batch of rollouts
     mb_obs = np.asarray(mb_obs, dtype=np.float32).swapaxes(1, 0).reshape(
         self.batch_ob_shape)
     prev_rewards = np.asarray(prev_rewards,
                               dtype=np.float32).swapaxes(1, 0)
     mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
     if self.discrete:
         prev_actions = np.asarray(prev_actions,
                                   dtype=np.int).swapaxes(1, 0)
         mb_actions = np.asarray(mb_actions, dtype=np.int).swapaxes(1, 0)
     else:
         prev_actions = np.asarray(prev_actions,
                                   dtype=np.float32).swapaxes(1, 0)
         mb_actions = np.asarray(mb_actions,
                                 dtype=np.float32).swapaxes(1, 0)
     mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
     mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
     mb_masks = np.asarray(mb_masks, dtype=np.bool).swapaxes(1, 0)
     last_values = self.model.value(self.obs, self.states, self.actions,
                                    self.rewards, self.dones,
                                    self.masks).tolist()
     num_trials = np.sum(mb_masks[:, 1:])
     # discount/bootstrap off value fn
     discounted_rewards = []
     for n, (rewards, masks, value) in enumerate(
             zip(mb_rewards, mb_masks[:, 1:], last_values)):
         rewards = rewards.tolist()
         masks = masks.tolist()
         if masks[-1] == 0:
             rewards = discount_with_dones(rewards + [value], masks + [0],
                                           self.gamma)[:-1]
         else:
             rewards = discount_with_dones(rewards, masks, self.gamma)
         discounted_rewards.append(rewards)
     discounted_rewards = np.asarray(discounted_rewards, dtype=np.float32)
     prev_rewards = prev_rewards.flatten()
     discounted_rewards = discounted_rewards.flatten()
     prev_actions = prev_actions.reshape(self.batch_ac_shape)
     mb_actions = mb_actions.reshape(self.batch_ac_shape)
     mb_values = mb_values.flatten()
     mb_masks = mb_masks[:, :-1]
     mb_masks = mb_masks.flatten()
     mb_dones = mb_dones.flatten()
     return mb_obs, mb_states, discounted_rewards, prev_rewards, mb_masks, prev_actions, mb_actions, mb_values, mb_dones, num_trials
Example no. 33
0
    def run(self):
        # We initialize the lists that will contain the mb of experiences
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [], [], [], [], []
        # mb_test = []
        mb_states = self.states
        epinfos = []
        for n in range(self.nsteps):
            # Given observations, take action and value (V(s))
            # We already have self.obs because Runner superclass run self.obs[:] = env.reset() on init
            obs_chunks = []
            dones_chunks = []
            # l = 2 * self.m * (self.m - 1)
            # mb_test.append(list(range(n * l, (n + 1) * l)))
            for i in range(self.m):
                obs_tmp = self.obs[self.models_indexes[i]]
                dones_tmp = np.array(self.dones)[self.models_indexes[i]]
                obs_chunks.append(obs_tmp)
                dones_chunks.append(dones_tmp)

            models_results = []
            for i, model in enumerate(self.models):
                res = model.step(obs_chunks[i], S=[None], M=dones_chunks[i])
                models_results.append(res)
            # models_results = self.tp.map(self.model_step, zip(self.models, obs_chunks, [None]*self.m, dones_chunks))
            actions, values, states, _ = zip(*models_results)

            actions_to_send = np.zeros(shape=(2 * self.m * (self.m - 1)))
            values_to_send = np.zeros(shape=(2 * self.m * (self.m - 1)))
            for i in range(self.m):
                actions_to_send[self.models_indexes[i]] = actions[i]
                values_to_send[self.models_indexes[i]] = values[i]
            states = np.squeeze(states)

            # Append the experiences
            mb_obs.append(np.copy(self.obs))
            mb_actions.append(actions_to_send)
            mb_values.append(values_to_send)
            mb_dones.append(self.dones)

            # Take actions in env and look the results
            obs, rewards, dones, infos = self.env.step(actions_to_send)
            result_scores = [info['score'] for info in infos[::2]]
            self.process_winners(result_scores)
            # for info in infos:
            #     maybeepinfo = info.get('episode')
            #     if maybeepinfo: epinfos.append(maybeepinfo)
            self.states = states
            self.dones = dones
            self.obs = obs
            mb_rewards.append(rewards)
        mb_dones.append(self.dones)

        # Batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=self.ob_dtype).swapaxes(1, 0).reshape(self.batch_ob_shape)
        # mb_obs = np.concatenate(mb_obs)  # TODO: I changed this
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_actions = np.asarray(mb_actions, dtype=self.model.train_model.action.dtype.name).swapaxes(1, 0)
        # mb_test = np.asarray(mb_test, dtype=self.model.train_model.action.dtype.name).swapaxes(1, 0)
        mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
        mb_masks = mb_dones[:, :-1]
        mb_dones = mb_dones[:, 1:]

        if self.gamma > 0.0:
            # Discount/bootstrap off value fn

            # obs_chunks = []
            # dones_chunks = []
            last_values = []
            for i, model in enumerate(self.models):
                obs_tmp = self.obs[self.models_indexes[i]]
                dones_tmp = np.array(self.dones)[self.models_indexes[i]]
                # obs_chunks.append(obs_tmp)
                # dones_chunks.append(dones_tmp)
                last_values_tmp = model.value(obs_tmp, S=[None], M=dones_tmp).tolist()
                last_values.append(last_values_tmp)
            # last_values = self.tp.map(self.model_value, zip(self.models, obs_chunks, [None]*self.m, dones_chunks))
            # actions, values, states, _ = zip(*models_results)
            last_values_to_send = np.zeros(shape=(2 * self.m * (self.m - 1)))
            for i in range(self.m):
                last_values_to_send[self.models_indexes[i]] = last_values[i]
            last_values = last_values_to_send.tolist()
            # last_values = self.model.value(self.obs, S=self.states, M=self.dones).tolist()

            for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
                rewards = rewards.tolist()
                dones = dones.tolist()
                if dones[-1] == 0:
                    rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1]
                else:
                    rewards = discount_with_dones(rewards, dones, self.gamma)

                mb_rewards[n] = rewards

        mb_actions = mb_actions.reshape(self.batch_action_shape)
        # mb_actions = mb_actions.T.flatten()  # TODO: I changed this
        # mb_test = mb_test.T.flatten()  # TODO: I changed this

        mb_rewards = mb_rewards.flatten()
        # mb_rewards = mb_rewards.T.flatten()  # TODO: I changed this
        mb_values = mb_values.flatten()
        mb_masks = mb_masks.flatten()
        return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values, epinfos