Example #1
 def request_output(self, obs):
     # obs must match the data structure defined in self.ds
     data = self.ds.flatten(obs)
     if self._compress:
         data = TensorZipper.compress(data)
     else:
         data = pickle.dumps(data)
     return self._request(data)
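
For completeness, the inference server presumably has to undo this packing before it can evaluate the observation. A minimal sketch of that inverse step, assuming TensorZipper exposes a decompress counterpart to compress and that self.ds.structure is the inverse of flatten (it is used that way in Examples #4 and #5); the method name _parse_request is hypothetical:

    def _parse_request(self, raw_data):
        # Reverse of request_output's packing: recover the flat tensor list,
        # then rebuild the nested observation via the shared data structure.
        # Assumes the same pickle / TensorZipper imports as the excerpt above.
        if self._compress:
            flat = TensorZipper.decompress(raw_data)
        else:
            flat = pickle.loads(raw_data)
        return self.ds.structure(flat)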
Example #2
 def run(self):
     self.replay_task = self._data_pool_apis.request_replay_task()
     while self.replay_task != "":
         game_version = self.replay_task.game_version or self._game_version
         self._adapt_system(game_version)
         if game_version != self._game_version:
             # need to re-initialize the replay converter
             self._game_version = game_version
             self.converter_config['game_version'] = game_version
             self._replay_converter = self.replay_converter_type(
                 **self.converter_config)
         game_core_config = self.converter_config.get('game_core_config', {})
         extractor = ReplayExtractor(
             replay_dir=self._replay_dir,
             replay_filename=self.replay_task.replay_name,
             player_id=self.replay_task.player_id,
             replay_converter=self._replay_converter,
             step_mul=self._step_mul,
             version=game_version,
             game_core_config=game_core_config,
             da_rate=self._da_rate,
             unk_mmr_dft_to=self._unk_mmr_dft_to)
         self._steps = 0
         first_frame = True
         if self._use_policy:
             self.agent.reset()
             self._update_agent_model()
         for frame in extractor.extract():
             if self._post_process_data:
                 obs, act = self._post_process_data(*frame[0])
             else:
                 obs, act = frame[0]
             if self._use_policy:
                 data = (obs, act, self.agent.state,
                         np.array(first_frame, np.bool_))
                 self.agent.update_state(obs)
                 first_frame = False
             else:
                 data = (obs, act)
             data = self.ds.flatten(self.ds.structure(data))
             if self._data_queue.full():
                 logger.log("Actor's queue is full.", level=logger.WARN)
             self._data_queue.put((TensorZipper.compress(data), frame[1]))
             logger.log('successfully put one tuple.', level=logger.DEBUG)
             self._steps += 1
             if self._steps % self._log_interval == 0:
                 logger.log(
                     "%d frames of replay task [%s] sent to learner." %
                     (self._steps, self.replay_task))
             if self._use_policy and self._steps % self._update_model_freq == 0:
                 self._update_agent_model()
         logger.log("Replay task [%s] done. %d frames sent to learner." %
                    (self.replay_task, self._steps))
         self.replay_task = self._data_pool_apis.request_replay_task()
     logger.log("All tasks done.")
Example #3
 def request_output(self, obs):
     # obs must match the data structure defined in self.ds
     data = self.ds.flatten(obs)
     if self._compress:
         data = TensorZipper.compress(data)
     else:
         data = pickle.dumps(data)
     self._req_socket.send(data)
     while True:
         try:
             ret = self._req_socket.recv_pyobj()
             break
         except Exception as e:
             print(
                 f'Exception:{e} After {self.timeout} ms for request inference '
                 f'service {self.server_addr}, restart a socket and try again!'
             )
             self._rebuild_socket()
             self._req_socket.send(data)
     return ret
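
The retry loop only works because the request socket has a receive timeout and is rebuilt after each failure: a pyzmq REQ socket whose recv has timed out is stuck in the REQ/REP state machine and will reject the next send, so the usual remedy is to close it and open a fresh one. A minimal sketch of what _rebuild_socket might look like under that assumption (only _req_socket, timeout and server_addr appear in the excerpt; everything else here is an assumption):

    import zmq

    def _rebuild_socket(self):
        # Discard the stuck REQ socket and connect a fresh one with the same
        # receive timeout; zmq.RCVTIMEO is given in milliseconds.
        if getattr(self, '_req_socket', None) is not None:
            self._req_socket.setsockopt(zmq.LINGER, 0)  # do not block on close
            self._req_socket.close()
        self._req_socket = zmq.Context.instance().socket(zmq.REQ)
        self._req_socket.setsockopt(zmq.RCVTIMEO, self.timeout)
        # assuming server_addr is a full zmq endpoint such as 'tcp://host:port'
        self._req_socket.connect(self.server_addr)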
Example #4
    def _push_data_to_learner(self, data_queue):
        logger.log('entering _push_data_to_learner',
                   'steps: {}'.format(self._steps),
                   level=logger.DEBUG + 5)
        me_id = self._learning_agent_id  # short name
        oppo_id = self._oppo_agent_id  # short name

        # initialize
        last_obs, actions, reward, info, done, other_vars = data_queue.get()
        if self.distillation:
            self._update_distill_agent_model()
            self.distill_agent.reset(last_obs[me_id])

        # loop forever, producing one unroll after another
        while True:
            data_model_id = self.task.model_key1
            mb_rewards, mb_values, mb_dones, mb_skips = [], [], [], []
            unroll = []
            infos = []
            mask = False
            while True:
                if last_obs[me_id] is not None:
                    # extend the unroll until a desired length
                    me_action = actions[me_id]
                    if isinstance(me_action, list):
                        me_action = tuple(me_action)
                    # Make a `data` for this time step. The `data` is a PGData compatible
                    # list, see the PGData definition
                    data = [last_obs[me_id], me_action, other_vars['neglogp']]
                    if self.rnn:
                        # hidden state and temporal mask for rnn
                        data.extend(
                            [other_vars['state'],
                              np.array(mask, np.bool_)])
                    if self.distillation:
                        # teacher logits
                        head_param = (self.distill_agent.head_param(
                            last_obs[me_id], me_action)
                                      if last_obs[me_id] is not None else None)
                        data.append(head_param)
                    if self.use_oppo_obs:
                        # for fully centralized value net
                        data.append(last_obs[oppo_id])
                        if self.rnn:
                            # oppo hidden state for rnn; mask same as self_agent
                            data.append(other_vars['oppo_state'])
                    data = self.ds.structure(data)
                    data.V = other_vars['v']
                    data.R = 0.0  # filled later by td_lambda return
                    mb_values.append(other_vars['v'])
                    mb_rewards.append(reward)
                    mb_dones.append(done)
                    # Note: a new episode must start with a valid obs (not a
                    # None obs), which holds in the current setup. Otherwise
                    # the mask would be wrong, since it is derived from the
                    # previous frame's done flag.
                    mask = done
                    unroll.append(data)
                    mb_skips.append(0)
                else:
                    mb_skips[-1] += 1
                    mb_rewards[-1] += (self._gamma**mb_skips[-1]) * reward
                    mb_dones[-1] += done

                (last_obs, actions, reward, info, done,
                 other_vars) = data_queue.get()
                if done:
                    infos.append(info)
                if mask and self.distillation:
                    self._update_distill_agent_model()
                    self.distill_agent.reset(last_obs[me_id])

                if (len(unroll) >= self._unroll_length
                        and last_obs[me_id] is not None):
                    # need to collect a complete Noop duration
                    break

            last_gae_lam = 0
            for t in reversed(range(self._unroll_length)):
                next_values = (other_vars['v'] if t == self._unroll_length - 1
                               else mb_values[t + 1])
                delta = (mb_rewards[t] +
                         (self._gamma**(mb_skips[t] + 1)) * next_values *
                         (1 - mb_dones[t]) - mb_values[t])
                last_gae_lam = (delta +
                                (self._gamma**(mb_skips[t] + 1)) * self._lam *
                                (1 - mb_dones[t]) * last_gae_lam)
                unroll[t].R = np.array(last_gae_lam + mb_values[t], np.float32)
            compressed_unroll = [
                TensorZipper.compress(self.ds.flatten(_data))
                for _data in unroll
            ]
            self._learner_apis.push_data(
                (data_model_id, compressed_unroll, infos))
            logger.log(
                f"Pushed one unroll to learner at time "
                f"{time.strftime('%Y%m%d%H%M%S')}",
                level=logger.DEBUG + 5)
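
The backward loop that fills unroll[t].R is a GAE(lambda)-style return computation, generalized with a per-step skip count so that reward accumulated over skipped (None obs) frames is discounted by gamma**(skip+1). The same recursion as a standalone function, with illustrative names and defaults:

    import numpy as np

    def td_lambda_returns(rewards, values, dones, skips, bootstrap_value,
                          gamma=0.99, lam=0.95):
        # delta is the TD error over a (possibly skipped) transition,
        # last_gae_lam is the lambda-weighted advantage, and the return is
        # R[t] = advantage[t] + V[t], as in the unroll[t].R assignment above.
        horizon = len(rewards)
        returns = np.zeros(horizon, dtype=np.float32)
        last_gae_lam = 0.0
        for t in reversed(range(horizon)):
            next_value = bootstrap_value if t == horizon - 1 else values[t + 1]
            discount = gamma ** (skips[t] + 1)
            delta = rewards[t] + discount * next_value * (1 - dones[t]) - values[t]
            last_gae_lam = delta + discount * lam * (1 - dones[t]) * last_gae_lam
            returns[t] = last_gae_lam + values[t]
        return returns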
Example #5
    def _push_data_to_learner(self, data_queue):
        logger.log('entering _push_data_to_learner',
                   'steps: {}'.format(self._steps),
                   level=logger.DEBUG + 5)
        me_id = self._learning_agent_id  # short name
        oppo_id = self._oppo_agent_id  # short name

        # initialize
        last_obs, actions, reward, info, done, other_vars = data_queue.get()
        if self.distillation:
            self._update_distill_agent_model()
            self.distill_agent.reset(last_obs[me_id])
        if self.use_oppo_obs:
            value, state, neglogpac, oppo_state = other_vars
        else:
            value, state, neglogpac = other_vars
            oppo_state = None

        # loop forever, producing one unroll after another
        while True:
            data_model_id = self.task.model_key1
            mb_skips = []
            unroll = []
            infos = []
            mask = False
            while True:
                if last_obs[me_id] is not None:
                    # extend the unroll until a desired length
                    me_action = actions[me_id]
                    if isinstance(me_action, list):
                        me_action = tuple(me_action)
                    # Make a `data` for this time step. The `data` is a PGData compatible
                    # list, see the PGData definition
                    data = [last_obs[me_id], me_action, neglogpac]
                    if self.rnn:
                        # hidden state and temporal mask for rnn
                        data.extend([state, np.array(mask, np.bool_)])
                    if self.distillation:
                        # teacher logits
                        logits = (self.distill_agent.logits(
                            last_obs[me_id], me_action)
                                  if last_obs[me_id] is not None else None)
                        data.append(logits)
                    if self.use_oppo_obs:
                        # for fully centralized value net
                        data.append(last_obs[oppo_id])
                        if self.rnn:
                            # oppo hidden state for rnn; mask same as self_agent
                            data.append(oppo_state)
                    data = self.ds.structure(data)
                    data.r = reward
                    data.discount = 1.0
                    # Note: a new episode must start with a valid obs (not a
                    # None obs), which holds in the current setup. Otherwise
                    # the mask would be wrong, since it is derived from the
                    # previous frame's done flag.
                    mask = done
                    unroll.append(data)
                    mb_skips.append(0)
                else:
                    mb_skips[-1] += 1
                    # correct cumulated reward and discount factor
                    data.r += (self._gamma**mb_skips[-1]) * reward
                    data.discount *= (1 - done) * self._gamma

                (last_obs, actions, reward, info, done,
                 other_vars) = data_queue.get()
                if self.use_oppo_obs:
                    value, state, neglogpac, oppo_state = other_vars
                else:
                    value, state, neglogpac = other_vars
                if done:
                    info = deepcopy(info)
                    info['outcome'] = self.log_outcome(info)
                    infos.append(info)
                if mask and self.distillation:
                    self._update_distill_agent_model()
                    self.distill_agent.reset(last_obs[me_id])

                if (len(unroll) >= self._unroll_length
                        and last_obs[me_id] is not None):
                    # need to collect a complete Noop duration
                    break

            compressed_unroll = [
                TensorZipper.compress(self.ds.flatten(_data))
                for _data in unroll
            ]
            self._learner_apis.push_data(
                (data_model_id, compressed_unroll, infos))
            logger.log(
                f"Pushed one unroll to learner at time "
                f"{time.strftime('%Y%m%d%H%M%S')}",
                level=logger.DEBUG + 5)
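
Unlike Example #4, this variant computes no returns on the actor: each stored step carries its own reward r and a discount initialized to 1.0, and frames whose obs is None are folded into the previously stored step with an extra power of gamma. A small standalone illustration of that folding arithmetic (the Step record, the frame triples, and the gamma default are illustrative, not from the original code):

    from dataclasses import dataclass

    @dataclass
    class Step:
        r: float
        discount: float = 1.0

    def fold_skipped_frames(frames, gamma=0.99):
        # `frames` is a list of (obs_is_valid, reward, done) triples; the first
        # frame is assumed valid, matching the episode-start note above. A valid
        # frame opens a new Step; an invalid one adds its reward with one more
        # power of gamma and shrinks the previous step's discount, mirroring
        # the else-branch in the example above.
        steps, skips = [], []
        for valid, reward, done in frames:
            if valid:
                steps.append(Step(r=reward))
                skips.append(0)
            else:
                skips[-1] += 1
                steps[-1].r += (gamma ** skips[-1]) * reward
                steps[-1].discount *= (1 - done) * gamma
        return steps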