def request_output(self, obs):
  # obs must match the data structure defined in self.ds
  data = self.ds.flatten(obs)
  if self._compress:
    data = TensorZipper.compress(data)
  else:
    data = pickle.dumps(data)
  return self._request(data)
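# A minimal sketch of the matching decode step on the receiving side, assuming
# the server knows whether the client compresses (the `compressed` flag below is
# hypothetical) and that TensorZipper offers a `decompress` counterpart to
# `compress`; the uncompressed branch is simply reversed with `pickle.loads`.
# The flat tensor list is then rebuilt into the nested form via the shared ds.
def _decode_request(self, payload, compressed):
  flat = (TensorZipper.decompress(payload) if compressed
          else pickle.loads(payload))
  return self.ds.structure(flat)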
def run(self):
  self.replay_task = self._data_pool_apis.request_replay_task()
  while self.replay_task != "":
    game_version = self.replay_task.game_version or self._game_version
    self._adapt_system(game_version)
    if game_version != self._game_version:
      # need to re-init the replay converter
      self._game_version = game_version
      self.converter_config['game_version'] = game_version
      self._replay_converter = self.replay_converter_type(
          **self.converter_config)
    game_core_config = ({} if 'game_core_config' not in self.converter_config
                        else self.converter_config['game_core_config'])
    extractor = ReplayExtractor(
        replay_dir=self._replay_dir,
        replay_filename=self.replay_task.replay_name,
        player_id=self.replay_task.player_id,
        replay_converter=self._replay_converter,
        step_mul=self._step_mul,
        version=game_version,
        game_core_config=game_core_config,
        da_rate=self._da_rate,
        unk_mmr_dft_to=self._unk_mmr_dft_to)
    self._steps = 0
    first_frame = True
    if self._use_policy:
      self.agent.reset()
      self._update_agent_model()
    for frame in extractor.extract():
      if self._post_process_data:
        obs, act = self._post_process_data(*frame[0])
      else:
        obs, act = frame[0]
      if self._use_policy:
        data = (obs, act, self.agent.state, np.array(first_frame, bool))
        self.agent.update_state(obs)
        first_frame = False
      else:
        data = (obs, act)
      data = self.ds.flatten(self.ds.structure(data))
      if self._data_queue.full():
        logger.log("Actor's queue is full.", level=logger.WARN)
      self._data_queue.put((TensorZipper.compress(data), frame[1]))
      logger.log('Successfully put one tuple.', level=logger.DEBUG)
      self._steps += 1
      if self._steps % self._log_interval == 0:
        logger.log("%d frames of replay task [%s] sent to learner." %
                   (self._steps, self.replay_task))
      if self._use_policy and self._steps % self._update_model_freq == 0:
        self._update_agent_model()
    logger.log("Replay task [%s] done. %d frames sent to learner." %
               (self.replay_task, self._steps))
    self.replay_task = self._data_pool_apis.request_replay_task()
  logger.log("All tasks done.")
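# A minimal sketch of how this producer loop could be paired with a consumer
# thread draining self._data_queue; the `start` entry point, the queue size
# attribute, and the `_push_data_to_learner` target are assumptions for
# illustration, not part of the excerpt above.
from queue import Queue
from threading import Thread

def start(self):
  self._data_queue = Queue(maxsize=self._queue_size)  # hypothetical size attr
  sender = Thread(target=self._push_data_to_learner,
                  args=(self._data_queue,), daemon=True)
  sender.start()  # consumer: pops compressed tuples from the queue forever
  self.run()      # producer: fills the queue with extracted replay frames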
def request_output(self, obs):
  # obs must match the data structure defined in self.ds
  data = self.ds.flatten(obs)
  if self._compress:
    data = TensorZipper.compress(data)
  else:
    data = pickle.dumps(data)
  self._req_socket.send(data)
  while True:
    try:
      ret = self._req_socket.recv_pyobj()
      break
    except Exception as e:
      print(f'Exception: {e}. No reply from inference service '
            f'{self.server_addr} after {self.timeout} ms; '
            f'rebuilding the socket and retrying.')
      self._rebuild_socket()
      self._req_socket.send(data)
  return ret
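# A minimal sketch of what `_rebuild_socket` might look like with pyzmq,
# assuming `zmq` is imported and `self._zmq_context`, `self.timeout` (in ms)
# and `self.server_addr` are set elsewhere. A REQ socket whose recv timed out
# is stuck awaiting a reply and cannot send again, which is why the retry loop
# above needs a fresh socket rather than a plain resend.
def _rebuild_socket(self):
  if self._req_socket is not None:
    self._req_socket.close()                    # drop the stuck REQ socket
  sock = self._zmq_context.socket(zmq.REQ)
  sock.setsockopt(zmq.RCVTIMEO, self.timeout)   # recv raises after timeout ms
  sock.setsockopt(zmq.LINGER, 0)                # don't block on close
  sock.connect(self.server_addr)
  self._req_socket = sock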
def _push_data_to_learner(self, data_queue):
  logger.log('entering _push_data_to_learner',
             'steps: {}'.format(self._steps),
             level=logger.DEBUG + 5)
  me_id = self._learning_agent_id  # short name
  oppo_id = self._oppo_agent_id  # short name

  # initialize
  last_obs, actions, reward, info, done, other_vars = data_queue.get()
  if self.distillation:
    self._update_distill_agent_model()
    self.distill_agent.reset(last_obs[me_id])

  # loop forever, producing unrolls one after another
  while True:
    data_model_id = self.task.model_key1
    mb_rewards, mb_values, mb_dones, mb_skips = [], [], [], []
    unroll = []
    infos = []
    mask = False
    while True:
      if last_obs[me_id] is not None:
        # extend the unroll until a desired length
        me_action = actions[me_id]
        if isinstance(me_action, list):
          me_action = tuple(me_action)
        # Make a `data` for this time step. The `data` is a PGData compatible
        # list, see the PGData definition
        data = [last_obs[me_id], me_action, other_vars['neglogp']]
        if self.rnn:
          # hidden state and temporal mask for rnn
          data.extend([other_vars['state'], np.array(mask, bool)])
        if self.distillation:
          # teacher logits
          head_param = (self.distill_agent.head_param(last_obs[me_id], me_action)
                        if last_obs[me_id] is not None else None)
          data.append(head_param)
        if self.use_oppo_obs:
          # for fully centralized value net
          data.append(last_obs[oppo_id])
          if self.rnn:
            # oppo hidden state for rnn; mask same as self_agent
            data.append(other_vars['oppo_state'])
        data = self.ds.structure(data)
        data.V = other_vars['v']
        data.R = 0.0  # filled later by td_lambda return
        mb_values.append(other_vars['v'])
        mb_rewards.append(reward)
        mb_dones.append(done)
        # Notice: a new episode must start with a valid obs, not a None obs,
        # which is correct currently. Otherwise, mask will be incorrect since
        # it is decided by the last frame's done
        mask = done
        unroll.append(data)
        mb_skips.append(0)
      else:
        mb_skips[-1] += 1
        mb_rewards[-1] += (self._gamma ** mb_skips[-1]) * reward
        mb_dones[-1] += done

      last_obs, actions, reward, info, done, other_vars = data_queue.get()
      if done:
        infos.append(info)
      if mask and self.distillation:
        self._update_distill_agent_model()
        self.distill_agent.reset(last_obs[me_id])
      if (len(unroll) >= self._unroll_length
          and last_obs[me_id] is not None):
        # need to collect a complete Noop duration
        break

    last_gae_lam = 0
    for t in reversed(range(self._unroll_length)):
      next_values = (other_vars['v'] if t == self._unroll_length - 1
                     else mb_values[t + 1])
      delta = (mb_rewards[t]
               + (self._gamma ** (mb_skips[t] + 1)) * next_values
               * (1 - mb_dones[t])
               - mb_values[t])
      last_gae_lam = (delta
                      + (self._gamma ** (mb_skips[t] + 1)) * self._lam
                      * (1 - mb_dones[t]) * last_gae_lam)
      unroll[t].R = np.array(last_gae_lam + mb_values[t], np.float32)

    compressed_unroll = [
        TensorZipper.compress(self.ds.flatten(_data)) for _data in unroll
    ]
    self._learner_apis.push_data((data_model_id, compressed_unroll, infos))
    logger.log(f"Pushed one unroll to learner at time "
               f"{time.strftime('%Y%m%d%H%M%S')}",
               level=logger.DEBUG + 5)
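# A standalone sketch of the return computation above: GAE(lambda) where each
# step may cover `skips[t]` extra skipped frames, so the effective discount to
# the next stored step is gamma ** (skips[t] + 1); function and argument names
# are illustrative only, not part of the code above.
import numpy as np

def gae_returns(rewards, values, dones, skips, bootstrap_value, gamma, lam):
  T = len(rewards)
  returns = np.zeros(T, np.float32)
  last_gae_lam = 0.0
  for t in reversed(range(T)):
    next_v = bootstrap_value if t == T - 1 else values[t + 1]
    g = gamma ** (skips[t] + 1)                       # discount over skipped frames
    delta = rewards[t] + g * next_v * (1 - dones[t]) - values[t]
    last_gae_lam = delta + g * lam * (1 - dones[t]) * last_gae_lam
    returns[t] = last_gae_lam + values[t]             # advantage + value = return
  return returns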
def _push_data_to_learner(self, data_queue):
  logger.log('entering _push_data_to_learner',
             'steps: {}'.format(self._steps),
             level=logger.DEBUG + 5)
  me_id = self._learning_agent_id  # short name
  oppo_id = self._oppo_agent_id  # short name

  # initialize
  last_obs, actions, reward, info, done, other_vars = data_queue.get()
  if self.distillation:
    self._update_distill_agent_model()
    self.distill_agent.reset(last_obs[me_id])
  if self.use_oppo_obs:
    value, state, neglogpac, oppo_state = other_vars
  else:
    value, state, neglogpac = other_vars
    oppo_state = None

  # loop forever, producing unrolls one after another
  while True:
    data_model_id = self.task.model_key1
    mb_skips = []
    unroll = []
    infos = []
    mask = False
    while True:
      if last_obs[me_id] is not None:
        # extend the unroll until a desired length
        me_action = actions[me_id]
        if isinstance(me_action, list):
          me_action = tuple(me_action)
        # Make a `data` for this time step. The `data` is a PGData compatible
        # list, see the PGData definition
        data = [last_obs[me_id], me_action, neglogpac]
        if self.rnn:
          # hidden state and temporal mask for rnn
          data.extend([state, np.array(mask, bool)])
        if self.distillation:
          # teacher logits
          logits = (self.distill_agent.logits(last_obs[me_id], me_action)
                    if last_obs[me_id] is not None else None)
          data.append(logits)
        if self.use_oppo_obs:
          # for fully centralized value net
          data.append(last_obs[oppo_id])
          if self.rnn:
            # oppo hidden state for rnn; mask same as self_agent
            data.append(oppo_state)
        data = self.ds.structure(data)
        data.r = reward
        data.discount = 1.0
        # Notice: a new episode must start with a valid obs, not a None obs,
        # which is correct currently. Otherwise, mask will be incorrect since
        # it is decided by the last frame's done
        mask = done
        unroll.append(data)
        mb_skips.append(0)
      else:
        mb_skips[-1] += 1
        # accumulate reward and discount factor over the skipped frame
        data.r += (self._gamma ** mb_skips[-1]) * reward
        data.discount *= (1 - done) * self._gamma

      last_obs, actions, reward, info, done, other_vars = data_queue.get()
      if self.use_oppo_obs:
        value, state, neglogpac, oppo_state = other_vars
      else:
        value, state, neglogpac = other_vars
      if done:
        info = deepcopy(info)
        info['outcome'] = self.log_outcome(info)
        infos.append(info)
      if mask and self.distillation:
        self._update_distill_agent_model()
        self.distill_agent.reset(last_obs[me_id])
      if (len(unroll) >= self._unroll_length
          and last_obs[me_id] is not None):
        # need to collect a complete Noop duration
        break

    compressed_unroll = [
        TensorZipper.compress(self.ds.flatten(_data)) for _data in unroll
    ]
    self._learner_apis.push_data((data_model_id, compressed_unroll, infos))
    logger.log(f"Pushed one unroll to learner at time "
               f"{time.strftime('%Y%m%d%H%M%S')}",
               level=logger.DEBUG + 5)
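# A toy illustration of the skip-folding in the else-branch above: rewards of
# frames with a None observation are discounted into the last valid step, and
# that step's discount shrinks by (1 - done) * gamma per skipped frame. The
# function and argument names are illustrative only.
def fold_skips(base_reward, skipped, gamma):
  # skipped: list of (reward, done) pairs for the frames with no valid obs
  r, discount = base_reward, 1.0
  for k, (reward, done) in enumerate(skipped, start=1):
    r += (gamma ** k) * reward
    discount *= (1 - done) * gamma
  return r, discount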