def request_output(self, obs):
  # obs must match the data structure defined in self.ds
  data = self.ds.flatten(obs)
  if self._compress:
    data = TensorZipper.compress(data)
  else:
    data = pickle.dumps(data)
  return self._request(data)
def run(self):
  self.replay_task = self._data_pool_apis.request_replay_task()
  while self.replay_task != "":
    game_version = self.replay_task.game_version or self._game_version
    self._adapt_system(game_version)
    if game_version != self._game_version:
      # need to re-init the replay converter
      self._game_version = game_version
      self.converter_config['game_version'] = game_version
      self._replay_converter = self.replay_converter_type(
        **self.converter_config)
    game_core_config = ({} if 'game_core_config' not in self.converter_config
                        else self.converter_config['game_core_config'])
    extractor = ReplayExtractor(
      replay_dir=self._replay_dir,
      replay_filename=self.replay_task.replay_name,
      player_id=self.replay_task.player_id,
      replay_converter=self._replay_converter,
      step_mul=self._step_mul,
      version=game_version,
      game_core_config=game_core_config,
      da_rate=self._da_rate,
      unk_mmr_dft_to=self._unk_mmr_dft_to)
    self._steps = 0
    first_frame = True
    if self._use_policy:
      self.agent.reset()
      self._update_agent_model()
    for frame in extractor.extract():
      if self._post_process_data:
        obs, act = self._post_process_data(*frame[0])
      else:
        obs, act = frame[0]
      if self._use_policy:
        # np.bool_ (not the removed np.bool alias) keeps this working on recent numpy
        data = (obs, act, self.agent.state, np.array(first_frame, np.bool_))
        self.agent.update_state(obs)
        first_frame = False
      else:
        data = (obs, act)
      data = self.ds.flatten(self.ds.structure(data))
      if self._data_queue.full():
        logger.log("Actor's queue is full.", level=logger.WARN)
      self._data_queue.put((TensorZipper.compress(data), frame[1]))
      logger.log('successfully put one tuple.', level=logger.DEBUG)
      self._steps += 1
      if self._steps % self._log_interval == 0:
        logger.log("%d frames of replay task [%s] sent to learner." %
                   (self._steps, self.replay_task))
      if self._use_policy and self._steps % self._update_model_freq == 0:
        self._update_agent_model()
    logger.log("Replay task [%s] done. %d frames sent to learner." %
               (self.replay_task, self._steps))
    self.replay_task = self._data_pool_apis.request_replay_task()
  logger.log("All tasks done.")
def data_generator(self):
  pull_socket = self._zmq_context.socket(zmq.PULL)
  pull_socket.connect(self.req_ep)
  while True:
    msg = pull_socket.recv_multipart()
    if self._compress:
      data = TensorZipper.decompress(msg[-1])
    else:
      data = pickle.loads(msg[-1])
    yield data + (msg[0],)
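# --- Illustrative sketch (not part of the original module) ---
# A minimal producer matching the PULL-side generator above, assuming the
# multipart frames are [requester identity, payload] with the payload already
# compressed or pickled by the client. The endpoint and identity are made up.
import pickle
import zmq

ctx = zmq.Context()
push_socket = ctx.socket(zmq.PUSH)
push_socket.bind("tcp://127.0.0.1:5560")  # hypothetical stand-in for self.req_ep

payload = pickle.dumps(("obs_placeholder",))
push_socket.send_multipart([b"client-0", payload])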
def data_generator(self):
  while True:
    # busy-wait on the queue; get_nowait raises (typically queue.Empty) until data arrives
    while True:
      try:
        msg = self._data_queue.get_nowait()
        break
      except Exception:
        time.sleep(0.01)
    if self._compress:
      data = TensorZipper.decompress(msg[-1])
    else:
      data = pickle.loads(msg[-1])
    yield data + (msg[0].bytes,)
def request_output(self, obs):
  # obs must match the data structure defined in self.ds
  data = self.ds.flatten(obs)
  if self._compress:
    data = TensorZipper.compress(data)
  else:
    data = pickle.dumps(data)
  self._req_socket.send(data)
  while True:
    try:
      ret = self._req_socket.recv_pyobj()
      break
    except Exception as e:
      print(f'Exception: {e}. No reply from inference service '
            f'{self.server_addr} within {self.timeout} ms; '
            f'rebuilding the socket and retrying.')
      self._rebuild_socket()
      self._req_socket.send(data)
  return ret
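# --- Illustrative sketch (not part of the original module) ---
# Self-contained version of the timeout-and-rebuild pattern above, assuming a
# plain ZMQ REQ/REP pair; the endpoint and timeout are made up. A REQ socket
# that misses a reply is stuck in its send/recv state machine, so it must be
# rebuilt before the request can be resent.
import zmq

def _make_req_socket(ctx, addr, timeout_ms):
  sock = ctx.socket(zmq.REQ)
  sock.setsockopt(zmq.RCVTIMEO, timeout_ms)  # recv_* raises zmq.error.Again on timeout
  sock.setsockopt(zmq.LINGER, 0)             # do not block on close with pending msgs
  sock.connect(addr)
  return sock

def request_with_retry(ctx, addr, payload, timeout_ms=10000):
  sock = _make_req_socket(ctx, addr, timeout_ms)
  sock.send(payload)
  while True:
    try:
      return sock.recv_pyobj()
    except zmq.error.Again:
      sock.close()
      sock = _make_req_socket(ctx, addr, timeout_ms)
      sock.send(payload)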
def _push_data_to_learner(self, data_queue):
  logger.log('entering _push_data_to_learner',
             'steps: {}'.format(self._steps),
             level=logger.DEBUG + 5)
  me_id = self._learning_agent_id  # short name
  oppo_id = self._oppo_agent_id  # short name

  # initialize
  last_obs, actions, reward, info, done, other_vars = data_queue.get()
  if self.distillation:
    self._update_distill_agent_model()
    self.distill_agent.reset(last_obs[me_id])

  # loop infinitely to keep the unrolls going
  while True:
    data_model_id = self.task.model_key1
    mb_rewards, mb_values, mb_dones, mb_skips = [], [], [], []
    unroll = []
    infos = []
    mask = False
    while True:
      if last_obs[me_id] is not None:
        # extend the unroll until the desired length
        me_action = actions[me_id]
        if isinstance(me_action, list):
          me_action = tuple(me_action)
        # Make a `data` for this time step. The `data` is a PGData-compatible
        # list, see the PGData definition.
        data = [last_obs[me_id], me_action, other_vars['neglogp']]
        if self.rnn:
          # hidden state and temporal mask for rnn
          data.extend([other_vars['state'], np.array(mask, np.bool_)])
        if self.distillation:
          # teacher logits
          head_param = (self.distill_agent.head_param(last_obs[me_id], me_action)
                        if last_obs[me_id] is not None else None)
          data.append(head_param)
        if self.use_oppo_obs:
          # for a fully centralized value net
          data.append(last_obs[oppo_id])
          if self.rnn:
            # oppo hidden state for rnn; mask is the same as for self_agent
            data.append(other_vars['oppo_state'])
        data = self.ds.structure(data)
        data.V = other_vars['v']
        data.R = 0.0  # filled later by the td-lambda return
        mb_values.append(other_vars['v'])
        mb_rewards.append(reward)
        mb_dones.append(done)
        # Notice: a new episode must start with a valid obs, not a None obs,
        # which is correct currently. Otherwise, mask will be incorrect since
        # it is decided by the last frame's done.
        mask = done
        unroll.append(data)
        mb_skips.append(0)
      else:
        mb_skips[-1] += 1
        mb_rewards[-1] += (self._gamma ** mb_skips[-1]) * reward
        mb_dones[-1] += done

      last_obs, actions, reward, info, done, other_vars = data_queue.get()
      if done:
        infos.append(info)
      if mask and self.distillation:
        self._update_distill_agent_model()
        self.distill_agent.reset(last_obs[me_id])
      if len(unroll) >= self._unroll_length and last_obs[me_id] is not None:
        # need to collect a complete Noop duration
        break

    # backward GAE(lambda) pass; gamma is raised to (skips + 1) to account for
    # the frames skipped within each step
    last_gae_lam = 0
    for t in reversed(range(self._unroll_length)):
      next_values = (other_vars['v'] if t == self._unroll_length - 1
                     else mb_values[t + 1])
      delta = (mb_rewards[t] + (self._gamma ** (mb_skips[t] + 1)) * next_values *
               (1 - mb_dones[t]) - mb_values[t])
      last_gae_lam = (delta + (self._gamma ** (mb_skips[t] + 1)) * self._lam *
                      (1 - mb_dones[t]) * last_gae_lam)
      unroll[t].R = np.array(last_gae_lam + mb_values[t], np.float32)
    compressed_unroll = [
      TensorZipper.compress(self.ds.flatten(_data)) for _data in unroll
    ]
    self._learner_apis.push_data((data_model_id, compressed_unroll, infos))
    logger.log(f"Pushed one unroll to learner at time "
               f"{time.strftime('%Y%m%d%H%M%S')}",
               level=logger.DEBUG + 5)
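# --- Illustrative sketch (not part of the original module) ---
# The backward pass above is the standard GAE(lambda) recursion; the only twist
# is the gamma ** (skips + 1) exponent for skipped Noop frames. A minimal,
# self-contained version of the same recursion without frame skips, with
# made-up function and argument names:
import numpy as np

def gae_returns(rewards, values, dones, last_value, gamma=0.99, lam=0.95):
  """Return R_t = GAE(lambda) advantage + V_t for a length-T unroll."""
  T = len(rewards)
  returns = np.zeros(T, dtype=np.float32)
  last_gae_lam = 0.0
  for t in reversed(range(T)):
    next_value = last_value if t == T - 1 else values[t + 1]
    delta = rewards[t] + gamma * next_value * (1 - dones[t]) - values[t]
    last_gae_lam = delta + gamma * lam * (1 - dones[t]) * last_gae_lam
    returns[t] = last_gae_lam + values[t]
  return returns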
def _push_data_to_learner(self, data_queue):
  logger.log('entering _push_data_to_learner',
             'steps: {}'.format(self._steps),
             level=logger.DEBUG + 5)
  me_id = self._learning_agent_id  # short name
  oppo_id = self._oppo_agent_id  # short name

  # initialize
  last_obs, actions, reward, info, done, other_vars = data_queue.get()
  if self.distillation:
    self._update_distill_agent_model()
    self.distill_agent.reset(last_obs[me_id])
  if self.use_oppo_obs:
    value, state, neglogpac, oppo_state = other_vars
  else:
    value, state, neglogpac = other_vars
    oppo_state = None

  # loop infinitely to keep the unrolls going
  while True:
    data_model_id = self.task.model_key1
    mb_skips = []
    unroll = []
    infos = []
    mask = False
    while True:
      if last_obs[me_id] is not None:
        # extend the unroll until the desired length
        me_action = actions[me_id]
        if isinstance(me_action, list):
          me_action = tuple(me_action)
        # Make a `data` for this time step. The `data` is a PGData-compatible
        # list, see the PGData definition.
        data = [last_obs[me_id], me_action, neglogpac]
        if self.rnn:
          # hidden state and temporal mask for rnn
          data.extend([state, np.array(mask, np.bool_)])
        if self.distillation:
          # teacher logits
          logits = (self.distill_agent.logits(last_obs[me_id], me_action)
                    if last_obs[me_id] is not None else None)
          data.append(logits)
        if self.use_oppo_obs:
          # for a fully centralized value net
          data.append(last_obs[oppo_id])
          if self.rnn:
            # oppo hidden state for rnn; mask is the same as for self_agent
            data.append(oppo_state)
        data = self.ds.structure(data)
        data.r = reward
        data.discount = 1.0
        # Notice: a new episode must start with a valid obs, not a None obs,
        # which is correct currently. Otherwise, mask will be incorrect since
        # it is decided by the last frame's done.
        mask = done
        unroll.append(data)
        mb_skips.append(0)
      else:
        mb_skips[-1] += 1
        # accumulate the reward and discount factor over the skipped frame
        data.r += (self._gamma ** mb_skips[-1]) * reward
        data.discount *= (1 - done) * self._gamma

      last_obs, actions, reward, info, done, other_vars = data_queue.get()
      if self.use_oppo_obs:
        value, state, neglogpac, oppo_state = other_vars
      else:
        value, state, neglogpac = other_vars
      if done:
        info = deepcopy(info)
        info['outcome'] = self.log_outcome(info)
        infos.append(info)
      if mask and self.distillation:
        self._update_distill_agent_model()
        self.distill_agent.reset(last_obs[me_id])
      if len(unroll) >= self._unroll_length and last_obs[me_id] is not None:
        # need to collect a complete Noop duration
        break

    compressed_unroll = [
      TensorZipper.compress(self.ds.flatten(_data)) for _data in unroll
    ]
    self._learner_apis.push_data((data_model_id, compressed_unroll, infos))
    logger.log(f"Pushed one unroll to learner at time "
               f"{time.strftime('%Y%m%d%H%M%S')}",
               level=logger.DEBUG + 5)
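# --- Illustrative sketch (not part of the original module) ---
# Worked example of how the None-obs branch above folds rewards and the
# discount across skipped frames (all numbers are made up; done is 0 here).
gamma = 0.99
rewards_in_skip_window = [0.0, 0.5, 1.0]  # valid frame, then two skipped frames

r = rewards_in_skip_window[0]  # data.r at the valid frame
discount = 1.0                 # data.discount at the valid frame
skips = 0
for rew in rewards_in_skip_window[1:]:
  skips += 1
  r += (gamma ** skips) * rew  # discounted sum over the skip window
  discount *= gamma            # total discount up to the next valid frame

print(r)         # 0.0 + 0.5 * 0.99 + 1.0 * 0.99 ** 2 = 1.4751
print(discount)  # 0.99 ** 2 = 0.9801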
def _decode_sample(self, sample):
  return TensorZipper.decompress(sample)
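# --- Illustrative sketch (not part of the original module) ---
# Samples arrive as the compressed, flattened tensors produced on the actor
# side (TensorZipper.compress(self.ds.flatten(data))), so _decode_sample is the
# inverse half of that round trip. The stand-in codec below (zlib over pickled
# numpy arrays) only illustrates the contract; it is not the TensorZipper format.
import pickle
import zlib

import numpy as np

def compress(flat_tensors):
  return zlib.compress(pickle.dumps(flat_tensors))

def decompress(blob):
  return pickle.loads(zlib.decompress(blob))

flat = (np.zeros((4, 4), np.float32), np.arange(3))
restored = decompress(compress(flat))
assert all(np.array_equal(a, b) for a, b in zip(flat, restored))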
def main(_):
  policy = "tpolicies.net_zoo.mnet_v6.mnet_v6d6"
  policy_config = {
    'use_xla': True,
    'test': False,
    'use_loss_type': 'none',
    'use_value_head': False,
    'use_self_fed_heads': True,
    'use_lstm': True,
    'nlstm': 256,
    'hs_len': 256 * 2,
    'lstm_duration': 1,
    'lstm_dropout_rate': 0.0,
    'lstm_cell_type': 'lstm',
    'lstm_layer_norm': True,
    'weight_decay': 0.00002,
    'arg_scope_type': 'type_b',
    'endpoints_verbosity': 10,
    'n_v': 7,
    'distillation': True,
    'fix_all_embed': False,
    'use_base_mask': True,
    'zstat_embed_version': 'v3',
    'sync_statistics': 'horovod',
    'temperature': 0.8,
    'merge_pi': False,
  }
  converter_config = {
    'zstat_data_src': '/root/replay_ds/rp1522-mv-zstat',
    'input_map_size': (128, 128),
    'output_map_size': (128, 128),
    'delete_useless_selection': False,
    'dict_space': True,
    'max_bo_count': 50,
    'max_bobt_count': 20,
    'zstat_zeroing_prob': 0.1,
    'zmaker_version': 'v5',
  }
  policy = import_module_or_data(policy)
  replay_converter_name = "timitate.lib6.pb2all_converter.PB2AllConverter"
  converter_module, converter_name = replay_converter_name.rsplit(".", 1)
  replay_converter_type = getattr(importlib.import_module(converter_module),
                                  converter_name)
  replay_converter = replay_converter_type(**converter_config)
  ob_space, ac_space = replay_converter.space
  rnn = (False if 'use_lstm' not in policy_config
         else policy_config['use_lstm'])
  hs_len = (policy_config['hs_len'] if 'hs_len' in policy_config
            else 2 * policy_config['nlstm'] if 'nlstm' in policy_config
            else 128)
  ds = InfData(ob_space, ac_space, policy_config['use_self_fed_heads'], rnn, hs_len)
  cached_ds = ILData(ob_space, ac_space, rnn, hs_len)

  if FLAGS.role == 'Server':
    S = InfServer(None, None, FLAGS.port, ds, FLAGS.batch_size,
                  ob_space, ac_space, policy,
                  policy_config=policy_config,
                  gpu_id=FLAGS.gpu_id,
                  pull_worker_num=FLAGS.pull_worker_num)
    S.run()
  elif FLAGS.role == 'Actor':
    data = pickle.load(open('data', 'rb'))
    data_set = [
      cached_ds.make_structure(TensorZipper.decompress(d)) for d in data
    ]
    data_set = [ds.structure(d.X, d.S, d.M) for d in data_set]
    n = len(data_set)
    policy_config['batch_size'] = 1
    policy_config['rollout_len'] = 1
    policy_config['use_loss_type'] = 'none'
    if FLAGS.use_gpu_server:
      from tleague.actors.agent import PGAgentGPU
      agent = PGAgentGPU(FLAGS.server_addr, ds, hs_len)
    else:
      from tleague.actors.agent import PGAgent2
      agent = PGAgent2(policy, ob_space, ac_space, policy_config=policy_config)
    while True:
      t0 = time.time()
      for sample in data_set:
        pred = agent.step(sample.X)
        # print(pred['A_AB'])
      cost = time.time() - t0
      print('Predict {} samples costs {} seconds, fps {}.'.format(
        n, cost, n / cost), flush=True)
def data_generator():
  while True:
    while not rm.ready_for_sample():
      time.sleep(5)
    for sample, weight in rm.sample_rollout():
      yield (TensorZipper.decompress(sample), weight)