def log_kvs(self, reward_sum, info):
  time_end = time.time()
  logger.logkvs({
    'producing_fps': self._steps / (time_end - self.time_beg),
    'reward_sum': reward_sum,
    'episode_steps': self._steps,
  })
  if self.should_log_info:
    # log additional info fields
    logger.logkvs(info)
  logger.dumpkvs()
def _update_timing_logkvs(self, n_batches):
  tnow = time.time()
  consuming_fps = int(self._batch_size * n_batches / (tnow - self.tstart))
  time_elapsed = tnow - self.tfirststart
  total_samples_now = self._total_samples()
  receiving_fps = ((sum(total_samples_now) - sum(self.total_samples))
                   / (tnow - self.tstart))
  self.total_samples = total_samples_now
  self.tstart = time.time()
  logger.logkvs({
    "all_consuming_fps": consuming_fps,
    "time_elapsed": time_elapsed,
    "train_samples": self.total_samples[0],
    "val_samples": self.total_samples[1],
    "receiving_fps": receiving_fps,
  })
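# `_total_samples` is called above but not defined in this excerpt. Judging from
# how `_update_timing_logkvs` and `run` use it (summed for receiving_fps, indexed
# as [0]=train and [1]=val), it is assumed to return a (train_samples, val_samples)
# pair of cumulative counters. The sketch below is an illustrative assumption;
# the `data_pool` attribute names are hypothetical, not confirmed by the source.
def _total_samples(self):
  """Return cumulative (train_samples, val_samples) received so far (sketch)."""
  return (self.data_pool.train_sample_num,  # hypothetical counter
          self.data_pool.val_sample_num)    # hypothetical counter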
def _rollout_an_episode(self):
  """Perform a rollout until one episode is done.

  The collected data are put into _data_queue, which is sent to the remote
  learner in a separate thread.
  """
  self._steps = 0
  self.task = self._request_task()
  logger.log('episode begins, task: {}'.format(str(self.task)))
  #obs = self.env.reset()
  obs = self.obs_space.sample()  # placeholder obs; the env call above is commented out
  for agt, ob in zip(self.agents, obs):
    agt.reset(ob)
  self._update_agents_model(self.task)
  me_id = self._learning_agent_id  # short name
  reward_sum = 0.0
  time_beg = time.time()
  self._update_hyperparam(self.task)
  while True:
    self._steps += 1
    output = self.agents[me_id].act(obs[me_id])
    action, other_vars = output[0], output[1:]
    oppo_actions = [
      agt.step(ob)
      for agt, ob in zip(self.agents[me_id + 1:], obs[me_id + 1:])
    ]
    last_obs = obs
    # placeholder obs/reward/done/info; the real env step below is commented out
    obs = self.obs_space.sample()
    reward = [np.zeros(shape=(17,)), np.zeros(shape=(17,))]
    done = self._steps == 300
    info = {'outcome': [0, 0]}
    #obs, reward, done, info = self.env.step([action] + oppo_actions)
    rwd = self._reward_shape(reward[me_id])
    reward_sum += rwd
    if self._enable_push:
      if self._data_queue.full():
        logger.log("Actor's queue is full.", level=logger.WARN)
      rwd_to_push = rwd if self.rwd_shape else reward[me_id]
      self._data_queue.put((last_obs, tuple([action] + oppo_actions),
                            rwd_to_push, info, done, other_vars))
      logger.log('successfully put one tuple.', level=logger.DEBUG)
    if self._steps % self._log_interval_steps == 0:
      logger.log('_rollout_an_episode,', 'steps: {},'.format(self._steps),
                 'data qsize: {}'.format(self._data_queue.qsize()))
    if done:
      time_end = time.time()
      logger.logkvs({
        'producing_fps': self._steps / (time_end - time_beg),
        'reward_sum': reward_sum,
        'episode_steps': self._steps,
      })
      if self.should_log_info:
        # log additional info fields
        logger.logkvs(info)
      logger.dumpkvs()
      if 'outcome' not in info:
        me_outcome = -95678
        logger.log("info['outcome'] not available",
                   'return an arbitrary value', me_outcome,
                   level=logger.WARN)
      else:
        me_outcome = info['outcome'][me_id]
      return me_outcome
    if self._update_model_freq and self._steps % self._update_model_freq == 0:
      if (self._enable_push and
          self._remote.pull_model_attr('freezetime',
                                       self.task.model_key1) is not None):
        # Current task (learning period) finishes, start a new task or continue
        self._finish_task(self.task, None)  # notify early abort
        last_task = self.task
        self.task = self._request_task()  # try to continue
        if not is_inherit(last_task.model_key1, self.task.model_key1):
          time_end = time.time()
          logger.logkvs({
            'producing_fps': self._steps / (time_end - time_beg),
            'reward_sum': reward_sum,
            'episode_steps': self._steps,
          })
          if self.should_log_info:
            # log additional info fields
            logger.logkvs(info)
          logger.dumpkvs()
          return None
      self._update_agents_model(self.task)
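# The docstring above says the queued data are sent to the remote learner by a
# separate thread; that thread is not part of this excerpt. The loop below is a
# minimal sketch of what such a sender could look like, assuming a
# `push_data`-style call on `self._remote` (a hypothetical API name, not
# confirmed by the source).
def _push_data_to_remote_loop(self):
  """Drain (obs, actions, reward, info, done, other_vars) tuples and ship them (sketch)."""
  while True:
    data = self._data_queue.get()  # blocks until a rollout tuple is available
    self._remote.push_data(data)   # hypothetical remote call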
def _run_train_loop(self, nbatch):
  lr = as_func(self.task.hyperparam.learning_rate)
  cliprange = as_func(self.task.hyperparam.cliprange)
  lam = self.task.hyperparam.lam  # lambda for the td-lambda term
  weights = None
  if self.rwd_shape:
    assert hasattr(self.task.hyperparam, 'reward_weights')
    weights = np.array(self.task.hyperparam.reward_weights, dtype=np.float32)
    if len(weights.shape) == 1:
      weights = np.expand_dims(weights, 0)
  self.total_timesteps = getattr(self.task.hyperparam, 'total_timesteps',
                                 self.total_timesteps)
  burn_in_timesteps = 0
  if self._need_burn_in:
    burn_in_timesteps = getattr(self.task.hyperparam, 'burn_in_timesteps',
                                self.burn_in_timesteps)
  nupdates_burn_in = int(burn_in_timesteps // nbatch)
  nupdates = nupdates_burn_in + int(self.total_timesteps // nbatch)
  mblossvals = []
  tfirststart = time.time()
  tstart = time.time()
  total_samples = self._data_server.unroll_num * self.unroll_length
  logger.log('Start Training')
  for update in range(1, nupdates + 1):
    frac = 1.0 - (update - 1.0) / nupdates
    lrnow = lr(frac)
    cliprangenow = cliprange(frac)
    if update <= nupdates_burn_in:
      mblossvals.append(self.burn_in(lrnow, cliprangenow, lam, weights))
    else:
      mblossvals.append(self.train_batch(lrnow, cliprangenow, lam, weights))
    # publish models
    if update % self.pub_interval == 0 and self.should_push_model:
      self._model_pool_apis.push_model(self.read_params(),
                                       self.task.hyperparam,
                                       self.model_key,
                                       learner_meta=self.read_opt_params())
    # logging stuff
    if update % self.log_interval == 0 or update == 1:
      lossvals = np.mean(mblossvals, axis=0)
      mblossvals = []
      tnow = time.time()
      consuming_fps = int(nbatch * min(update, self.log_interval)
                          / (tnow - tstart))
      time_elapsed = tnow - tfirststart
      total_samples_now = self._data_server.unroll_num * self.unroll_length
      receiving_fps = (total_samples_now - total_samples) / (tnow - tstart)
      total_samples = total_samples_now
      tstart = time.time()
      # 'scope_name/var' style for grouping Tab in Tensorboard webpage
      # lp is short for Learning Period
      scope = 'lp{}/'.format(self._lrn_period_count)
      logger.logkvs({
        scope + "lrn_period_count": self._lrn_period_count,
        scope + "burn_in_value": update <= nupdates_burn_in,
        scope + "nupdates": update,
        scope + "total_timesteps": update * nbatch,
        scope + "all_consuming_fps": consuming_fps,
        scope + "time_elapsed": time_elapsed,
        scope + "total_samples": total_samples,
        scope + "receiving_fps": receiving_fps,
        scope + "aband_samples": (self._data_server.aband_unroll_num
                                  * self.unroll_length),
      })
      logger.logkvs({
        scope + lossname: lossval
        for lossname, lossval in zip(self.loss_names, lossvals)
      })
      logger.dumpkvs()
    if self.save_interval and (update % self.save_interval == 0
                               or update == 1) and logger.get_dir():
      checkdir = osp.join(logger.get_dir(), 'checkpoints')
      os.makedirs(checkdir, exist_ok=True)
      savepath = osp.join(checkdir, '%.5i' % update)
      logger.log('Saving log to', savepath)
      self.save(savepath)
  if self.should_push_model:
    self._model_pool_apis.push_model(self.read_params(),
                                     self.task.hyperparam,
                                     self.model_key,
                                     learner_meta=self.read_opt_params())
  return
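# `as_func` above wraps a hyperparameter that may be either a plain number or a
# schedule into a callable of the remaining-progress fraction `frac`, so that
# `lr(frac)` and `cliprange(frac)` work uniformly. Its definition is not in this
# excerpt; the following is a minimal sketch of the assumed behavior.
def as_func(val_or_fn):
  """Pass callables through; wrap constants as `lambda frac: constant` (sketch)."""
  if callable(val_or_fn):
    return val_or_fn
  return lambda _frac: val_or_fn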
def run(self):
  if self.should_push_model:
    self._model_pool_apis.push_model(self.read_params(), None, self.model_key,
                                     learner_meta=self.read_opt_params())
  self.tstart = time.time()
  self.tfirststart = self.tstart
  self.total_samples = self._total_samples()
  train_fetches_list, elapsed_time = [], 0
  for i in range(self._num_sgd_updates):
    # checkpoint stuff (saving, validation, etc.)
    if i % self._checkpoint_interval == 0:
      if self._checkpoints_dir is not None:
        self._saver._save_model_checkpoint(self._checkpoints_dir,
                                           "checkpoint_%s" % i)
      if self._enable_validation:
        # TODO(pengsun): completely disable validation when not using
        while not self.data_pool.ready_for_val:
          time.sleep(5)
        if self.use_hvd:
          self.barrier()  # synchronize across all hvd learners
        # do validation and logging
        t = time.time()
        val_endpoints = self._validate()
        if self.rank == 0:
          with logger.scoped_configure(logger=self.val_logger):
            logger.logkvs({
              "n_update": i,
              "Elapsed Time": time.time() - t,
            })
            logger.logkvs(dict(zip(self._val_log_names, val_endpoints)))
            logger.dumpkvs()
      while not self.data_pool.ready_for_train:
        time.sleep(5)
      if self.use_hvd:
        self.barrier()  # synchronize across all hvd learners
    # publish stuff (publish NN model)
    if i % self.pub_interval == 0 and self.should_push_model:
      self._model_pool_apis.push_model(self.read_params(), None, self.model_key,
                                       learner_meta=self.read_opt_params())
    # train one step and logging
    train_fetches = self._train_step()
    train_fetches_list.append(train_fetches)
    if len(train_fetches_list) >= self._print_interval:
      if self.rank == 0:
        train_averaged_fetches = _reduce_mean_axis_zero(train_fetches_list)
        logger.logkvs({
          "n_update": i,
        })
        self._update_timing_logkvs(n_batches=self._print_interval)
        logger.logkvs({
          name: item
          for name, item in zip(self._train_log_names, train_averaged_fetches)
        })
        logger.dumpkvs()
      train_fetches_list = []
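# `_reduce_mean_axis_zero` is not defined in this excerpt. Given how `run` uses
# it (averaging the per-step `train_fetches` collected over `_print_interval`
# updates before logging them against `_train_log_names`), a minimal sketch
# under that assumption:
def _reduce_mean_axis_zero(fetches_list):
  """Element-wise mean over a list of equally-shaped fetch tuples/lists (sketch)."""
  return np.mean(np.asarray(fetches_list, dtype=np.float64), axis=0)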