Example 1
 def log_kvs(self, reward_sum, info):
     """Log per-episode stats (throughput, return, episode length) and flush them."""
     time_end = time.time()
     logger.logkvs({
         'producing_fps': self._steps / (time_end - self.time_beg),
         'reward_sum': reward_sum,
         'episode_steps': self._steps,
     })
     if self.should_log_info:  # log additional info fields
         logger.logkvs(info)
     logger.dumpkvs()
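The helper above follows a baselines-style key-value logger: logkvs buffers a dict of scalars and dumpkvs flushes them as one row. A minimal, self-contained sketch of that pattern (the KVLogger class and the episode numbers below are hypothetical stand-ins, not part of the framework):

import time

class KVLogger:
    """Hypothetical stand-in for the key-value logger used in the excerpts."""
    def __init__(self):
        self._kvs = {}

    def logkvs(self, d):
        # buffer a dict of key-value pairs
        self._kvs.update(d)

    def dumpkvs(self):
        # flush the buffered pairs as one row and clear the buffer
        print(' | '.join('{}: {}'.format(k, v) for k, v in sorted(self._kvs.items())))
        self._kvs.clear()

logger = KVLogger()
time_beg = time.time()
steps, reward_sum = 300, 12.5  # made-up episode stats
time_end = time.time()
logger.logkvs({
    'producing_fps': steps / max(time_end - time_beg, 1e-8),  # guard against zero elapsed time
    'reward_sum': reward_sum,
    'episode_steps': steps,
})
logger.dumpkvs()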
Example 2
    def _rollout_an_episode(self):
        """ perform roullout until one episode done. Data are put in _data_queue,
     which will be sent to remote in a separate thread """
        self._steps = 0

        self.task = self._request_task()
        logger.log('episode begins, task: {}'.format(str(self.task)))

        # obs = self.env.reset()
        obs = self.obs_space.sample()  # stub: sample a fake observation instead of resetting the env
        for agt, ob in zip(self.agents, obs):
            agt.reset(ob)
        self._update_agents_model(self.task)

        me_id = self._learning_agent_id  # short name
        reward_sum = 0.0
        time_beg = time.time()
        self._update_hyperparam(self.task)
        while True:
            self._steps += 1
            output = self.agents[me_id].act(obs[me_id])
            action, other_vars = output[0], output[1:]
            oppo_actions = [
                agt.step(ob)
                for agt, ob in zip(self.agents[me_id + 1:], obs[me_id + 1:])
            ]
            last_obs = obs
            # stubbed env transition: sampled obs, zero rewards, fixed 300-step episode
            obs = self.obs_space.sample()
            reward = [np.zeros(shape=(17,)), np.zeros(shape=(17,))]
            done = self._steps == 300
            info = {'outcome': [0, 0]}
            # obs, reward, done, info = self.env.step([action] + oppo_actions)

            rwd = self._reward_shape(reward[me_id])
            reward_sum += rwd
            if self._enable_push:
                if self._data_queue.full():
                    logger.log("Actor's queue is full.", level=logger.WARN)
                rwd_to_push = rwd if self.rwd_shape else reward[me_id]
                self._data_queue.put((last_obs, tuple([action] + oppo_actions),
                                      rwd_to_push, info, done, other_vars))
                logger.log('successfully put one tuple.', level=logger.DEBUG)

            if self._steps % self._log_interval_steps == 0:
                logger.log('_rollout_an_episode,',
                           'steps: {},'.format(self._steps),
                           'data qsize: {}'.format(self._data_queue.qsize()))

            if done:
                time_end = time.time()
                logger.logkvs({
                    'producing_fps': self._steps / (time_end - time_beg),
                    'reward_sum': reward_sum,
                    'episode_steps': self._steps,
                })
                if self.should_log_info:  # log additional info fields
                    logger.logkvs(info)
                logger.dumpkvs()
                if 'outcome' not in info:
                    me_outcome = -95678
                    logger.log("info['outcome'] not available",
                               'return an arbitrary value',
                               me_outcome,
                               level=logger.WARN)
                else:
                    me_outcome = info['outcome'][me_id]
                return me_outcome

            if self._update_model_freq and self._steps % self._update_model_freq == 0:
                if (self._enable_push and self._remote.pull_model_attr(
                        'freezetime', self.task.model_key1) is not None):
                    # Current task (learning period) finishes, start a new task or continue
                    self._finish_task(self.task, None)  # notify early abort
                    last_task = self.task
                    self.task = self._request_task()  # try to continue
                    if not is_inherit(last_task.model_key1,
                                      self.task.model_key1):
                        time_end = time.time()
                        logger.logkvs({
                            'producing_fps': self._steps / (time_end - time_beg),
                            'reward_sum': reward_sum,
                            'episode_steps': self._steps,
                        })
                        if self.should_log_info:  # log additional info fields
                            logger.logkvs(info)
                        logger.dumpkvs()
                        return None
                self._update_agents_model(self.task)
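In the push branch above, self._data_queue.full() is checked only to emit a warning; with a standard bounded queue the subsequent put still blocks until a consumer frees a slot. A minimal sketch of that producer pattern, assuming _data_queue behaves like a standard queue.Queue (the names below are illustrative, not the framework's):

import logging
import queue

logging.basicConfig(level=logging.DEBUG)
data_queue = queue.Queue(maxsize=8)  # bounded queue standing in for _data_queue

def push_transition(transition):
    """Push one rollout transition; warn (but still block) when the queue is full."""
    if data_queue.full():
        logging.warning("Actor's queue is full.")
    data_queue.put(transition)  # blocks until a consumer frees a slot
    logging.debug('successfully put one tuple.')

push_transition(('last_obs', ('action',), 0.0, {}, False, ()))
print(data_queue.qsize())  # -> 1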
Example 3
 def _run_train_loop(self, nbatch):
     lr = as_func(self.task.hyperparam.learning_rate)
     cliprange = as_func(self.task.hyperparam.cliprange)
     lam = self.task.hyperparam.lam  # lambda for the td-lambda term
     weights = None
     if self.rwd_shape:
         assert hasattr(self.task.hyperparam, 'reward_weights')
         weights = np.array(self.task.hyperparam.reward_weights,
                            dtype=np.float32)
         if len(weights.shape) == 1:
             weights = np.expand_dims(weights, 0)
     self.total_timesteps = getattr(self.task.hyperparam, 'total_timesteps',
                                    self.total_timesteps)
     burn_in_timesteps = 0
     if self._need_burn_in:
         burn_in_timesteps = getattr(self.task.hyperparam,
                                     'burn_in_timesteps',
                                     self.burn_in_timesteps)
     nupdates_burn_in = int(burn_in_timesteps // nbatch)
     nupdates = nupdates_burn_in + int(self.total_timesteps // nbatch)
     mblossvals = []
     tfirststart = time.time()
     tstart = time.time()
     total_samples = self._data_server.unroll_num * self.unroll_length
     logger.log('Start Training')
     for update in xrange(1, nupdates + 1):
         frac = 1.0 - (update - 1.0) / nupdates
         lrnow = lr(frac)
         cliprangenow = cliprange(frac)
         if update <= nupdates_burn_in:
             mblossvals.append(
                 self.burn_in(lrnow, cliprangenow, lam, weights))
         else:
             mblossvals.append(
                 self.train_batch(lrnow, cliprangenow, lam, weights))
         # publish models
         if update % self.pub_interval == 0 and self.should_push_model:
             self._model_pool_apis.push_model(
                 self.read_params(),
                 self.task.hyperparam,
                 self.model_key,
                 learner_meta=self.read_opt_params())
         # logging stuff
         if update % self.log_interval == 0 or update == 1:
             lossvals = np.mean(mblossvals, axis=0)
             mblossvals = []
             tnow = time.time()
             consuming_fps = int(nbatch * min(update, self.log_interval) /
                                 (tnow - tstart))
             time_elapsed = tnow - tfirststart
             total_samples_now = self._data_server.unroll_num * self.unroll_length
             receiving_fps = (total_samples_now - total_samples) / (tnow -
                                                                    tstart)
             total_samples = total_samples_now
             tstart = time.time()
             # 'scope_name/var' style for grouping Tab in Tensorboard webpage
             # lp is short for Learning Period
             scope = 'lp{}/'.format(self._lrn_period_count)
             logger.logkvs({
                 scope + "lrn_period_count": self._lrn_period_count,
                 scope + "burn_in_value": update <= nupdates_burn_in,
                 scope + "nupdates": update,
                 scope + "total_timesteps": update * nbatch,
                 scope + "all_consuming_fps": consuming_fps,
                 scope + 'time_elapsed': time_elapsed,
                 scope + "total_samples": total_samples,
                 scope + "receiving_fps": receiving_fps,
                 scope + "aband_samples":
                     self._data_server.aband_unroll_num * self.unroll_length,
             })
             logger.logkvs({
                 scope + lossname: lossval
                 for lossname, lossval in zip(self.loss_names, lossvals)
             })
             logger.dumpkvs()
         if self.save_interval and (update % self.save_interval == 0
                                    or update == 1) and logger.get_dir():
             checkdir = osp.join(logger.get_dir(), 'checkpoints')
             os.makedirs(checkdir, exist_ok=True)
             savepath = osp.join(checkdir, '%.5i' % update)
             logger.log('Saving checkpoint to', savepath)
             self.save(savepath)
     if self.should_push_model:
         self._model_pool_apis.push_model(
             self.read_params(),
             self.task.hyperparam,
             self.model_key,
             learner_meta=self.read_opt_params())
     return
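In _run_train_loop the learning rate and clip range are wrapped with as_func and evaluated at frac = 1.0 - (update - 1.0) / nupdates, so a constant hyperparameter and a schedule share the same call signature. A small sketch of that annealing pattern, assuming as_func behaves like the constfn helper in baselines-style PPO code (the implementation below is an assumption, not the framework's):

def as_func(val_or_fn):
    """Wrap a constant into a function of the remaining-progress fraction."""
    if callable(val_or_fn):
        return val_or_fn
    return lambda frac: val_or_fn  # constant: ignore frac

lr = as_func(2.5e-4)                           # constant learning rate
cliprange = as_func(lambda frac: 0.1 * frac)   # linearly annealed clip range

nupdates = 4
for update in range(1, nupdates + 1):
    frac = 1.0 - (update - 1.0) / nupdates     # 1.0 -> 0.25 as training proceeds
    print(update, lr(frac), cliprange(frac))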
    def run(self):
        if self.should_push_model:
            self._model_pool_apis.push_model(
                self.read_params(),
                None,
                self.model_key,
                learner_meta=self.read_opt_params())
        self.tstart = time.time()
        self.tfirststart = self.tstart
        self.total_samples = self._total_samples()
        train_fetches_list, elapsed_time = [], 0
        for i in range(self._num_sgd_updates):
            # checkpoint stuff (saving, validation, etc.)
            if i % self._checkpoint_interval == 0:
                if self._checkpoints_dir is not None:
                    self._saver._save_model_checkpoint(self._checkpoints_dir,
                                                       "checkpoint_%s" % i)
                if self._enable_validation:
                    # TODO(pengsun): completely disable validation when not using
                    while not self.data_pool.ready_for_val:
                        time.sleep(5)
                    if self.use_hvd:
                        self.barrier()  # synchronize across all hvd learners
                    # do validation and logging
                    t = time.time()
                    val_endpoints = self._validate()
                    if self.rank == 0:
                        with logger.scoped_configure(logger=self.val_logger):
                            logger.logkvs({
                                "n_update": i,
                                "Elapsed Time": time.time() - t
                            })
                            logger.logkvs(
                                dict(zip(self._val_log_names, val_endpoints)))
                            logger.dumpkvs()

                while not self.data_pool.ready_for_train:
                    time.sleep(5)
                if self.use_hvd:
                    self.barrier()  # synchronize across all hvd learners
            # publish stuff (publish NN model)
            if i % self.pub_interval == 0 and self.should_push_model:
                self._model_pool_apis.push_model(
                    self.read_params(),
                    None,
                    self.model_key,
                    learner_meta=self.read_opt_params())
            # train one step and logging
            train_fetches = self._train_step()
            train_fetches_list.append(train_fetches)
            if len(train_fetches_list) >= self._print_interval:
                if self.rank == 0:
                    train_averaged_fetches = _reduce_mean_axis_zero(
                        train_fetches_list)
                    logger.logkvs({
                        "n_update": i,
                    })
                    self._update_timing_logkvs(n_batches=self._print_interval)
                    logger.logkvs({
                        name: item
                        for name, item in zip(self._train_log_names,
                                              train_averaged_fetches)
                    })
                    logger.dumpkvs()
                train_fetches_list = []
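The run loop above averages train_fetches_list with _reduce_mean_axis_zero before logging, but that helper's body is not shown. A plausible sketch, assuming each element of the list is an equally-shaped sequence of scalar fetches (this implementation is inferred from the name, not taken from the framework):

import numpy as np

def _reduce_mean_axis_zero(fetches_list):
    """Element-wise mean over a list of equally-shaped fetch sequences."""
    return np.mean(np.asarray(fetches_list, dtype=np.float64), axis=0)

# e.g. three SGD steps, each returning (policy_loss, value_loss, entropy)
print(_reduce_mean_axis_zero([(0.3, 1.2, 0.9),
                              (0.2, 1.0, 0.8),
                              (0.1, 0.8, 0.7)]))  # -> [0.2 1.  0.8]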