Example #1
    def _run_env_eval(self, step, do_sampler_step=True, calculate_holdout=True):
        timeit.start('eval')

        ### calculate holdout costs
        self._policy.eval_holdout()

        timeit.stop('eval')
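
Note: every example below times its phases with a shared timeit helper (start/stop/reset plus a readable string form) and reports metrics through an rllab-style tabular logger (record_tabular, dump_tabular, info, debug). Neither module is shown in the snippets. The stub below is only a minimal sketch of the assumed timeit interface so the examples can be read in isolation; it is not the original implementation.

import time
from collections import defaultdict


class _Timeit(object):
    """Minimal stand-in for the timeit helper used in these examples.

    Assumed interface: start(name), stop(name), reset(), and a str()
    form that yields one line per timed phase.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        self._start_times = {}
        self._elapsed = defaultdict(float)

    def start(self, name):
        self._start_times[name] = time.time()

    def stop(self, name):
        # accumulate elapsed time for this phase; start(name) must have been called
        self._elapsed[name] += time.time() - self._start_times.pop(name)

    def __str__(self):
        return '\n'.join('{0}: {1:.2f}s'.format(name, total)
                         for name, total in sorted(self._elapsed.items()))


timeit = _Timeit()
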
Example #2
    def _run_env_eval(self,
                      step,
                      do_sampler_step=True,
                      calculate_holdout=True):
        timeit.start('eval')

        ### add to eval buffer
        if self._sampler_eval and do_sampler_step:
            self._sampler_eval.reset()

            eval_step = step
            num_dones = 0
            while num_dones < self._rollouts_per_eval:
                _, _, _, _, done, _ = \
                    self._sampler_eval.step(eval_step, explore=False)
                eval_step += 1
                num_dones += int(done)

            self._sampler.reset()

        ### calculate holdout costs
        if self._replay_pool_eval.can_sample(
                batch_size=self._batch_size) and calculate_holdout:
            indices, weights, steps, observations, goals, actions, rewards, dones, _ = \
                self._replay_pool_eval.sample(self._batch_size)
            self._policy.eval_holdout(step,
                                      steps=steps,
                                      observations=observations,
                                      goals=goals,
                                      actions=actions,
                                      rewards=rewards,
                                      dones=dones)

        timeit.stop('eval')
Example #3
    def _run_env_step(self, step):
        timeit.start('sample')
        self._sampler.step(
            step,
            take_random_actions=(step < self._onpolicy_after_n_steps),
            explore=True)
        timeit.stop('sample')

        return step
Example #4
    def _run_init(self):
        self._restore()
        # note this is the right step, but the trajectories might not all have been saved
        save_itr = self._fm.get_train_itr()
        start_step = save_itr * self._save_every_n_steps

        self._sampler.reset()

        timeit.reset()
        timeit.start('total')

        return start_step, save_itr
Example #5
    def _run_init_inference(self):
        inference_itr = self._fm.get_inference_itr()
        self._restore_rollouts('train')
        self._restore_rollouts('eval')

        save_itr = inference_itr
        start_step = save_itr * self._save_every_n_steps

        timeit.reset()
        timeit.start('total')

        return start_step, save_itr
Example #6
    def _run_init_train(self):
        train_itr = self._fm.get_train_itr()
        if train_itr > 0:
            logger.info('Restore train iteration {0}'.format(train_itr - 1))
            self._policy.restore(self._fm.train_policy_fname(train_itr - 1), train=True)

        save_itr = train_itr
        start_step = save_itr * self._save_every_n_steps

        timeit.reset()
        timeit.start('total')

        return start_step, save_itr
Example #7
    def _run_init_train(self):
        train_itr = self._fm.get_train_itr()
        if train_itr > 0:
            self._policy.restore(self._fm.train_policy_fname(train_itr - 1),
                                 train=True)

        save_itr = train_itr
        start_step = save_itr * self._save_every_n_steps

        timeit.reset()
        timeit.start('total')

        return start_step, save_itr
Example #8
    def train(self):
        self._start_train_batch()

        logger.info('Training model')

        alg_args = self._params['alg']
        total_steps = int(alg_args['total_steps'])
        save_every_n_steps = int(alg_args['save_every_n_steps'])
        update_target_after_n_steps = int(
            alg_args['update_target_after_n_steps'])
        update_target_every_n_steps = int(
            alg_args['update_target_every_n_steps'])
        log_every_n_steps = int(alg_args['log_every_n_steps'])

        timeit.reset()
        timeit.start('total')
        save_itr = 0
        for step in range(total_steps):
            timeit.start('sample')
            # steps, observations, actions, rewards, dones, _ = self._replay_pool.sample(batch_size)
            steps, observations, actions, rewards, dones, _ = \
                self._batch_queue.get()
            timeit.stop('sample')
            timeit.start('train')
            self._model.train_step(step,
                                   steps=steps,
                                   observations=observations,
                                   actions=actions,
                                   rewards=rewards,
                                   dones=dones,
                                   use_target=True)
            timeit.stop('train')

            ### update target network
            if step > update_target_after_n_steps and step % update_target_every_n_steps == 0:
                self._model.update_target()

            ### log
            if step > 0 and step % log_every_n_steps == 0:
                logger.record_tabular('Step', step)
                self._model.log()
                logger.dump_tabular(print_func=logger.info)

                timeit.stop('total')
                for line in str(timeit).split('\n'):
                    logger.debug(line)
                timeit.reset()
                timeit.start('total')

            ### save model
            if step > 0 and step % save_every_n_steps == 0:
                logger.info('Saving files for itr {0}'.format(save_itr))
                self._save_train_policy(save_itr)
                save_itr += 1

        ### always save the end
        self._save_train_policy(save_itr)

        self._stop_train_batch()
Example #9
 def _run_log(self, step):
     logger.record_tabular('Step', step)
     self._env.log()
     self._replay_pool.log()
     if self._env_eval:
         self._env_eval.log(prefix='Eval')
     if self._replay_pool_eval:
         self._replay_pool_eval.log(prefix='Eval')
     self._policy.log()
     logger.dump_tabular(print_func=logger.info)
     timeit.stop('total')
     for line in str(timeit).split('\n'):
         logger.debug(line)
     timeit.reset()
     timeit.start('total')
Example #10
 def do_train_step():
     timeit.start('batch')
     indices, weights, steps, observations, goals, actions, rewards, dones, _ = \
         self._replay_pool.sample(self._batch_size)
     timeit.stop('batch')
     timeit.start('train')
     rew_errors = self._policy.train_step(step,
                                          steps=steps,
                                          observations=observations,
                                          goals=goals,
                                          actions=actions,
                                          rewards=rewards,
                                          dones=dones,
                                          weights=weights)
     self._replay_pool.update_priorities(indices, rew_errors)
     timeit.stop('train')
Example #11
    def train(self):
        ### restore where we left off
        save_itr = self._restore()

        target_updated = False
        eval_rollouts = []

        self._sampler.reset()
        if self._eval_sampler is not None:
            self._eval_sampler.reset()

        timeit.reset()
        timeit.start('total')
        for step in range(0, self._total_steps, self._sampler.n_envs):
            ### sample and add to buffer
            if step > self._sample_after_n_steps:
                timeit.start('sample')
                self._sampler.step(
                    step,
                    take_random_actions=(step <= self._onpolicy_after_n_steps),
                    explore=True)
                timeit.stop('sample')

            ### sample and DON'T add to buffer (for validation)
            if self._eval_sampler is not None and step > 0 and step % self._eval_every_n_steps == 0:
                timeit.start('eval')
                for _ in range(self._rollouts_per_eval):
                    eval_rollouts_step = []
                    eval_step = step
                    while len(eval_rollouts_step) == 0:
                        self._eval_sampler.step(eval_step, explore=False)
                        eval_rollouts_step = self._eval_sampler.get_recent_paths()
                        eval_step += 1
                    eval_rollouts += eval_rollouts_step
                timeit.stop('eval')

            if step >= self._learn_after_n_steps:
                ### training step
                if self._train_every_n_steps >= 1:
                    if step % int(self._train_every_n_steps) == 0:
                        timeit.start('batch')
                        steps, observations, goals, actions, rewards, dones, _ = \
                            self._sampler.sample(self._batch_size)
                        timeit.stop('batch')
                        timeit.start('train')
                        self._policy.train_step(step,
                                                steps=steps,
                                                observations=observations,
                                                goals=goals,
                                                actions=actions,
                                                rewards=rewards,
                                                dones=dones,
                                                use_target=target_updated)
                        timeit.stop('train')
                else:
                    for _ in range(int(1. / self._train_every_n_steps)):
                        timeit.start('batch')
                        steps, observations, goals, actions, rewards, dones, _ = \
                            self._sampler.sample(self._batch_size)
                        timeit.stop('batch')
                        timeit.start('train')
                        self._policy.train_step(step,
                                                steps=steps,
                                                observations=observations,
                                                goals=goals,
                                                actions=actions,
                                                rewards=rewards,
                                                dones=dones,
                                                use_target=target_updated)
                        timeit.stop('train')

                ### update target network
                if step > self._update_target_after_n_steps and step % self._update_target_every_n_steps == 0:
                    self._policy.update_target()
                    target_updated = True

                ### log
                if step % self._log_every_n_steps == 0:
                    logger.record_tabular('Step', step)
                    self._sampler.log()
                    self._eval_sampler.log(prefix='Eval')
                    self._policy.log()
                    logger.dump_tabular(print_func=logger.info)
                    timeit.stop('total')
                    for line in str(timeit).split('\n'):
                        logger.debug(line)
                    timeit.reset()
                    timeit.start('total')

            ### save model
            if step > 0 and step % self._save_every_n_steps == 0:
                logger.info('Saving files for itr {0}'.format(save_itr))
                self._save(save_itr, self._sampler.get_recent_paths(),
                           eval_rollouts)
                save_itr += 1
                eval_rollouts = []

        self._save(save_itr, self._sampler.get_recent_paths(), eval_rollouts)
Example #12
    def inference(self):
        ### restore where we left off
        self._restore_inference()
        inference_itr = self._get_inference_itr()
        inference_step = self._get_inference_step()
        train_itr = self._get_train_itr()

        self._run_rsync()

        train_rollouts = []
        eval_rollouts = []

        self._inference_reset_sampler()

        timeit.reset()
        timeit.start('total')
        while True:
            train_step = self._get_train_step()
            if inference_step > self._total_steps:
                break

            ### sample and add to buffer
            if inference_step > self._sample_after_n_steps:
                timeit.start('sample')
                inference_step = self._inference_step(inference_step)
                timeit.stop('sample')
            else:
                inference_step += self._sampler.n_envs

            ### sample and DON'T add to buffer (for validation)
            if self._eval_sampler is not None and inference_step > 0 and inference_step % self._eval_every_n_steps == 0:
                timeit.start('eval')
                eval_rollouts_step = []
                eval_step = inference_step
                while len(eval_rollouts_step) == 0:
                    self._eval_sampler.step(eval_step, explore=False)
                    eval_rollouts_step = self._eval_sampler.get_recent_paths()
                    eval_step += 1
                eval_rollouts += eval_rollouts_step
                timeit.stop('eval')

            ### log
            if inference_step % self._log_every_n_steps == 0:
                logger.info('train itr {0:04d} inference itr {1:04d}'.format(
                    train_itr, inference_itr))
                logger.record_tabular('Train step', train_step)
                logger.record_tabular('Inference step', inference_step)
                self._sampler.log()
                if self._eval_sampler:
                    self._eval_sampler.log(prefix='Eval')
                logger.dump_tabular(print_func=logger.info)
                timeit.stop('total')
                for line in str(timeit).split('\n'):
                    logger.debug(line)
                timeit.reset()
                timeit.start('total')

            ### save rollouts / load model
            train_rollouts += self._sampler.get_recent_paths()
            if inference_step > 0 and inference_step % self._inference_save_every_n_steps == 0:
                self._inference_reset_sampler()

                ### save rollouts
                logger.debug('Saving files for itr {0}'.format(inference_itr))
                self._save_inference(inference_itr, train_rollouts,
                                     eval_rollouts)
                inference_itr += 1
                train_rollouts = []
                eval_rollouts = []

                ### load model
                with self._rsync_lock:  # to ensure the ckpt has been fully transferred over
                    new_train_itr = self._get_train_itr()
                    if train_itr < new_train_itr:
                        logger.debug('Loading policy for itr {0}'.format(
                            new_train_itr - 1))
                        try:
                            self._policy.restore(
                                self._inference_policy_file_name(
                                    new_train_itr - 1),
                                train=False)
                            train_itr = new_train_itr
                        except Exception:
                            logger.debug(
                                'Failed to load model for itr {0}'.format(
                                    new_train_itr - 1))
                            self._policy.restore(
                                self._inference_policy_file_name(train_itr - 1),
                                train=False)
                            logger.debug('As backup, restored itr {0}'.format(
                                train_itr - 1))

        self._save_inference(inference_itr, self._sampler.get_recent_paths(),
                             eval_rollouts)
Example #13
    def train(self):
        ### restore where we left off
        init_inference_step = len(self._sampler)  # don't count offpolicy
        self._restore_train()
        train_itr = self._get_train_itr()
        train_step = self._get_train_step()
        inference_itr = self._get_inference_itr()

        target_updated = False

        timeit.reset()
        timeit.start('total')
        while True:
            inference_step = len(self._sampler) - init_inference_step
            if inference_step > self._total_steps or train_step > self._train_total_steps:
                break

            if inference_step >= self._learn_after_n_steps:
                ### training step
                train_step += 1
                timeit.start('batch')
                steps, observations, goals, actions, rewards, dones, _ = \
                    self._sampler.sample(self._batch_size)
                timeit.stop('batch')
                timeit.start('train')
                self._policy.train_step(train_step,
                                        steps=steps,
                                        observations=observations,
                                        goals=goals,
                                        actions=actions,
                                        rewards=rewards,
                                        dones=dones,
                                        use_target=target_updated)
                timeit.stop('train')

                ### update target network
                if train_step > self._update_target_after_n_steps and train_step % self._update_target_every_n_steps == 0:
                    self._policy.update_target()
                    target_updated = True

                ### log
                if train_step % self._log_every_n_steps == 0:
                    logger.info(
                        'train itr {0:04d} inference itr {1:04d}'.format(
                            train_itr, inference_itr))
                    logger.record_tabular('Train step', train_step)
                    logger.record_tabular('Inference step', inference_step)
                    self._policy.log()
                    logger.dump_tabular(print_func=logger.info)
                    timeit.stop('total')
                    for line in str(timeit).split('\n'):
                        logger.debug(line)
                    timeit.reset()
                    timeit.start('total')
            else:
                time.sleep(1)

            ### save model
            if train_step > 0 and train_step % self._train_save_every_n_steps == 0:
                logger.debug('Saving files for itr {0}'.format(train_itr))
                self._save_train(train_itr)
                train_itr += 1

            ### reset model
            if train_step > 0 and self._train_reset_every_n_steps is not None and \
                                    train_step % self._train_reset_every_n_steps == 0:
                logger.debug('Resetting model')
                self._policy.reset_weights()

            ### load data
            inference_itr = self._train_load_data(inference_itr)
Example #14
    def inference(self):
        ### restore where we left off
        self._restore_inference()
        inference_itr = self._get_inference_itr()
        inference_step = self._get_inference_step()
        train_itr = self._get_train_itr()

        self._run_rsync()

        assert (self._eval_sampler is None)  # TODO: temporary
        train_rollouts = []
        eval_rollouts = []

        self._reset_sampler()

        timeit.reset()
        timeit.start('total')
        while True:
            train_step = self._get_train_step()
            if inference_step > self._total_steps:
                break

            ### sample and add to buffer
            if inference_step > self._sample_after_n_steps:
                timeit.start('sample')
                try:
                    self._sampler.step(
                        inference_step,
                        take_random_actions=(
                            inference_step <= self._learn_after_n_steps
                            or inference_step <= self._onpolicy_after_n_steps),
                        explore=True)
                    inference_step += self._sampler.n_envs
                except Exception as e:
                    logger.warn('Sampler exception {0}'.format(str(e)))
                    trashed_steps = self._sampler.trash_current_rollouts()
                    inference_step -= trashed_steps
                    logger.warn('Trashed {0} steps'.format(trashed_steps))
                    while not self._env.ros_is_good(
                            print=False):  # TODO hard coded
                        time.sleep(0.25)
                    self._reset_sampler()
                    logger.warn('Continuing...')
                timeit.stop('sample')
            else:
                inference_step += self._sampler.n_envs

            ### sample and DON'T add to buffer (for validation)
            if self._eval_sampler is not None and inference_step > 0 and inference_step % self._eval_every_n_steps == 0:
                timeit.start('eval')
                eval_rollouts_step = []
                eval_step = inference_step
                while len(eval_rollouts_step) == 0:
                    self._eval_sampler.step(eval_step, explore=False)
                    eval_rollouts_step = self._eval_sampler.get_recent_paths()
                    eval_step += 1
                eval_rollouts += eval_rollouts_step
                timeit.stop('eval')

            ### log
            if inference_step % self._log_every_n_steps == 0:
                logger.info('train itr {0:04d} inference itr {1:04d}'.format(
                    train_itr, inference_itr))
                logger.record_tabular('Train step', train_step)
                logger.record_tabular('Inference step', inference_step)
                self._sampler.log()
                if self._eval_sampler:
                    self._eval_sampler.log(prefix='Eval')
                logger.dump_tabular(print_func=logger.info)
                timeit.stop('total')
                for line in str(timeit).split('\n'):
                    logger.debug(line)
                timeit.reset()
                timeit.start('total')

            ### save rollouts / load model
            train_rollouts += self._sampler.get_recent_paths()
            if inference_step > 0 and inference_step % self._inference_save_every_n_steps == 0 and \
                            len(train_rollouts) > 0:
                response = input('Keep rollouts?')
                if response != 'y':
                    train_rollouts = []
                    continue

                ### reset to stop rollout
                self._sampler.reset()

                ### save rollouts
                logger.debug('Saving files for itr {0}'.format(inference_itr))
                self._save_inference(inference_itr, train_rollouts,
                                     eval_rollouts)
                inference_itr += 1
                train_rollouts = []
                eval_rollouts = []

                ### load model
                with self._rsync_lock:  # to ensure the ckpt has been fully transferred over
                    new_train_itr = self._get_train_itr()
                    if train_itr < new_train_itr:
                        logger.debug('Loading policy for itr {0}'.format(
                            new_train_itr - 1))
                        try:
                            self._policy.restore(
                                self._inference_policy_file_name(
                                    new_train_itr - 1),
                                train=False)
                            train_itr = new_train_itr
                        except Exception:
                            logger.debug(
                                'Failed to load model for itr {0}'.format(
                                    new_train_itr - 1))
                            self._policy.restore(
                                self._inference_policy_file_name(train_itr - 1),
                                train=False)
                            logger.debug('As backup, restored itr {0}'.format(
                                train_itr - 1))

        self._save_inference(inference_itr, self._sampler.get_recent_paths(),
                             eval_rollouts)
Example #15
 def do_train_step():
     timeit.start('train')
     self._policy.train_step(step)
     timeit.stop('train')
Example #16
    def train(self):
        ### restore where we left off
        self._restore_train()
        train_itr = self._get_train_itr()
        train_step = self._get_train_step()
        inference_itr = self._get_inference_itr()
        init_inference_step = len(self._sampler)

        target_updated = False

        timeit.reset()
        timeit.start('total')
        while True:
            inference_step = len(self._sampler) - init_inference_step
            if inference_step > self._total_steps:
                break

            if inference_step >= self._learn_after_n_steps:
                ### update preprocess
                if train_step % self._update_preprocess_every_n_steps == 0:
                    self._policy.update_preprocess(self._sampler.statistics)

                ### training step
                train_step += 1
                timeit.start('batch')
                batch = self._sampler.sample(self._batch_size)
                timeit.stop('batch')
                timeit.start('train')
                self._policy.train_step(train_step,
                                        *batch,
                                        use_target=target_updated)
                timeit.stop('train')

                ### update target network
                if train_step > self._update_target_after_n_steps and train_step % self._update_target_every_n_steps == 0:
                    self._policy.update_target()
                    target_updated = True

                ### log
                if train_step % self._log_every_n_steps == 0:
                    logger.info(
                        'train itr {0:04d} inference itr {1:04d}'.format(
                            train_itr, inference_itr))
                    logger.record_tabular('Train step', train_step)
                    logger.record_tabular('Inference step', inference_step)
                    self._policy.log()
                    logger.dump_tabular(print_func=logger.info)
                    timeit.stop('total')
                    for line in str(timeit).split('\n'):
                        logger.debug(line)
                    timeit.reset()
                    timeit.start('total')
            else:
                time.sleep(1)

            ### save model
            if train_step > 0 and train_step % self._train_save_every_n_steps == 0:
                logger.debug('Saving files for itr {0}'.format(train_itr))
                self._save_train(train_itr)
                train_itr += 1

            ### reset model
            if train_step > 0 and self._train_reset_every_n_steps is not None and \
                                    train_step % self._train_reset_every_n_steps == 0:
                logger.debug('Resetting model')
                self._policy.reset_weights()

            ### load data
            new_inference_itr = self._get_inference_itr()
            if inference_itr < new_inference_itr:
                for i in range(inference_itr, new_inference_itr):
                    try:
                        logger.debug('Loading files for itr {0}'.format(i))
                        self._sampler.add_rollouts(
                            [self._train_rollouts_file_name(i)])
                        inference_itr = i + 1
                    except Exception:
                        logger.debug(
                            'Failed to load files for itr {0}'.format(i))
Example #17
 def _run_save(self, save_itr):
     timeit.start('save')
     logger.info('Saving files for itr {0}'.format(save_itr))
     self._save(save_itr, self._replay_pool.get_recent_rollouts(),
                self._replay_pool_eval.get_recent_rollouts())
     timeit.stop('save')