Code Example #1
    def train_batch(self, start_epoch):
        self._current_path_builder = PathBuilder()
        for epoch in gt.timed_for(
                range(start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self._start_epoch(epoch)
            set_to_train_mode(self.training_env)
            observation = self._start_new_rollout()
            # This implementation is rather naive. If you want to (e.g.)
            # parallelize data collection, this would be the place to do it.
            for _ in range(self.num_env_steps_per_epoch):
                observation = self._take_step_in_env(observation)
            gt.stamp('sample')

            self._try_to_train()
            gt.stamp('train')

            set_to_eval_mode(self.env)
            self._try_to_eval(epoch)
            gt.stamp('eval')

            self._try_to_fit(epoch)
            gt.stamp('env_fit')
            self.logger.record_tabular(self.env_loss_key, self.env_loss)

            self._end_epoch(epoch)
            self.logger.dump_tabular(with_prefix=False, with_timestamp=False)
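Every example on this page follows the same gtimer ("gt") idiom: wrap the epoch loop in gt.timed_for(..., save_itrs=True), call gt.stamp(name) after each phase of the iteration, and read the timings back through gt.get_times() or gt.report(). Below is a minimal, self-contained sketch of that idiom, assuming only that the gtimer package is installed; toy_loop and the time.sleep calls are illustrative placeholders, not code from any of the projects listed here.

import time

import gtimer as gt


def toy_loop(num_epochs=3):
    gt.reset()                # clear timing state from any previous run
    gt.set_def_unique(False)  # allow the same stamp name to repeat across iterations

    for epoch in gt.timed_for(range(num_epochs), save_itrs=True):
        time.sleep(0.01)      # stand-in for environment sampling
        gt.stamp('sample')

        time.sleep(0.02)      # stand-in for gradient updates
        gt.stamp('train')

    # Per-stamp, per-iteration durations, as the loggers in these examples read them.
    itrs = gt.get_times().stamps.itrs
    print('last train iteration:', itrs['train'][-1], 'seconds')
    print(gt.report(include_itrs=False))


if __name__ == '__main__':
    toy_loop()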
Code Example #2
    def train_online(self, start_epoch=0):
        self._current_path_builder = PathBuilder()
        for epoch in gt.timed_for(
                range(start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self._start_epoch(epoch)
            set_to_train_mode(self.training_env)
            observation = self._start_new_rollout()
            for _ in range(self.num_env_steps_per_epoch):
                observation = self._take_step_in_env(observation)
                gt.stamp('sample')

                self._try_to_fit(epoch)
                gt.stamp('env_fit')

                self._try_to_train()
                gt.stamp('train')

            self.logger.record_tabular(self.env_loss_key, self.env_loss)

            set_to_eval_mode(self.env)
            self._try_to_eval(epoch)
            gt.stamp('eval')
            self._end_epoch(epoch)

            self.logger.dump_tabular(with_prefix=False, with_timestamp=False)
Code Example #3
    def train(self):
        '''
        meta-training loop
        '''
        self.pretrain()
        params = self.get_epoch_snapshot(-1)
        logger.save_itr_params(-1, params)
        gt.reset()
        gt.set_def_unique(False)
        self._current_path_builder = PathBuilder()

        # at each iteration, we first collect data from tasks, perform meta-updates, then try to evaluate
        for it_ in gt.timed_for(
                range(self.num_iterations),
                save_itrs=True,
        ):
            self._start_epoch(it_)
            self.training_mode(True)
            if it_ == 0:
                print('collecting initial pool of data for train and eval')
                # temp for evaluating
                for idx in self.train_tasks:
                    self.task_idx = idx
                    self.env.reset_task(idx)
                    self.collect_data(self.num_initial_steps, 1, np.inf)
            # Sample data from train tasks.
            for i in range(self.num_tasks_sample):
                idx = np.random.randint(len(self.train_tasks))
                self.task_idx = idx
                self.env.reset_task(idx)
                self.enc_replay_buffer.task_buffers[idx].clear()

                # collect some trajectories with z ~ prior
                if self.num_steps_prior > 0:
                    self.collect_data(self.num_steps_prior, 1, np.inf)
                # collect some trajectories with z ~ posterior
                if self.num_steps_posterior > 0:
                    self.collect_data(self.num_steps_posterior, 1,
                                      self.update_post_train)
                # even if encoder is trained only on samples from the prior, the policy needs to learn to handle z ~ posterior
                if self.num_extra_rl_steps_posterior > 0:
                    self.collect_data(self.num_extra_rl_steps_posterior,
                                      1,
                                      self.update_post_train,
                                      add_to_enc_buffer=False)

            # Sample train tasks and compute gradient updates on parameters.
            for train_step in range(self.num_train_steps_per_itr):
                indices = np.random.choice(self.train_tasks, self.meta_batch)
                self._do_training(indices)
                self._n_train_steps_total += 1
            gt.stamp('train')

            self.training_mode(False)

            # eval
            self._try_to_eval(it_)
            gt.stamp('eval')

            self._end_epoch()
Code Example #4
File: rl_algorithm.py Project: simitii/rlkit
    def train_online(self, start_epoch=0):
        if not self.environment_farming:
            observation = self._start_new_rollout()
        self._current_path_builder = PathBuilder()
        for epoch in gt.timed_for(
                range(start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self._start_epoch(epoch)

            for _ in range(self.num_env_steps_per_epoch):
                if not self.environment_farming:
                    observation = self.play_one_step(observation)
                else:
                    # acquire a remote environment
                    remote_env = self.farmer.force_acq_env()
                    self.play_ignore(remote_env)

                # Training out of threads
                self._try_to_train()
                gt.stamp('train')

            if epoch % 10 == 0:
                self._try_to_eval(epoch)
            gt.stamp('eval')
            self._end_epoch()
Code Example #5
    def train_online(self, start_epoch=0):
        self._current_path_builder = PathBuilder()

        observation = self._start_new_rollout()
        self.sample_z = self.sample_z_vec()
        observation = np.concatenate([observation, self.sample_z])

        for epoch in gt.timed_for(
                range(start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self._start_epoch(epoch)
            set_to_train_mode(self.training_env)
            for t in range(self.num_env_steps_per_epoch):
                #print("step", t, "pool", self.replay_buffer.num_steps_can_sample())

                observation = self._take_step_in_env(observation)
                gt.stamp('sample')

                self._try_to_train()
                gt.stamp('train')

            set_to_eval_mode(self.env)
            self._try_to_eval(epoch)
            gt.stamp('eval')
            self._end_epoch(epoch)
Code Example #6
File: mbmrl.py Project: NagisaZj/model-based-meta-rl
    def debug(self):
        gt.reset()
        gt.set_def_unique(False)

        for i in gt.timed_for(range(self.iteration_num), save_itrs=True):
            self._start_iteration(i)

            if i % self.task_sample_frequency == 0:
                self.logger.log('Data Collection')
                task = self._sample_task()
                rollouts = self._collect_traj(task, debug=True)
                self._n_rollouts_total += 1
                self.dataset.extend(rollouts)
            gt.stamp('sample')

            self.logger.log('Adaptation Update')

            for _ in range(self.adaptation_update_num):
                trajs = self._sample_traj(debug=True)
                self.theta_loss = self._compute_adaptation_loss(
                    self.theta, trajs)
                self._meta_update(self.theta_loss)

            gt.stamp('adaptation')
            gt.stamp('meta')

            if i % self.eval_frequency == 0:
                self.logger.log('Evaluation')
                self.evaluate()
            gt.stamp('eval')

            self._end_iteration(i)
Code Example #7
    def train_online(self, start_epoch=0):
        # No need for training mode to be True when generating trajectories:
        # training mode is automatically set to True in _try_to_train, and
        # before exiting that function it is reverted to False.
        self.training_mode(False)
        self._current_path_builder = PathBuilder()
        self._n_rollouts_total = 0

        for epoch in gt.timed_for(
                range(start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self._start_epoch(epoch)
            print('EPOCH STARTED')
            # print('epoch')
            for _ in range(self.num_rollouts_per_epoch):
                # print('rollout')
                task_params, obs_task_params = self.train_task_params_sampler.sample()
                self.generate_exploration_rollout(
                    task_params=task_params, obs_task_params=obs_task_params)

                # print(self._n_rollouts_total)
                if self._n_rollouts_total % self.num_rollouts_between_updates == 0:
                    gt.stamp('sample')
                    # print('train')
                    if not self.do_not_train: self._try_to_train(epoch)
                    gt.stamp('train')

            if not self.do_not_eval:
                self._try_to_eval(epoch)
                gt.stamp('eval')

            self._end_epoch()
Code Example #8
File: rl_algorithm.py Project: jcoreyes/rlkit
    def train_batch(self, start_epoch):
        self._current_path_builder = PathBuilder()

        for epoch in gt.timed_for(
                range(start_epoch, self.num_epochs),
                save_itrs=True,
        ):

            self._start_epoch(epoch)
            set_to_train_mode(self.training_env)
            observation = self._start_new_rollout()
            # This implementation is rather naive. If you want to (e.g.)
            # parallelize data collection, this would be the place to do it.
            for i in range(self.num_env_steps_per_epoch):
                observation, terminal = self._take_step_in_env(observation)

                #print(i, terminal)
            assert terminal[0] == True
            gt.stamp('sample')

            self._try_to_train()
            gt.stamp('train')

            set_to_eval_mode(self.env)
            #print(i, terminal)
            self._try_to_eval(epoch)
            gt.stamp('eval')
            self._end_epoch(epoch)
Code Example #9
File: trainer.py Project: npitsillos/deepRLalgos
    def _train(self):

        for epoch in gt.timed_for(
                range(self._start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self.eval_path_collector.collect_new_paths(
                self.max_path_length,
                self.num_eval_steps_per_epoch,
                discard_incomplete_paths=True,
            )
            gt.stamp('evaluation sampling')

            new_expl_paths = self.expl_path_collector.collect_new_paths(
                self.max_path_length,
                self.num_expl_steps_per_train_loop,
                discard_incomplete_paths=False,
            )
            gt.stamp('exploration sampling', unique=False)
            self.replay_buffer.add_paths(new_expl_paths)
            gt.stamp('data storing', unique=False)

            self.training_mode(True)
            train_data = self.replay_buffer.random_batch(self.batch_size)
            self.algo.train(train_data)
            gt.stamp('training', unique=False)
            self.training_mode(False)

            self._end_epoch(epoch)
Code Example #10
def experiment(variant):
    cuda = True
    from gym.envs.mujoco import HalfCheetahEnv
    from mujoco_torch.core.bridge import MjCudaRender
    R = 84
    env = HalfCheetahEnv()
    c = Convnet(6, output_activation=torch.tanh, input_channels=3)
    if cuda:
        c.cuda()

    gt.stamp("start")
    for i in range(100):
        img = env.sim.render(R, R, device_id=1)

    gt.stamp("warmstart")
    for i in gt.timed_for(range(1000)):
        env.step(np.random.rand(6))
        gt.stamp('step')

        img = env.sim.render(R, R, device_id=1)
        gt.stamp('render')

        x = np_to_var(img)
        if cuda:
            x = x.cuda()
            torch.cuda.synchronize()
        gt.stamp('transfer')
        # cv2.imshow("img", img)
        # cv2.waitKey(1)
    gt.stamp("end")

    print(img)

    print(gt.report(include_itrs=False))
Code Example #11
    def _train(self, env, policy, pool):
        """Perform RL training.

        Args:
            env (`rllab.Env`): Environment used for training
            policy (`Policy`): Policy used for training
            pool (`PoolBase`): Sample pool to add samples to
        """
        self._init_training()
        self.sampler.initialize(env, policy, pool)

        evaluation_env = deep_clone(env) if self._eval_n_episodes else None
        # TODO: use Ezpickle to deep_clone???
        # evaluation_env = env

        with tf_utils.get_default_session().as_default():
            gt.rename_root('RLAlgorithm')
            gt.reset()
            gt.set_def_unique(False)

            for epoch in gt.timed_for(
                    range(self._n_epochs + 1), save_itrs=True):
                logger.push_prefix('Epoch #%d | ' % epoch)

                for t in range(self._epoch_length):
                    self.sampler.sample()
                    if not self.sampler.batch_ready():
                        continue
                    gt.stamp('sample')

                    for i in range(self._n_train_repeat):
                        self._do_training(
                            iteration=t + epoch * self._epoch_length,
                            batch=self.sampler.random_batch())
                    gt.stamp('train')

                self._evaluate(policy, evaluation_env)
                gt.stamp('eval')

                params = self.get_snapshot(epoch)
                logger.save_itr_params(epoch, params)

                time_itrs = gt.get_times().stamps.itrs
                time_eval = time_itrs['eval'][-1]
                time_total = gt.get_times().total
                time_train = time_itrs.get('train', [0])[-1]
                time_sample = time_itrs.get('sample', [0])[-1]

                logger.record_tabular('time-train', time_train)
                logger.record_tabular('time-eval', time_eval)
                logger.record_tabular('time-sample', time_sample)
                logger.record_tabular('time-total', time_total)
                logger.record_tabular('epoch', epoch)

                self.sampler.log_diagnostics()

                logger.dump_tabular(with_prefix=False)
                logger.pop_prefix()

            self.sampler.terminate()
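This example (like Code Example #24 below) also pulls the per-iteration stamp durations back out of gt.get_times() to fill the tabular log. Here is a minimal sketch of just that read-out, under the same assumptions as the sketch above; the stamp names, the time.sleep calls, and the printed dictionary are placeholders, and the .get(name, [0]) default mirrors the guard this example uses for stamps that may not have fired yet.

import time

import gtimer as gt

gt.reset()
gt.set_def_unique(False)

for epoch in gt.timed_for(range(2), save_itrs=True):
    time.sleep(0.01)
    gt.stamp('sample')

    time.sleep(0.02)
    gt.stamp('train')

    # Same pattern as the logger calls above: take the latest iteration's
    # duration for each stamp, defaulting to [0] if the stamp has not been hit.
    itrs = gt.get_times().stamps.itrs
    print({
        'time-sample': itrs.get('sample', [0])[-1],
        'time-train': itrs.get('train', [0])[-1],
        'time-total': gt.get_times().total,
        'epoch': epoch,
    })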
Code Example #12
    def train(self, start_epoch=0):
        # Get snapshot of initial algo state
        if start_epoch == 0:
            self._log_initial_data()

        self.training_mode(False)
        self._n_env_steps_total = start_epoch * self.num_train_steps_per_epoch

        gt.reset()
        gt.set_def_unique(False)

        self._current_path = PathBuilder()
        for epoch in gt.timed_for(
                range(start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self._start_epoch(epoch)
            obs = self._start_new_rollout()
            for ss in range(self.num_train_steps_per_epoch):
                obs = self._take_step_in_env(obs)
                gt.stamp('sample')

                if self._algo_mode == 'online':
                    self._try_to_train()
                    gt.stamp('train')

            if self._algo_mode == 'episode':
                self._try_to_train()
                gt.stamp('train')

            self._try_to_eval(epoch)
            gt.stamp('eval')
            self._end_epoch(epoch)
Code Example #13
    def _train(self):
        self.training_mode(False)

        for epoch in gt.timed_for(range(self._start_epoch, self.num_epochs),
                                  save_itrs=True):
            print(
                f"in train, with eval to go: {self.num_eval_steps_per_epoch}")
            for step in range(self.num_eval_steps_per_epoch):

                self.eval_data_collector.collect_one_step(
                    step, self.num_eval_steps_per_epoch)
            gt.stamp("evaluation sampling")
            print("done with eval")

            for _ in range(self.num_train_loops_per_epoch):
                # this if check could be moved inside the function
                if self.use_linear_lr_decay:
                    # decrease learning rate linearly
                    self.trainer.decay_lr(epoch, self.num_epochs)

                for step in range(self.num_expl_steps_per_train_loop):
                    self.expl_data_collector.collect_one_step(
                        step, self.num_expl_steps_per_train_loop)
                    # time.sleep(1)

                gt.stamp("exploration sampling", unique=False)

                rollouts = self.expl_data_collector.get_rollouts()
                gt.stamp("data storing", unique=False)
                self.training_mode(True)
                self.trainer.train(rollouts)
                gt.stamp("training", unique=False)
                self.training_mode(False)

            self._end_epoch(epoch)
Code Example #14
    def train_online(self, start_epoch=0):
        # No need for training mode to be True when generating trajectories:
        # training mode is automatically set to True in _try_to_train, and
        # before exiting that function it is reverted to False.
        self.training_mode(False)
        self._current_path_builder = PathBuilder()

        for epoch in gt.timed_for(
                range(start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self._start_epoch(epoch)
            for _ in range(self.num_rollouts_per_epoch):
                task_params, obs_task_params = self.train_task_params_sampler.sample()
                self.generate_exploration_rollout(
                    task_params=task_params, obs_task_params=obs_task_params)

            # essentially in each epoch we gather data then do a certain amount of training
            gt.stamp('sample')
            if not self.do_not_train: self._try_to_train()
            gt.stamp('train')

            if epoch % self.freq_eval == 0:
                # and then we evaluate it
                if not self.do_not_eval: self._try_to_eval(epoch)
                gt.stamp('eval')

            self._end_epoch()
Code Example #15
    def start_training(self, start_epoch=0):
        for epoch in gt.timed_for(
                range(start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self._start_epoch(epoch)
            steps_this_epoch = 0
            steps_since_train_call = 0
            while steps_this_epoch < self.min_steps_per_epoch:
                task_params = self.train_task_params_sampler.sample()
                rollout_len = self.do_task_rollout(task_params)

                steps_this_epoch += rollout_len
                steps_since_train_call += rollout_len

                if steps_since_train_call > self.min_steps_between_train_calls:
                    steps_since_train_call = 0
                    gt.stamp('sample')
                    self._try_to_train(epoch)
                    gt.stamp('train')

            gt.stamp('sample')
            self._try_to_eval(epoch)
            gt.stamp('eval')
            self._end_epoch()
Code Example #16
    def _train(self):
        # Pretrain the model at the beginning of training until convergence
        # Note that convergence is measured against a holdout set of max size 8192
        if self.train_at_start:
            self.model_trainer.train_from_buffer(
                self.replay_buffer,
                max_grad_steps=self.model_max_grad_steps,
                epochs_since_last_update=self.model_epochs_since_last_update,
            )
        gt.stamp('model training', unique=False)

        for epoch in gt.timed_for(
            range(self._start_epoch, self.num_epochs),
            save_itrs=True,
        ):
            self.eval_data_collector.collect_new_paths(
                self.max_path_length,
                self.num_eval_steps_per_epoch,
                discard_incomplete_paths=True,
            )
            gt.stamp('evaluation sampling')

            self.training_mode(True)
            for _ in range(self.num_train_loops_per_epoch):
                for t in range(self.num_trains_per_train_loop):
                    train_data = self.replay_buffer.random_batch(self.batch_size)
                    self.trainer.train(train_data)
                    gt.stamp('policy training', unique=False)
            self.training_mode(False)

            self._end_epoch(epoch)
Code Example #17
    def _train(self):
        if self.min_num_steps_before_training > 0:
            init_expl_paths = self.expl_data_collector.collect_new_paths(
                self.max_path_length,
                self.min_num_steps_before_training,
            )
            self.replay_buffer.add_paths(init_expl_paths)
            self.expl_data_collector.end_epoch(-1)

        for epoch in gt.timed_for(
                range(self._start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self.eval_data_collector.collect_new_paths(
                self.max_path_length,
                self.num_eval_steps_per_epoch,
            )
            gt.stamp('evaluation sampling')

            for _ in range(self.num_train_loops_per_epoch):
                new_expl_paths = self.expl_data_collector.collect_new_paths(
                    self.max_path_length,
                    self.num_expl_steps_per_train_loop,
                )
                gt.stamp('exploration sampling', unique=False)

                self.replay_buffer.add_paths(new_expl_paths)
                gt.stamp('data storing', unique=False)

                for _ in range(self.num_trains_per_train_loop):
                    train_data = self.replay_buffer.random_batch(self.batch_size)
                    self.trainer.train(train_data)
                gt.stamp('training', unique=False)

            self._end_epoch(epoch)
Code Example #18
    def train_online(self, start_epoch=0):
        self._current_path_builder = PathBuilder()
        if self.epoch_list is not None:
            iters = list(self.epoch_list)
        else:
            iters = list(range(start_epoch, self.num_epochs, self.epoch_freq))
        if self.num_epochs - 1 not in iters and self.num_epochs - 1 > iters[-1]:
            iters.append(self.num_epochs - 1)
        for epoch in gt.timed_for(
                iters,
                save_itrs=True,
        ):
            self._start_epoch(epoch)
            env_utils.mode(self.training_env, 'train')
            observation = self._start_new_rollout()
            for _ in range(self.num_env_steps_per_epoch):
                if self.do_training:
                    observation = self._take_step_in_env(observation)

                gt.stamp('sample')
                self._try_to_train()
                gt.stamp('train')
            env_utils.mode(self.env, 'eval')
            # TODO steven: move dump_tabular to be conditionally called in
            # end_epoch and move post_epoch after eval
            self._post_epoch(epoch)
            self._try_to_eval(epoch)
            gt.stamp('eval')
            self._end_epoch()
Code Example #19
    def train(self, start_epoch=0):

        batch_idxes = np.arange(self.num_tasks)
        batch_idxes = np.concatenate([batch_idxes[self.train_goal_id:], batch_idxes[:self.train_goal_id]])

        for epoch in gt.timed_for(
                trange(start_epoch, self.num_epochs),
                save_itrs=True,
        ):

            # Sample meta training tasks. And transfer the
            # transitions sampling job to each remote replay buffer.

            train_batch_obj_id = self.train_buffer.sample_training_data(batch_idxes)

            for _ in range(self.num_train_loops_per_epoch):
                train_raw_batch = ray.get(train_batch_obj_id)
                gt.stamp('sample_training_data', unique=False)

                # In this way, we can start the data sampling job for the
                # next training while doing training for the current loop.
                train_batch_obj_id = self.train_buffer.sample_training_data(batch_idxes)
                gt.stamp('set_up_sampling', unique=False)

                train_data = self.construct_training_batch(train_raw_batch)
                gt.stamp('construct_training_batch', unique=False)
                
                self.trainer.train(train_data, batch_idxes, epoch)
                
            gt.stamp('training', unique=False)

            self._end_epoch(epoch)
Code Example #20
    def _train(self):
        """Called by superclass BaseRLAlgorithm, conducts the training loop.

        Before training (i.e., for the minimum number of steps before training),
        get new paths for _exploration_, with noise added (in the case of DDPG),
        and add the paths to the replay buffer.

        Then we begin the actual cycle of evaluation and exploration. Each
        epoch consists of an evaluator data collector collecting paths
        (discarding incomplete ones), and then exploration data collection, and
        only exploration data is added to the buffer. The number of training
        loops is 1 by default so usually it will be one cycle of (evaluate,
        explore). Each explore, though, will do a bunch of training loops,
        e.g., 1000 by default.

        When we talk about 'steps' we really mean training (or exploration)
        steps; the evaluation steps are only used for reporting results.
        """
        if self.min_num_steps_before_training > 0:
            init_expl_paths = self.expl_data_collector.collect_new_paths(
                self.max_path_length,
                self.min_num_steps_before_training,
                discard_incomplete_paths=False,
            )
            self.replay_buffer.add_paths(init_expl_paths)
            self.expl_data_collector.end_epoch(-1)

        for epoch in gt.timed_for(
                range(self._start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self.eval_data_collector.collect_new_paths(
                self.max_path_length,
                self.num_eval_steps_per_epoch,
                discard_incomplete_paths=True,
            )
            gt.stamp('evaluation sampling')

            for _ in range(self.num_train_loops_per_epoch):
                new_expl_paths = self.expl_data_collector.collect_new_paths(
                    self.max_path_length,
                    self.num_expl_steps_per_train_loop,
                    discard_incomplete_paths=False,
                )
                gt.stamp('exploration sampling', unique=False)

                self.replay_buffer.add_paths(new_expl_paths)
                gt.stamp('data storing', unique=False)

                self.training_mode(True)
                for _ in range(self.num_trains_per_train_loop):
                    train_data = self.replay_buffer.random_batch(
                        self.batch_size)
                    self.trainer.train(train_data)
                gt.stamp('training', unique=False)
                self.training_mode(False)

            self._end_epoch(epoch)
Code Example #21
def experiment(variant):

    root = 0

    E = 20
    R = 84
    U = 6
    cuda = True

    envs = []

    for e in range(E):
        env = HalfCheetahEnv()
        envs.append(env)

    c = Convnet(6, output_activation=torch.tanh, input_channels=3)
    if cuda:
        c.cuda()

    # viewer = mujoco_py.MjRenderContextOffscreen(env.sim, device_id=1)
    # env.sim.add_render_context(viewer)

    def step(i, stamp=True):
        imgs = []
        if i % 100 == 0:
            for e in envs:
                e.reset()
        for e in envs:
            img = e.sim.render(R, R, device_id=0).transpose()
            imgs.append(img)
        gt.stamp('render') if stamp else 0

        imgs = np.array(imgs)

        torch_img = np_to_var(imgs)
        if cuda:
            torch_img = torch_img.cuda()
            torch.cuda.synchronize()
        gt.stamp('transfer') if stamp else 0

        u = get_numpy(c.forward(torch_img).cpu())
        torch.cuda.synchronize()
        gt.stamp('forward') if stamp else 0

        for i, e in enumerate(envs):
            e.step(u[i, :])
        gt.stamp('step') if stamp else 0

    for i in range(10):
        step(i, False)

    gt.stamp('start')
    for i in gt.timed_for(range(100)):
        step(i)
    gt.stamp('end')

    print(gt.report(include_itrs=False, format_options=dict(itr_num_width=10)))
Code Example #22
    def main_loop(self, max_epochs):
        # populate replay buffer before training
        for epoch in gt.timed_for(range(max_epochs)):
            if epoch % self.args.eval_every == 0:
                self.eval_step(
                    epoch
                )  # somehow this is not deterministic for decentralized?
            self.train_step(epoch)
        self.finish_training()
Code Example #23
    def _train(self, env, policy, pool):
        """Perform RL training.

        Args:
            env (`rllab.Env`): Environment used for training
            policy (`Policy`): Policy used for training
            pool (`PoolBase`): Sample pool to add samples to
        """

        self._init_training(env, policy, pool)
        self.sampler.initialize(env, policy, pool)

        with self._sess.as_default():
            gt.rename_root('RLAlgorithm')
            gt.reset()
            gt.set_def_unique(False)

            for epoch in gt.timed_for(range(self._n_epochs + 1),
                                      save_itrs=True):
                logger.push_prefix('Epoch #%d | ' % epoch)

                for t in range(self._epoch_length):
                    # TODO.codeconsolidation: Add control interval to sampler
                    self.sampler.sample()
                    if not self.sampler.batch_ready():
                        continue
                    gt.stamp('sample')

                    for i in range(self._n_train_repeat):
                        self._do_training(iteration=t +
                                          epoch * self._epoch_length,
                                          batch=self.sampler.random_batch())
                    gt.stamp('train')

                self._evaluate(epoch)

                params = self.get_snapshot(epoch)
                logger.save_itr_params(epoch, params)
                times_itrs = gt.get_times().stamps.itrs

                eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
                total_time = gt.get_times().total
                logger.record_tabular('time-train', times_itrs['train'][-1])
                logger.record_tabular('time-eval', eval_time)
                logger.record_tabular('time-sample', times_itrs['sample'][-1])
                logger.record_tabular('time-total', total_time)
                logger.record_tabular('epoch', epoch)

                self.sampler.log_diagnostics()

                logger.dump_tabular(with_prefix=False)
                logger.pop_prefix()

                gt.stamp('eval')

            self.sampler.terminate()
Code Example #24
File: rl_algorithm.py Project: sra4077/softqlearning
    def _train(self, env, policy, pool):
        """Perform RL training.

        Args:
            env (`rllab.Env`): Environment used for training
            policy (`Policy`): Policy used for training
            pool (`PoolBase`): Sample pool to add samples to
        """
        self._init_training()
        self.sampler.initialize(env, policy, pool)

        evaluation_env = deep_clone(env) if self._eval_n_episodes else None

        with tf_utils.get_default_session().as_default():
            gt.rename_root('RLAlgorithm')
            gt.reset()
            gt.set_def_unique(False)

            for epoch in gt.timed_for(
                    range(self._n_epochs + 1), save_itrs=True):
                logger.push_prefix('Epoch #%d | ' % epoch)

                for t in range(self._epoch_length):
                    self.sampler.sample()
                    if not self.sampler.batch_ready():
                        continue
                    gt.stamp('sample')

                    for i in range(self._n_train_repeat):
                        self._do_training(
                            iteration=t + epoch * self._epoch_length,
                            batch=self.sampler.random_batch())
                    gt.stamp('train')

                self._evaluate(policy, evaluation_env)
                gt.stamp('eval')

                params = self.get_snapshot(epoch)
                logger.save_itr_params(epoch, params)

                time_itrs = gt.get_times().stamps.itrs
                time_eval = time_itrs['eval'][-1]
                time_total = gt.get_times().total
                time_train = time_itrs.get('train', [0])[-1]
                time_sample = time_itrs.get('sample', [0])[-1]

                logger.record_tabular('time-train', time_train)
                logger.record_tabular('time-eval', time_eval)
                logger.record_tabular('time-sample', time_sample)
                logger.record_tabular('time-total', time_total)
                logger.record_tabular('epoch', epoch)

                self.sampler.log_diagnostics()

                logger.dump_tabular(with_prefix=False)
                logger.pop_prefix()
Code Example #25
    def train(self):
        """Negative epochs are offline, positive epochs are online"""
        for self.epoch in gt.timed_for(
                range(self._start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self.offline_rl = self.epoch < 0
            self._begin_epoch(self.epoch)
            self._train()
            self._end_epoch(self.epoch)
Code Example #26
    def train(self):
        if self.min_num_steps_before_training > 0:
            for _ in range(0, self.min_num_steps_before_training,
                           self.max_path_length):
                patch_trajectory = rollout(self.expl_env, self.trainer.policy,
                                           self.trainer.qf1, self.trainer.qf2,
                                           self.max_path_length,
                                           self.rnn_seq_len)
                self.replay_buffer.add_trajectory(patch_trajectory)

        for epoch in gt.timed_for(
                range(self._start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            rewards, seen_area, total_rotate, right_rotate = eval_rollout(
                self.eval_env, self.trainer.eval_policy, epoch,
                self.num_eval_steps_per_epoch)
            self.writer.add_scalar('eval/mean_reward', np.mean(rewards), epoch)
            self.writer.add_scalar('eval/mean_sean_area', np.mean(seen_area),
                                   epoch)
            self.writer.add_scalar('eval/max_reward', np.max(rewards), epoch)
            self.writer.add_scalar('eval/max_sean_area', np.max(seen_area),
                                   epoch)
            self.writer.add_scalar('eval/min_reward', np.min(rewards), epoch)
            self.writer.add_scalar('eval/min_sean_area', np.min(seen_area),
                                   epoch)
            self.writer.add_scalar(
                'eval/mean_rotate_ratio',
                abs(0.5 - np.sum(right_rotate) / np.sum(total_rotate)), epoch)

            gt.stamp('evaluation_sampling', unique=False)

            for _ in range(self.num_train_loops_per_epoch):
                for _ in range(0, self.num_expl_steps_per_train_loop,
                               self.max_path_length):
                    patch_trajectory = rollout(self.expl_env,
                                               self.trainer.policy,
                                               self.trainer.qf1,
                                               self.trainer.qf2,
                                               self.max_path_length,
                                               self.rnn_seq_len)
                    gt.stamp('exploration sampling', unique=False)

                    self.replay_buffer.add_trajectory(patch_trajectory)
                    gt.stamp('data storing', unique=False)

                self.training_mode(True)
                for _ in range(self.num_trains_per_train_loop):
                    train_batch_data = self.replay_buffer.random_batch(
                        self.batch_size)
                    self.trainer.train(train_batch_data)
                gt.stamp('training', unique=False)
                self.training_mode(False)

            self._end_epoch()
Code Example #27
    def _train(self):
        if self.min_num_steps_before_training > 0:
            init_expl_paths = self.expl_data_collector.collect_new_paths(
                self.max_path_length,
                self.min_num_steps_before_training,
                discard_incomplete_paths=False,
            )
            self.replay_buffer.add_paths(init_expl_paths)
            self.expl_data_collector.end_epoch(-1)
            self.estimate_obs_stats(init_expl_paths[0]['observations'],
                                    init_flag=True)

        for epoch in gt.timed_for(
                range(self._start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self.eval_data_collector.collect_normalized_new_paths(
                self.max_path_length,
                self.num_eval_steps_per_epoch,
                discard_incomplete_paths=True,
                input_mean=self._obs_mean,
                input_std=self._obs_std,
            )
            gt.stamp('evaluation sampling')

            for _ in range(self.num_train_loops_per_epoch):
                new_expl_paths = self.expl_data_collector.collect_normalized_new_paths(
                    self.max_path_length,
                    self.num_expl_steps_per_train_loop,
                    discard_incomplete_paths=False,
                    input_mean=self._obs_mean,
                    input_std=self._obs_std,
                )
                gt.stamp('exploration sampling', unique=False)

                self.replay_buffer.add_paths(new_expl_paths)
                gt.stamp('data storing', unique=False)

                self.training_mode(True)
                for _ in range(self.num_trains_per_train_loop):
                    train_data = self.replay_buffer.random_batch(
                        self.batch_size)
                    self.estimate_obs_stats(train_data['observations'],
                                            init_flag=False)
                    train_data['observations'] = self.apply_normalize_obs(
                        train_data['observations'])
                    self.trainer.train(train_data)
                gt.stamp('training', unique=False)
                self.training_mode(False)

            self._end_epoch(epoch)
            if self.save_frequency > 0:
                if epoch % self.save_frequency == 0:
                    self.trainer.save_models(epoch)
                    self.replay_buffer.save_buffer(epoch)
Code Example #28
    def _train(self):
        if self.min_num_steps_before_training > 0:
            init_expl_paths = self.expl_data_collector.collect_new_paths(
                self.max_path_length,
                self.min_num_steps_before_training,
                discard_incomplete_paths=False,
            )
            self.replay_buffer.add_paths(init_expl_paths)
            self.expl_data_collector.end_epoch(-1)

        for epoch in gt.timed_for(
                range(self._start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            # curriculum update - get the new probabilities
            if epoch >= self.min_ep_curriculum and epoch % self.curr_update_freq == 0:
                gt.stamp('curriculum update')
                self.proba = self.curr_fn(self.trainer.policy,
                                     self.trainer.qf1,
                                     self.trainer.qf2,
                                     self.evaluation_env,
                                     **self.curr_kwargs)
                self.evaluation_env.set_init_proba(self.proba)
                self.exploration_env.set_init_proba(self.proba)

                self.all_probas[epoch] = self.proba

            # Eval step
            self.curr_state = self.eval_data_collector.collect_new_paths(
                self.max_path_length,
                self.num_eval_steps_per_epoch,
                discard_incomplete_paths=True,
            )
            gt.stamp('evaluation sampling')

            for _ in range(self.num_train_loops_per_epoch):
                new_expl_paths = self.expl_data_collector.collect_new_paths(
                    self.max_path_length,
                    self.num_expl_steps_per_train_loop,
                    discard_incomplete_paths=False,
                )
                gt.stamp('exploration sampling', unique=False)

                self.replay_buffer.add_paths(new_expl_paths)
                gt.stamp('data storing', unique=False)

                self.training_mode(True)
                for _ in range(self.num_trains_per_train_loop):
                    train_data = self.replay_buffer.random_batch(
                        self.batch_size)
                    self.trainer.train(train_data)
                gt.stamp('training', unique=False)
                self.training_mode(False)

            self._end_epoch(epoch)
Code Example #29
    def _train(self):
        st = time.time()
        if self.min_num_steps_before_training > 0:
            init_expl_paths = self.expl_data_collector.collect_new_paths(
                self.max_path_length,
                self.min_num_steps_before_training,
                runtime_policy=self.pretrain_policy,
            )
            self.replay_buffer.add_paths(init_expl_paths)
            self.expl_data_collector.end_epoch(-1)
        self.total_train_expl_time += time.time() - st
        self.trainer.buffer = self.replay_buffer  # TODO: find a cleaner way of doing this
        self.training_mode(True)
        for _ in range(self.num_pretrain_steps):
            train_data = self.replay_buffer.random_batch(self.batch_size)
            self.trainer.train(train_data)
        self.training_mode(False)

        for epoch in gt.timed_for(
            range(self._start_epoch, self.num_epochs),
            save_itrs=True,
        ):
            self.eval_data_collector.collect_new_paths(
                self.max_path_length,
                self.num_eval_steps_per_epoch,
            )
            gt.stamp("evaluation sampling")
            st = time.time()
            for _ in range(self.num_train_loops_per_epoch):
                new_expl_paths = self.expl_data_collector.collect_new_paths(
                    self.max_path_length,
                    self.num_expl_steps_per_train_loop,
                )
                gt.stamp("exploration sampling", unique=False)

                self.replay_buffer.add_paths(new_expl_paths)
                gt.stamp("data storing", unique=False)

                self.training_mode(True)
                for train_step in range(self.num_trains_per_train_loop):
                    train_data = self.replay_buffer.random_batch(self.batch_size)
                    self.trainer.train(train_data)
                gt.stamp("training", unique=False)
                self.training_mode(False)

            if self.eval_buffer:
                eval_data = self.eval_buffer.random_batch(self.batch_size)
                self.trainer.evaluate(eval_data, buffer_data=False)
                eval_data = self.replay_buffer.random_batch(self.batch_size)
                self.trainer.evaluate(eval_data, buffer_data=True)
            self.total_train_expl_time += time.time() - st

            self._end_epoch(epoch)
Code Example #30
    def _train(self):
        if self.min_num_steps_before_training > 0:
            init_expl_paths = self.expl_data_collector.collect_new_paths(
                self.max_path_length,
                self.min_num_steps_before_training,
                discard_incomplete_paths=False,
            )
            self.replay_buffer.add_paths(init_expl_paths)
            self.expl_data_collector.end_epoch(-1)

            self._fit_input_stats()

        for epoch in gt.timed_for(
                range(self._start_epoch, self.num_epochs),
                save_itrs=True,
        ):
            self.eval_data_collector.collect_new_paths(
                self.max_path_length,
                self.num_eval_steps_per_epoch,
                discard_incomplete_paths=True,
            )
            gt.stamp('evaluation sampling')

            self.training_mode(True)
            if self.replay_buffer.num_steps_can_sample() > 0:
                self.model_trainer.train_from_buffer(
                    self.replay_buffer,
                    max_grad_steps=self.model_max_grad_steps,
                    epochs_since_last_update=self.model_epochs_since_last_update,
                )
            gt.stamp('model training', unique=False)

            for _ in range(self.num_train_loops_per_epoch):
                new_expl_paths = self.expl_data_collector.collect_new_paths(
                    self.max_path_length,
                    self.num_expl_steps_per_train_loop,
                    discard_incomplete_paths=False,
                )
                gt.stamp('exploration sampling', unique=False)

                self.replay_buffer.add_paths(new_expl_paths)
                gt.stamp('data storing', unique=False)

                self.training_mode(True)
                for _ in range(self.num_trains_per_train_loop):
                    self.trainer.train_from_paths(new_expl_paths)
                gt.stamp('training', unique=False)
                self.training_mode(False)

            self._fit_input_stats()

            self._end_epoch(epoch)
Code Example #31
def experiment(variant):
    E = 10
    R = 84
    cuda = True

    envs = []

    renderer = MjCudaRender(R, R)


    for e in range(E):
        env = HalfCheetahEnv()
        envs.append(env)
    c = Convnet(6, output_activation=torch.tanh, input_channels=3)
    if cuda:
        c.cuda()

    # viewer = mujoco_py.MjRenderContextOffscreen(env.sim, device_id=1)
    # env.sim.add_render_context(viewer)

    def step(stamp=True):
        imgs = []
        if i % 100 == 0:
            for e in range(E):
                envs[e].reset()
        for e in range(E):
            # img = renderer.get_cuda_tensor(envs[e].sim)
            img = envs[e].sim.render(R, R, device_id=1).transpose()
        gt.stamp('render') if stamp else 0

        # imgs =np.array(imgs)
        # torch_img = np_to_var(imgs)
        # if cuda:
        #     torch_img = torch_img.cuda()
        #     torch.cuda.synchronize()
        # gt.stamp('transfer') if stamp else 0

        # u = get_numpy(c.forward(torch_img).cpu())
        # torch.cuda.synchronize()
        # gt.stamp('forward') if stamp else 0

        # for e in range(E):
        #     envs[e].step(u[e, :])
        # gt.stamp('step') if stamp else 0

    for i in range(10):
        step(False)

    gt.stamp('start')
    for i in gt.timed_for(range(100)):
        step()
    gt.stamp('end')
Code Example #32
    def _train(self, env, policy, pool):
        """Perform RL training.

        Args:
            env (`rllab.Env`): Environment used for training
            policy (`Policy`): Policy used for training
            pool (`PoolBase`): Sample pool to add samples to
        """

        self._init_training(env, policy, pool)

        with self._sess.as_default():
            observation = env.reset()
            policy.reset()

            path_length = 0
            path_return = 0
            last_path_return = 0
            max_path_return = -np.inf
            n_episodes = 0
            gt.rename_root('RLAlgorithm')
            gt.reset()
            gt.set_def_unique(False)

            for epoch in gt.timed_for(
                    range(self._n_epochs + 1), save_itrs=True):
                logger.push_prefix('Epoch #%d | ' % epoch)

                if self.iter_callback is not None:
                    self.iter_callback(locals(), globals())

                for t in range(self._epoch_length):
                    iteration = t + epoch * self._epoch_length

                    action, _ = policy.get_action(observation)
                    next_ob, reward, terminal, info = env.step(action)
                    path_length += 1
                    path_return += reward

                    self.pool.add_sample(
                        observation,
                        action,
                        reward,
                        terminal,
                        next_ob,
                    )

                    if terminal or path_length >= self._max_path_length:
                        observation = env.reset()
                        policy.reset()
                        path_length = 0
                        max_path_return = max(max_path_return, path_return)
                        last_path_return = path_return

                        path_return = 0
                        n_episodes += 1

                    else:
                        observation = next_ob
                    gt.stamp('sample')

                    if self.pool.size >= self._min_pool_size:
                        for i in range(self._n_train_repeat):
                            batch = self.pool.random_batch(self._batch_size)
                            self._do_training(iteration, batch)

                    gt.stamp('train')

                self._evaluate(epoch)

                params = self.get_snapshot(epoch)
                logger.save_itr_params(epoch, params)
                times_itrs = gt.get_times().stamps.itrs

                eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
                total_time = gt.get_times().total
                logger.record_tabular('time-train', times_itrs['train'][-1])
                logger.record_tabular('time-eval', eval_time)
                logger.record_tabular('time-sample', times_itrs['sample'][-1])
                logger.record_tabular('time-total', total_time)
                logger.record_tabular('epoch', epoch)
                logger.record_tabular('episodes', n_episodes)
                logger.record_tabular('max-path-return', max_path_return)
                logger.record_tabular('last-path-return', last_path_return)
                logger.record_tabular('pool-size', self.pool.size)

                logger.dump_tabular(with_prefix=False)
                logger.pop_prefix()

                gt.stamp('eval')

            env.terminate()