Example #1
    def optimize_policy(self, all_samples_data, log=True):
        """
        Performs MAML outer step

        Args:
            all_samples_data (list) : list of lists of lists of samples (each is a dict) split by gradient update and
             meta task
            log (bool) : whether to log statistics

        Returns:
            None
        """
        meta_op_input_dict = self._extract_input_dict_meta_op(
            all_samples_data, self._optimization_keys)

        if log: logger.log("Optimizing")
        loss_before = self.optimizer.optimize(
            input_val_dict=meta_op_input_dict)

        if log: logger.log("Computing statistics")
        loss_after = self.optimizer.loss(input_val_dict=meta_op_input_dict)

        if log:
            logger.logkv('LossBefore', loss_before)
            logger.logkv('LossAfter', loss_after)
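
The snippet assumes an optimizer whose optimize() returns the pre-update loss and whose loss() re-evaluates it afterwards. Below is a minimal stand-alone sketch of that bookkeeping, using a toy gradient-step optimizer and a plain dict in place of logger.logkv (both hypothetical, not from the source):

    import numpy as np

    _kvs = {}
    def logkv(key, value):          # stand-in for logger.logkv
        _kvs[key] = value

    class ToyOptimizer:
        """Gradient descent on f(x) = 0.5 * ||x - target||^2."""
        def __init__(self, x, target, lr=0.1):
            self.x, self.target, self.lr = x, target, lr
        def loss(self):
            return 0.5 * float(np.sum((self.x - self.target) ** 2))
        def optimize(self):
            loss_before = self.loss()
            self.x -= self.lr * (self.x - self.target)   # one gradient step
            return loss_before                            # same contract as the snippet above

    opt = ToyOptimizer(np.zeros(3), np.ones(3))
    loss_before = opt.optimize()
    loss_after = opt.loss()
    logkv('LossBefore', loss_before)
    logkv('LossAfter', loss_after)
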
Example #2
    def log_diagnostics(self, paths, prefix=''):
        dist = [-path["env_infos"]['reward_dist'] for path in paths]
        final_dist = [-path["env_infos"]['reward_dist'][-1] for path in paths]
        # ctrl_cost = [-path["env_infos"]['reward_ctrl'] for path in paths]

        logger.logkv(prefix + 'AvgDistance', np.mean(dist))
        logger.logkv(prefix + 'AvgFinalDistance', np.mean(final_dist))
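
For context (an assumption, not shown in the source): each path is a dict whose env_infos entries hold one value per timestep, so the first aggregate averages over every timestep while the second only looks at the final one. A tiny runnable illustration with made-up data:

    import numpy as np

    # Hypothetical paths: env_infos holds one value per timestep.
    paths = [
        {"env_infos": {"reward_dist": -np.array([0.9, 0.5, 0.2])}},
        {"env_infos": {"reward_dist": -np.array([1.1, 0.7, 0.4])}},
    ]
    dist = [-path["env_infos"]["reward_dist"] for path in paths]            # all timesteps
    final_dist = [-path["env_infos"]["reward_dist"][-1] for path in paths]  # last timestep only
    print("AvgDistance", np.mean(dist))          # mean over every timestep of every path
    print("AvgFinalDistance", np.mean(final_dist))
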
Example #3
 def log_diagnostics(self, paths, prefix=''):
     """
     Log extra information per iteration based on the collected paths
     """
     log_stds = np.vstack(
         [path["agent_infos"]["log_std"] for path in paths])
     logger.logkv(prefix + 'AveragePolicyStd', np.mean(np.exp(log_stds)))
Example #4
    def step(self):
        time_step = time.time()
        """ -------------------- Sampling --------------------------"""

        if self.verbose:
            logger.log("Policy is obtaining samples ...")
        paths = self.model_sampler.obtain_samples(log=True,
                                                  log_prefix='Policy-')
        """ ----------------- Processing Samples ---------------------"""

        if self.verbose:
            logger.log("Policy is processing samples ...")
        samples_data = self.model_sample_processor.process_samples(
            paths, log='all', log_prefix='Policy-')

        if type(paths) is list:
            self.log_diagnostics(paths, prefix='Policy-')
        else:
            self.log_diagnostics(sum(paths.values(), []), prefix='Policy-')
        """ ------------------ Policy Update ---------------------"""

        if self.verbose:
            logger.log("Policy optimization...")
        # This needs to take all samples_data so that it can construct graph for meta-optimization.
        self.algo.optimize_policy(samples_data,
                                  log=True,
                                  verbose=self.verbose,
                                  prefix='Policy-')

        self.policy = self.model_sampler.policy
        time_step = time.time() - time_step

        logger.logkv('Policy-TimeStep', time_step)
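
All of these step()/_synch()/push() methods use the same wall-clock idiom: take a start timestamp, overwrite it with the elapsed time, and log it under a prefixed key. A small self-contained sketch of that idiom, wrapped in a context manager for reuse (the timed helper and the plain dict are hypothetical, not part of the source):

    import time
    from contextlib import contextmanager

    @contextmanager
    def timed(key, kvs):
        """Measure the wall-clock time of a block, mirroring the time.time() idiom above."""
        start = time.time()
        yield
        kvs[key] = time.time() - start

    kvs = {}
    with timed('Policy-TimeStep', kvs):
        time.sleep(0.01)            # placeholder for sampling / processing / optimization
    print(kvs)
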
Example #5
    def _synch(self, samples_data_arr, check_init=False):
        time_synch = time.time()
        if self.verbose:
            logger.log('Model at {} is synchronizing...'.format(
                self.itr_counter))
        obs = np.concatenate([
            samples_data['observations'] for samples_data in samples_data_arr
        ])
        act = np.concatenate(
            [samples_data['actions'] for samples_data in samples_data_arr])
        obs_next = np.concatenate([
            samples_data['next_observations']
            for samples_data in samples_data_arr
        ])
        self.dynamics_model.update_buffer(
            obs=obs,
            act=act,
            obs_next=obs_next,
            check_init=check_init,
        )

        # Reset variables for early stopping condition
        self.with_new_data = True
        self.remaining_model_idx = list(range(self.dynamics_model.num_models))
        self.valid_loss_rolling_average = None
        time_synch = time.time() - time_synch

        logger.logkv('Model-TimeSynch', time_synch)
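
The concatenation above assumes each samples_data dict carries aligned observations / actions / next_observations arrays with a shared leading (timestep) dimension. A minimal sketch with fabricated shapes:

    import numpy as np

    # Hypothetical per-iteration sample batches, each with aligned arrays.
    samples_data_arr = [
        {"observations": np.zeros((5, 3)), "actions": np.zeros((5, 2)), "next_observations": np.zeros((5, 3))},
        {"observations": np.ones((4, 3)),  "actions": np.ones((4, 2)),  "next_observations": np.ones((4, 3))},
    ]
    obs = np.concatenate([d["observations"] for d in samples_data_arr])
    act = np.concatenate([d["actions"] for d in samples_data_arr])
    obs_next = np.concatenate([d["next_observations"] for d in samples_data_arr])
    print(obs.shape, act.shape, obs_next.shape)   # (9, 3) (9, 2) (9, 3)
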
Example #6
    def step(self, random=False):
        time_step = time.time()
        '''------------- Obtaining samples from the environment -----------'''

        if self.verbose:
            logger.log("Data is obtaining samples...")
        env_paths = self.env_sampler.obtain_samples(
            log=True,
            random=random,
            log_prefix='Data-EnvSampler-',
        )
        '''-------------- Processing environment samples -------------------'''

        if self.verbose:
            logger.log("Data is processing environment samples...")
        samples_data = self.dynamics_sample_processor.process_samples(
            env_paths,
            log=True,
            log_prefix='Data-EnvTrajs-',
        )

        self.samples_data_arr.append(samples_data)
        time_step = time.time() - time_step

        time_sleep = max(self.simulation_sleep - time_step, 0)
        time.sleep(time_sleep)

        logger.logkv('Data-TimeStep', time_step)
        logger.logkv('Data-TimeSleep', time_sleep)
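
The sleep at the end throttles the data worker: each step is padded so it takes at least simulation_sleep seconds of wall-clock time. A stand-alone sketch of that pacing logic (the 0.05 s budget and the sleep standing in for real work are made up):

    import time

    simulation_sleep = 0.05           # assumed target duration for one data-collection step
    time_step = time.time()
    time.sleep(0.02)                  # placeholder for sampling + processing work
    time_step = time.time() - time_step

    # Pad the step so each iteration takes at least `simulation_sleep` seconds.
    time_sleep = max(simulation_sleep - time_step, 0)
    time.sleep(time_sleep)
    print('Data-TimeStep', round(time_step, 3), 'Data-TimeSleep', round(time_sleep, 3))
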
Example #7
    def push(self):
        time_push = time.time()
        self.queue_next.put(pickle.dumps(self.samples_data_arr))
        self.samples_data_arr = []
        time_push = time.time() - time_push

        logger.logkv('Data-TimePush', time_push)
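
push() serializes the accumulated batches with pickle, hands the bytes to the next worker's queue, and clears the local buffer. A single-process sketch of the same hand-off, with queue.Queue standing in for the inter-process queue:

    import pickle
    import queue

    queue_next = queue.Queue()                         # stand-in for the inter-process queue
    samples_data_arr = [{"observations": [1, 2, 3]}]   # hypothetical accumulated batches

    # Serialize, hand off, and reset the local buffer.
    queue_next.put(pickle.dumps(samples_data_arr))
    samples_data_arr = []

    received = pickle.loads(queue_next.get())          # what the consumer side would see
    print(received)
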
Example #8
    def optimize_policy(self, all_samples_data, log=True):
        """
        Performs MAML outer step

        Args:
            all_samples_data (list) : list of lists of lists of samples (each is a dict) split by gradient update and
             meta task
            log (bool) : whether to log statistics

        Returns:
            None
        """
        meta_op_input_dict = self._extract_input_dict_meta_op(
            all_samples_data, self._optimization_keys)
        logger.log("Computing KL before")
        mean_kl_before = self.optimizer.constraint_val(meta_op_input_dict)

        logger.log("Computing loss before")
        loss_before = self.optimizer.loss(meta_op_input_dict)
        logger.log("Optimizing")
        self.optimizer.optimize(meta_op_input_dict)
        logger.log("Computing loss after")
        loss_after = self.optimizer.loss(meta_op_input_dict)

        logger.log("Computing KL after")
        mean_kl = self.optimizer.constraint_val(meta_op_input_dict)
        if log:
            logger.logkv('MeanKLBefore', mean_kl_before)
            logger.logkv('MeanKL', mean_kl)

            logger.logkv('LossBefore', loss_before)
            logger.logkv('LossAfter', loss_after)
            logger.logkv('dLoss', loss_before - loss_after)
Example #9
    def _synch(self, policy_state_pickle):
        time_synch = time.time()
        policy_state = pickle.loads(policy_state_pickle)
        assert isinstance(policy_state, dict)
        self.env_sampler.policy.set_shared_params(policy_state)
        time_synch = time.time() - time_synch

        logger.logkv('Data-TimeSynch', time_synch)
Example #10
    def obtain_samples(self, log=False, log_prefix='', buffer=None):
        """
        Collect batch_size trajectories from each task

        Args:
            log (boolean): whether to log sampling times
            log_prefix (str) : prefix for logger
            buffer : unused in this sampler

        Returns:
            (list) : a list of `num_rollouts` path dicts (observations, actions, rewards, dones, env_infos, agent_infos)
        """

        # initial setup / preparation
        policy = self.policy
        policy.reset(dones=[True] * self.num_rollouts)

        # initial reset of meta_envs
        init_obses = np.array(
            [self.env.reset() for _ in range(self.num_rollouts)])

        sess = tf.get_default_session()
        observations, actions, means, log_stds, rewards = sess.run(
            [
                self._observations_var, self._actions_var, self._means_var,
                self._log_stds_var, self._rewards_var
            ],
            feed_dict={self._initial_obs_ph: init_obses})

        means = np.array(means).transpose((1, 0, 2))
        log_stds = np.array(log_stds).transpose((1, 0, 2))
        if log_stds.shape[0] == 1:
            log_stds = np.repeat(log_stds, self.num_rollouts, axis=0)
        agent_infos = [
            dict(mean=mean, log_std=log_std)
            for mean, log_std in zip(means, log_stds)
        ]
        observations = np.array(observations).transpose((1, 0, 2))
        actions = np.array(actions).transpose((1, 0, 2))
        rewards = np.array(rewards).T
        dones = [[False for _ in range(self.max_path_length)]
                 for _ in range(self.num_rollouts)]
        env_infos = [dict() for _ in range(self.num_rollouts)]
        paths = [
            dict(observations=obs,
                 actions=act,
                 rewards=rew,
                 dones=done,
                 env_infos=env_info,
                 agent_infos=agent_info)
            for obs, act, rew, done, env_info, agent_info in zip(
                observations, actions, rewards, dones, env_infos, agent_infos)
        ]
        self.total_timesteps_sampled += self.total_samples
        logger.logkv('ModelSampler-n_timesteps', self.total_timesteps_sampled)

        return paths
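
The sampler above evaluates the whole batch of rollouts in one session call and then reshapes the time-major outputs into one path dict per rollout. A self-contained sketch of just that reshaping step, with random arrays in place of the TensorFlow outputs:

    import numpy as np

    num_rollouts, max_path_length, obs_dim, act_dim = 2, 4, 3, 2

    # Hypothetical rollout tensors in time-major layout: (T, num_rollouts, dim).
    observations = np.random.randn(max_path_length, num_rollouts, obs_dim)
    actions = np.random.randn(max_path_length, num_rollouts, act_dim)
    rewards = np.random.randn(max_path_length, num_rollouts)

    # Transpose to rollout-major layout so each path owns its own (T, dim) slice.
    observations = observations.transpose((1, 0, 2))
    actions = actions.transpose((1, 0, 2))
    rewards = rewards.T

    paths = [
        dict(observations=obs, actions=act, rewards=rew)
        for obs, act, rew in zip(observations, actions, rewards)
    ]
    print(len(paths), paths[0]["observations"].shape)   # 2 (4, 3)
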
Example #11
    def log_diagnostics(self, paths, prefix=''):
        reach_rew = [path["env_infos"]['reachRew'] for path in paths]
        pick_rew = [path["env_infos"]['pickRew'][-1] for path in paths]
        place_rew = [path["env_infos"]['placeRew'] for path in paths]
        reach_dist = [path["env_infos"]['reachDist'] for path in paths]
        placing_dist = [path["env_infos"]['placingDist'] for path in paths]

        logger.logkv(prefix + 'AverageReachReward', np.mean(reach_rew))
        logger.logkv(prefix + 'AveragePickReward', np.mean(pick_rew))
        logger.logkv(prefix + 'AveragePlaceReward', np.mean(place_rew))
        logger.logkv(prefix + 'AverageReachDistance', np.mean(reach_dist))
        logger.logkv(prefix + 'AveragePlaceDistance', np.mean(placing_dist))
Example #12
    def log_diagnostics(self, paths, prefix=''):
        progs = [
            np.mean(path["env_infos"]["reward_forward"]) for path in paths
        ]
        ctrl_cost = [
            -np.mean(path["env_infos"]["reward_ctrl"]) for path in paths
        ]

        logger.logkv(prefix + 'AverageForwardReturn', np.mean(progs))
        logger.logkv(prefix + 'MaxForwardReturn', np.max(progs))
        logger.logkv(prefix + 'MinForwardReturn', np.min(progs))
        logger.logkv(prefix + 'StdForwardReturn', np.std(progs))
        logger.logkv(prefix + 'AverageCtrlCost', np.mean(ctrl_cost))
Example #13
 def optimize_policy(self, buffer, timestep, grad_steps, log=True):
     sess = tf.get_default_session()
     for i in range(grad_steps):
         feed_dict = create_feed_dict(placeholder_dict=self.op_phs_dict,
                                      value_dict=buffer.random_batch(
                                          self.sampler_batch_size))
         sess.run(self.training_ops, feed_dict)
         if log:
             diagnostics = sess.run({**self.diagnostics_ops}, feed_dict)
             for k, v in diagnostics.items():
                 logger.logkv(k, v)
         if timestep % self.target_update_interval == 0:
             self._update_target()
Example #14
    def _synch(self, dynamics_model_state_pickle):
        time_synch = time.time()
        if self.verbose:
            logger.log('Policy is synchronizing...')
        dynamics_model_state = pickle.loads(dynamics_model_state_pickle)
        assert isinstance(dynamics_model_state, dict)
        self.model_sampler.dynamics_model.set_shared_params(
            dynamics_model_state)
        if hasattr(self.model_sampler, 'vec_env'):
            self.model_sampler.vec_env.dynamics_model.set_shared_params(
                dynamics_model_state)
        time_synch = time.time() - time_synch

        logger.logkv('Policy-TimeSynch', time_synch)
Example #15
    def optimize_policy(self,
                        samples_data,
                        log=True,
                        prefix='',
                        verbose=False):
        """
        Performs MAML outer step

        Args:
            samples_data (list) : list of lists of lists of samples (each is a dict) split by gradient update and
             meta task
            log (bool) : whether to log statistics

        Returns:
            None
        """
        input_dict = self._extract_input_dict(samples_data,
                                              self._optimization_keys,
                                              prefix='train')
        entropy_loss, reward_loss = self.optimizer.compute_loss_variations(
            input_dict, self.entropy_loss, self.reward_loss, self.log_values)

        if verbose: logger.log("Optimizing")

        # Update model
        loss_before = self.optimizer.optimize(input_val_dict=input_dict)

        if verbose: logger.log("Computing statistics")
        loss_after = self.optimizer.loss(input_val_dict=input_dict)

        if log:
            logger.logkv(prefix + 'Loss/LossBefore', loss_before)
            logger.logkv(prefix + 'Loss/LossAfter', loss_after)
            logger.logkv(prefix + 'Loss/PartialLossEntropy', entropy_loss)
            logger.logkv(prefix + 'Loss/PartialLossReward', reward_loss)
Example #16
    def push(self):
        time_push = time.time()
        state_pickle = pickle.dumps(
            self.dynamics_model.get_shared_param_values())
        assert state_pickle is not None
        while self.queue_next.qsize() > 5:
            try:
                logger.log('Model is off loading data from queue_next...')
                _ = self.queue_next.get_nowait()
            except Empty:
                break
        self.queue_next.put(state_pickle)
        time_push = time.time() - time_push

        logger.logkv('Model-TimePush', time_push)
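
Before pushing fresh model parameters, the worker drains stale entries so the consumer never works through a long backlog. A single-process sketch of that bounded-queue pattern (queue.Queue stands in for the inter-process queue; the threshold of 5 mirrors the snippet above):

    import queue

    queue_next = queue.Queue()
    for i in range(8):                      # pretend the consumer has fallen behind
        queue_next.put(i)

    # Drop stale entries so only reasonably fresh state is ever consumed.
    while queue_next.qsize() > 5:
        try:
            _ = queue_next.get_nowait()
        except queue.Empty:
            break
    queue_next.put('fresh-state')
    print(queue_next.qsize())               # at most 6 entries remain
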
Example #17
    def optimize_policy(self, all_samples_data, log=True):
        """
        Performs MAML outer step

        Args:
            all_samples_data (list) : list of lists of lists of samples (each is a dict) split by gradient update and
             meta task
            log (bool) : whether to log statistics

        Returns:
            None
        """
        meta_op_input_dict = self._extract_input_dict_meta_op(
            all_samples_data, self._optimization_keys)

        # add kl_coeffs / clip_eps to meta_op_input_dict
        meta_op_input_dict['inner_kl_coeff'] = self.inner_kl_coeff
        if self.clip_outer:
            meta_op_input_dict['clip_eps'] = self.clip_eps
        else:
            meta_op_input_dict['outer_kl_coeff'] = self.outer_kl_coeff

        if log: logger.log("Optimizing")
        loss_before = self.optimizer.optimize(
            input_val_dict=meta_op_input_dict)

        if log: logger.log("Computing statistics")
        loss_after, inner_kls, outer_kl = self.optimizer.compute_stats(
            input_val_dict=meta_op_input_dict)

        if self.adaptive_inner_kl_penalty:
            if log: logger.log("Updating inner KL loss coefficients")
            self.inner_kl_coeff = self.adapt_kl_coeff(self.inner_kl_coeff,
                                                      inner_kls,
                                                      self.target_inner_step)

        if self.adaptive_outer_kl_penalty:
            if log: logger.log("Updating outer KL loss coefficients")
            self.outer_kl_coeff = self.adapt_kl_coeff(self.outer_kl_coeff,
                                                      outer_kl,
                                                      self.target_outer_step)

        if log:
            logger.logkv('LossBefore', loss_before)
            logger.logkv('LossAfter', loss_after)
            logger.logkv('KLInner', np.mean(inner_kls))
            logger.logkv('KLCoeffInner', np.mean(self.inner_kl_coeff))
            if not self.clip_outer: logger.logkv('KLOuter', outer_kl)
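
adapt_kl_coeff is not shown in these snippets. The following is only an assumption about what such a helper might look like: a common PPO-style rule that scales each coefficient up when the measured KL overshoots its target and down when it undershoots.

    import numpy as np

    def adapt_kl_coeff(kl_coeff, kl_values, kl_target, scale=2.0):
        """Assumed rule: grow the penalty when KL > 1.5 * target, shrink it when KL < target / 1.5."""
        kl_coeff = np.atleast_1d(np.array(kl_coeff, dtype=float))
        kl_values = np.atleast_1d(np.array(kl_values, dtype=float))
        for i, kl in enumerate(kl_values):
            if kl > 1.5 * kl_target:
                kl_coeff[i] *= scale
            elif kl < kl_target / 1.5:
                kl_coeff[i] /= scale
        return kl_coeff

    print(adapt_kl_coeff([1.0, 1.0], [0.05, 0.001], kl_target=0.01))   # -> [2.  0.5]
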
Example #18
    def push(self):
        time_push = time.time()
        policy_state_pickle = pickle.dumps(
            self.policy.get_shared_param_values())
        assert policy_state_pickle is not None
        while self.queue_next.qsize() > 5:
            try:
                logger.log('Policy is off loading data from queue_next...')
                _ = self.queue_next.get_nowait()
            except Empty:
                # very rare chance to reach here
                break
        self.queue_next.put(policy_state_pickle)
        time_push = time.time() - time_push

        logger.logkv('Policy-TimePush', time_push)
Example #19
    def step(self):
        time_step = time.time()
        ''' --------------- MAML steps --------------- '''

        self.policy.switch_to_pre_update()  # Switch to pre-update policy
        all_samples_data = []

        for step in range(self.num_inner_grad_steps + 1):
            if self.verbose:
                logger.log("Policy Adaptation-Step %d **" % step)
            """ -------------------- Sampling --------------------------"""

            #time_sampling = time.time()
            paths = self.model_sampler.obtain_samples(log=True,
                                                      log_prefix='Policy-',
                                                      buffer=None)
            #time_sampling = time.time() - time_sampling
            """ ----------------- Processing Samples ---------------------"""

            #time_sample_proc = time.time()
            samples_data = self.model_sample_processor.process_samples(
                paths, log='all', log_prefix='Policy-')
            all_samples_data.append(samples_data)
            #time_sample_proc = time.time() - time_sample_proc

            self.log_diagnostics(sum(list(paths.values()), []),
                                 prefix='Policy-')
            """ ------------------- Inner Policy Update --------------------"""

            #time_algo_adapt = time.time()
            if step < self.num_inner_grad_steps:
                self.algo._adapt(samples_data)
            #time_algo_adapt = time.time() - time_algo_adapt
        """ ------------------ Outer Policy Update ---------------------"""

        if self.verbose:
            logger.log("Policy is optimizing...")
        # This needs to take all samples_data so that it can construct graph for meta-optimization.
        #time_algo_opt = time.time()
        self.algo.optimize_policy(all_samples_data, prefix='Policy-')
        #time_algo_opt = time.time() - time_algo_opt

        time_step = time.time() - time_step
        self.policy = self.model_sampler.policy

        logger.logkv('Policy-TimeStep', time_step)
Example #20
    def optimize_policy(self,
                        samples_data,
                        log=True,
                        prefix='',
                        verbose=False):
        """
        Performs MAML outer step

        Args:
            samples_data (list) : list of lists of lists of samples (each is a dict) split by gradient update and
             meta task
            log (bool) : whether to log statistics

        Returns:
            None
        """
        input_dict = self._extract_input_dict(samples_data,
                                              self._optimization_keys,
                                              prefix='train')

        if verbose:
            logger.log("Computing KL before")
        mean_kl_before = self.optimizer.constraint_val(
            input_val_dict=input_dict)

        if verbose:
            logger.log("Computing loss before")
        loss_before = self.optimizer.loss(input_val_dict=input_dict)
        if verbose:
            logger.log("Optimizing")
        self.optimizer.optimize(input_val_dict=input_dict)
        if verbose:
            logger.log("Computing loss after")
        loss_after = self.optimizer.loss(input_val_dict=input_dict)

        if verbose:
            logger.log("Computing KL after")
        mean_kl = self.optimizer.constraint_val(input_val_dict=input_dict)
        if log:
            logger.logkv(prefix + 'MeanKLBefore', mean_kl_before)
            logger.logkv(prefix + 'MeanKL', mean_kl)

            logger.logkv(prefix + 'LossBefore', loss_before)
            logger.logkv(prefix + 'LossAfter', loss_after)
            logger.logkv(prefix + 'dLoss', loss_before - loss_after)
Example #21
    def step(self, random=False):
        time_step = time.time()
        '''------------- Obtaining samples from the environment -----------'''

        if self.verbose:
            logger.log("Data is obtaining samples...")
        env_paths = self.env_sampler.obtain_samples(
            log=True,
            random=random,
            log_prefix='Data-EnvSampler-',
        )
        '''-------------- Processing environment samples -------------------'''

        if self.verbose:
            logger.log("Data is processing samples...")
        if type(env_paths) is dict or type(env_paths) is OrderedDict:
            env_paths = list(env_paths.values())
            idxs = np.random.choice(range(len(env_paths)),
                                    size=self.num_rollouts_per_iter,
                                    replace=False)
            env_paths = sum([env_paths[idx] for idx in idxs], [])

        elif type(env_paths) is list:
            idxs = np.random.choice(range(len(env_paths)),
                                    size=self.num_rollouts_per_iter,
                                    replace=False)
            env_paths = [env_paths[idx] for idx in idxs]

        else:
            raise TypeError
        samples_data = self.dynamics_sample_processor.process_samples(
            env_paths,
            log=True,
            log_prefix='Data-EnvTrajs-',
        )

        self.samples_data_arr.append(samples_data)
        time_step = time.time() - time_step

        time_sleep = max(self.simulation_sleep - time_step, 0)
        time.sleep(time_sleep)

        logger.logkv('Data-TimeStep', time_step)
        logger.logkv('Data-TimeSleep', time_sleep)
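
Both branches above subsample rollouts without replacement via np.random.choice; the dict branch additionally flattens the per-task lists with sum(list, []). A tiny runnable illustration of both pieces (the path names and task keys are made up):

    import numpy as np

    env_paths = [f'path_{i}' for i in range(10)]      # hypothetical list of rollouts
    num_rollouts_per_iter = 3

    idxs = np.random.choice(range(len(env_paths)), size=num_rollouts_per_iter, replace=False)
    subsampled = [env_paths[idx] for idx in idxs]
    print(subsampled)

    # For the dict case, sum(list_of_lists, []) flattens per-task lists into one list of paths.
    per_task = {'task_a': ['p0', 'p1'], 'task_b': ['p2']}
    flattened = sum(list(per_task.values()), [])
    print(flattened)                                   # ['p0', 'p1', 'p2']
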
Example #22
    def train(self):
        """
        Trains policy on env using algo
        """
        worker_data_queue, worker_model_queue, worker_policy_queue = self.queues
        worker_data_remote, worker_model_remote, worker_policy_remote = self.remotes

        for p in self.ps:
            p.start()
        ''' --------------- worker warm-up --------------- '''

        logger.log('Prepare start...')

        worker_data_remote.send('prepare start')
        worker_data_queue.put(self.initial_random_samples)
        assert worker_data_remote.recv() == 'loop ready'

        worker_model_remote.send('prepare start')
        assert worker_model_remote.recv() == 'loop ready'

        worker_policy_remote.send('prepare start')
        assert worker_policy_remote.recv() == 'loop ready'

        time_total = time.time()
        ''' --------------- worker looping --------------- '''

        logger.log('Start looping...')
        for remote in self.remotes:
            remote.send('start loop')
        ''' --------------- collect info --------------- '''

        for remote in self.remotes:
            assert remote.recv() == 'loop done'
        logger.log('\n------------all workers exit loops -------------')
        for remote in self.remotes:
            assert remote.recv() == 'worker closed'

        for p in self.ps:
            p.terminate()

        logger.logkv('Trainer-TimeTotal', time.time() - time_total)
        logger.dumpkvs()
        logger.log("*****Training finished")
Example #23
    def process_queue(self):
        do_push = 0
        samples_data_arr = []
        while True:
            try:
                if not self.remaining_model_idx:
                    logger.log(
                        'Model at iteration {} is block waiting for data'.
                        format(self.itr_counter))
                    # FIXME: check stop_cond
                    time_wait = time.time()
                    samples_data_arr_pickle = self.queue.get()
                    time_wait = time.time() - time_wait
                    logger.logkv('Model-TimeBlockWait', time_wait)
                    self.remaining_model_idx = list(
                        range(self.dynamics_model.num_models))
                else:
                    if self.verbose:
                        logger.log('Model try get_nowait.........')
                    samples_data_arr_pickle = self.queue.get_nowait()
                if samples_data_arr_pickle == 'push':
                    # Only push once before executing another step
                    if do_push == 0:
                        do_push = 1
                        self.push()
                else:
                    samples_data_arr.extend(
                        pickle.loads(samples_data_arr_pickle))
            except Empty:
                break

        do_synch = len(samples_data_arr)
        if do_synch:
            self._synch(samples_data_arr)

        do_step = 1

        if self.verbose:
            logger.log(
                'Model finishes processing queue with {}, {}, {}......'.format(
                    do_push, do_synch, do_step))

        return do_push, do_synch, do_step
Example #24
    def step(self, obs=None, act=None, obs_next=None):
        time_model_fit = time.time()
        """ --------------- fit dynamics model --------------- """

        if self.verbose:
            logger.log(
                'Model at iteration {} is training for one epoch...'.format(
                    self.itr_counter))
        self.remaining_model_idx, self.valid_loss_rolling_average = self.dynamics_model.fit_one_epoch(
            remaining_model_idx=self.remaining_model_idx,
            valid_loss_rolling_average_prev=self.valid_loss_rolling_average,
            with_new_data=self.with_new_data,
            verbose=self.verbose,
            log_tabular=True,
            prefix='Model-',
        )
        self.with_new_data = False
        time_model_fit = time.time() - time_model_fit

        logger.logkv('Model-TimeStep', time_model_fit)
Example #25
    def optimize_supervised(self,
                            samples_data,
                            log=True,
                            prefix='',
                            verbose=False):
        input_dict = self._extract_input_dict(samples_data,
                                              self._optimization_keys,
                                              prefix='train')
        self.optimizer_s.compute_loss_variations(input_dict, None, None,
                                                 self.log_values_sup)

        if verbose: logger.log("Optimizing Supervised Model")
        loss_before = self.optimizer_s.optimize(input_val_dict=input_dict)

        if verbose: logger.log("Computing statistics")
        loss_after = self.optimizer_s.loss(input_val_dict=input_dict)

        if log:
            logger.logkv(prefix + 'SupervisedLossBefore', loss_before)
            logger.logkv(prefix + 'SupervisedLossAfter', loss_after)
Example #26
    def log_diagnostics(self, paths, prefix=''):
        fwrd_vel = [path["env_infos"]['reward_run'] for path in paths]
        final_fwrd_vel = [path["env_infos"]['reward_run'][-1] for path in paths]
        ctrl_cost = [-path["env_infos"]['reward_ctrl'] for path in paths]

        logger.logkv(prefix + 'AvgForwardVel', np.mean(fwrd_vel))
        logger.logkv(prefix + 'AvgFinalForwardVel', np.mean(final_fwrd_vel))
        logger.logkv(prefix + 'AvgCtrlCost', np.mean(ctrl_cost))
Example #27
    def log_diagnostics(paths, prefix=''):
        forward_vel = [
            np.mean(path['env_infos']['forward_vel']) for path in paths
        ]
        ctrl_cost = [
            np.mean(path['env_infos']['control_cost']) for path in paths
        ]
        # stability_cost = [np.mean(path['env_infos']['stability_cost']) for path in paths]
        path_length = [path["observations"].shape[0] for path in paths]

        logger.logkv(prefix + 'AvgForwardVel', np.mean(forward_vel))
        logger.logkv(prefix + 'StdForwardVel', np.std(forward_vel))
        logger.logkv(prefix + 'AvgCtrlCost', np.mean(ctrl_cost))
        # logger.logkv(prefix + 'AvgStabilityCost', np.mean(stability_cost))
        logger.logkv(prefix + 'AvgPathLength', np.mean(path_length))
Example #28
    def log_diagnostics(self, paths, prefix=''):
        reach_dist = [path["env_infos"]['reachDist'] for path in paths]
        placing_dist = [path["env_infos"]['placeDist'] for path in paths]
        cos_dist = [path["env_infos"]['cosDist'] for path in paths]

        logger.logkv(prefix + 'AverageReachDistance', np.mean(reach_dist))
        logger.logkv(prefix + 'AveragePlaceDistance', np.mean(placing_dist))
        logger.logkv(prefix + 'AverageCosDistance', np.mean(cos_dist))
Example #29
    def step(self, random_sinusoid=(False, False)):
        time_step = time.time()

        if self.itr_counter == 1 and self.env_sampler.policy.dynamics_model.normalization is None:
            if self.verbose:
                logger.log('Data starts first step...')
            self.env_sampler.policy.dynamics_model = pickle.loads(
                self.queue.get())
            if self.verbose:
                logger.log('Data first step done...')
        '''------------- Obtaining samples from the environment -----------'''

        if self.verbose:
            logger.log("Data is obtaining samples...")
        env_paths = self.env_sampler.obtain_samples(
            log=True,
            random=random_sinusoid[0],
            sinusoid=random_sinusoid[1],
            log_prefix='Data-EnvSampler-',
        )
        '''-------------- Processing environment samples -------------------'''

        if self.verbose:
            logger.log("Data is processing samples...")
        samples_data = self.dynamics_sample_processor.process_samples(
            env_paths,
            log=True,
            log_prefix='Data-EnvTrajs-',
        )

        self.samples_data_arr.append(samples_data)
        time_step = time.time() - time_step

        time_sleep = max(self.simulation_sleep - time_step, 0)
        time.sleep(time_sleep)

        logger.logkv('Data-TimeStep', time_step)
        logger.logkv('Data-TimeSleep', time_sleep)
Example #30
 def run_supervised(self, policy, teacher_dict, tag):
     paths = self.sampler.obtain_samples(log=False, advance_curriculum=False, policy=policy,
                                         teacher_dict=teacher_dict, max_action=False)  # TODO: consider adding a flag for max_action
     samples_data = self.sample_processor.process_samples(paths, log='all', log_prefix=tag, log_teacher=self.train_with_teacher)
     advance_curriculum, avg_success, avg_accuracy = self.check_advance_curriculum_rollout(samples_data)
     logger.logkv(f"{tag}Advance", int(advance_curriculum))
     logger.logkv(f"{tag}AvgSuccess", avg_success)
     logger.logkv(f"{tag}AvgAccuracy", avg_accuracy)
     return advance_curriculum