Example #1
 def _sanity_checks(self):
     """Run sanity checks on the configuration parameters."""
     assert self.steps_per_epoch % mpi_tools.num_procs() == 0, \
         f'steps_per_epoch={self.steps_per_epoch} must be divisible by ' \
         f'the number of processes ({mpi_tools.num_procs()}).'
     assert self.max_ep_len <= self.local_steps_per_epoch, \
         f'Reduce the number of cores ({mpi_tools.num_procs()}) or increase ' \
         f'the batch size ({self.steps_per_epoch}).'
     assert self.train_pi_iterations > 0
     assert self.train_v_iterations > 0
     assert isinstance(self.env, gym.Env), 'Env is not the expected type.'
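
The interplay of these checks becomes clearer with concrete numbers: the local batch size is the global steps_per_epoch divided by the number of processes (see Example #7), and it must still fit at least one full episode. A standalone sketch with hypothetical values:

    # Hypothetical values; mirrors the divisibility and episode-length checks above.
    steps_per_epoch = 32 * 1000   # global batch size
    max_ep_len = 1000

    for num_procs in (8, 64):
        assert steps_per_epoch % num_procs == 0
        local_steps_per_epoch = steps_per_epoch // num_procs
        ok = max_ep_len <= local_steps_per_epoch
        print(f'{num_procs:2d} cores -> {local_steps_per_epoch} local steps: '
              f'{"ok" if ok else "reduce cores or increase batch size"}')
    #  8 cores -> 4000 local steps: ok
    # 64 cores -> 500 local steps: reduce cores or increase batch size
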
Example #2
 def check_distributed_parameters(self) -> None:
     """Check if parameters are synchronized across all processes."""
     if mpi_tools.num_procs() > 1:
         self.logger.log(
            'Checking if distributed parameters are synchronized...')
         modules = {'Policy': self.ac.pi.net, 'Value': self.ac.v.net}
         for key, module in modules.items():
             flat_params = U.get_flat_params_from(module).numpy()
             global_min = mpi_tools.mpi_min(np.sum(flat_params))
             global_max = mpi_tools.mpi_max(np.sum(flat_params))
             assert np.allclose(global_min,
                                global_max), f'{key} not synced.'
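
The check reduces each module to a single scalar (the sum of its flattened parameters) and verifies that the minimum and maximum of that scalar agree across all ranks. A single-process sketch of the same idea in plain PyTorch, with torch.nn.utils.parameters_to_vector standing in for the project's U.get_flat_params_from helper:

    import torch
    import torch.nn as nn

    def param_checksum(module: nn.Module) -> float:
        """Sum of all flattened parameters: a cheap fingerprint of the weights."""
        return torch.nn.utils.parameters_to_vector(module.parameters()).sum().item()

    net_a = nn.Linear(4, 2)
    net_b = nn.Linear(4, 2)
    net_b.load_state_dict(net_a.state_dict())  # simulate two processes that are in sync

    # With MPI, mpi_min/mpi_max would reduce this scalar over all ranks;
    # here the two local copies are compared directly.
    assert abs(param_checksum(net_a) - param_checksum(net_b)) < 1e-8, 'not synced'
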
Example #3
    def _init_mpi(self) -> None:
        """Initialize MPI specifics (sync parameters across processes)."""
        if mpi_tools.num_procs() > 1:
            # Avoid slowdowns from PyTorch + MPI combo.
            mpi_tools.setup_torch_for_mpi()
            dt = time.time()
            self.logger.log('INFO: Sync actor critic parameters')
            # Sync params across cores: only needed once, since gradients are averaged afterwards.
            mpi_tools.sync_params(self.ac)
            self.logger.log(f'Done! (took {time.time()-dt:0.3f} sec.)')
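
sync_params broadcasts the root process's actor-critic weights so that every rank starts from identical parameters; because gradients are averaged afterwards, the copies then stay identical. A single-process sketch of the effect in plain PyTorch (the real implementation would broadcast over MPI):

    import torch
    import torch.nn as nn

    root_net = nn.Linear(8, 3)    # stands in for the parameters held by rank 0
    worker_net = nn.Linear(8, 3)  # a differently initialized copy on another rank

    # "Broadcast": overwrite the worker's parameters in-place with the root's values.
    with torch.no_grad():
        for p_root, p_worker in zip(root_net.parameters(), worker_net.parameters()):
            p_worker.copy_(p_root)

    vec = torch.nn.utils.parameters_to_vector
    assert torch.allclose(vec(root_net.parameters()), vec(worker_net.parameters()))
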
Example #4
    def update(self, x) -> None:
        """ Update internals incrementally.
            Note: works for both vector and matrix inputs.

            MPI implementation according to Chan et al.[10]; see:
            https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
        """
        x = self._convert_to_torch(x)

        # ==== Input checks
        msg = f'Expected dim in [1, 2], but got dim={len(x.shape)}.'
        assert len(x.shape) == 2 or len(x.shape) == 1, msg
        if self.shape[0] > 1:  # expect matrix inputs
            msg = f'Expected obs_dim={self.shape[0]} but got: {x.shape[1]}'
            assert len(x.shape) == 2 and x.shape[1] == self.shape[0], msg
        if self.shape[0] == 1:
            assert len(x.shape) == 1, f'Expected dim=1 but got: {x.shape}'
            # reshape is necessary since mean operator reduces vector dim by one
            x = x.view((-1, 1))

        n_B = x.shape[0] * mpi_tools.num_procs()  # global batch size across all processes
        n_A = self.count.clone()
        n_AB = self.count + n_B
        batch_mean = torch.mean(x, dim=0)

        # 1) Calculate the batch mean and average it across processes
        mpi_tools.mpi_avg_torch_tensor(batch_mean)
        delta = batch_mean - self.mean
        mean_new = self.mean + delta * n_B / n_AB

        # 2) Determine variance and sync across processes
        diff = x - mean_new
        batch_var = torch.mean(diff**2, dim=0)
        mpi_tools.mpi_avg_torch_tensor(batch_var)

        # Update running terms
        M2_A = n_A * self.var
        M2_B = n_B * batch_var
        ratio = n_A * n_B / n_AB
        M2_AB = M2_A + M2_B + delta**2 * ratio

        # 3) Update parameters - access internal values with data attribute
        self.mean.data = mean_new
        self.count.data = n_AB
        new_var = M2_AB / n_AB
        self.std.data = torch.sqrt(new_var)
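
For reference, the pairwise formulas from the linked article combine running statistics (n_A, mean_A, M2_A) with batch statistics (n_B, mean_B, M2_B) via delta = mean_B - mean_A, mean_AB = mean_A + delta * n_B / n_AB and M2_AB = M2_A + M2_B + delta^2 * n_A * n_B / n_AB. A small NumPy check of these formulas against a direct computation on the concatenated data (no MPI involved):

    import numpy as np

    rng = np.random.default_rng(0)
    a = rng.normal(size=(500, 3))   # data seen so far (the "running" part)
    b = rng.normal(size=(200, 3))   # new batch

    n_a, n_b = len(a), len(b)
    n_ab = n_a + n_b
    mean_a, mean_b = a.mean(axis=0), b.mean(axis=0)
    m2_a = n_a * a.var(axis=0)      # sum of squared deviations from mean_a
    m2_b = n_b * b.var(axis=0)

    delta = mean_b - mean_a
    mean_ab = mean_a + delta * n_b / n_ab
    m2_ab = m2_a + m2_b + delta**2 * n_a * n_b / n_ab

    ab = np.concatenate([a, b])
    assert np.allclose(mean_ab, ab.mean(axis=0))
    assert np.allclose(m2_ab / n_ab, ab.var(axis=0))
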
Example #5
    def check_alg(alg_name, env_id, cores):
        """Run a one-epoch update with the given algorithm."""
        defaults = U.get_defaults_kwargs(alg=alg_name, env_id=env_id)
        defaults['epochs'] = 1
        defaults['num_mini_batches'] = 4
        defaults['steps_per_epoch'] = 1000 * mpi_tools.num_procs()
        defaults['verbose'] = False
        print(defaults['steps_per_epoch'])

        defaults['logger_kwargs'] = setup_logger_kwargs(exp_name='unittest',
                                                        seed=0,
                                                        base_dir='/var/tmp/',
                                                        datestamp=True,
                                                        level=0,
                                                        use_tensor_board=False,
                                                        verbose=False)
        alg = U.get_alg_class(alg_name, env_id, **defaults)
        # sanity check of argument passing
        assert alg.alg == alg_name, f'Expected {alg_name} but got {alg.alg}'
        # return learn_fn(env_id, **defaults)
        ac, env = alg.learn()
        return ac, env
Example #6
    def eval(self, env, ac, num_evaluations):
        """Evaluate the actor-critic module for a given number of evaluations."""
        self.ac = ac
        self.ac.eval()  # disable exploration noise

        if isinstance(env, gym.Env):
            self.env = env
        elif isinstance(env, str):
            self.env = gym.make(env)
        else:
            raise TypeError('Env is not of type: str, gym.Env')

        size = mpi_tools.num_procs()
        num_local_evaluations = num_evaluations // size
        returns = np.zeros(num_local_evaluations, dtype=np.float32)
        costs = np.zeros(num_local_evaluations, dtype=np.float32)
        ep_lengths = np.zeros(num_local_evaluations, dtype=np.float32)

        for i in range(num_local_evaluations):
            returns[i], ep_lengths[i], costs[i] = self.eval_once()
        # Gather returns from all processes
        # Note: only root process owns valid data...
        returns = list(mpi_tools.gather_and_stack(returns))
        costs = list(mpi_tools.gather_and_stack(costs))

        # now write returns as column into output file...
        if mpi_tools.proc_id() == 0:
            self.write_to_file(self.ret_file, contents=returns)
            print('Saved to:', os.path.join(self.log_dir, self.ret_file_name))
            if self.log_costs:
                self.write_to_file(self.costs_file, contents=costs)
            print(f'Mean Ret: {np.mean(returns)} \t'
                  f'Mean EpLen: {np.mean(ep_lengths)} \t'
                  f'Mean Costs: {np.mean(costs)}')

        self.ac.train()  # back to train mode
        return np.array(returns), np.array(ep_lengths), np.array(costs)
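
The evaluation work is split evenly: each of the size processes runs num_evaluations // size episodes, and gather_and_stack concatenates the per-process arrays on the root. A small MPI-free sketch of that bookkeeping; note that the integer division drops any remainder, so the total number of episodes is size * (num_evaluations // size):

    import numpy as np

    num_evaluations, size = 50, 4
    num_local = num_evaluations // size       # 12 episodes per process
    print(size * num_local)                   # 48 episodes in total; 2 are dropped

    # Simulate the per-process result arrays and (assumption) the concatenation
    # that gather_and_stack performs on the root process.
    local_returns = [np.full(num_local, rank, dtype=np.float32) for rank in range(size)]
    gathered = np.concatenate(local_returns)
    assert gathered.shape == (size * num_local,)
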
Example #7
    def __init__(
        self,
        actor: str,
        ac_kwargs: dict,
        env_id: str,
        epochs: int,
        logger_kwargs: dict,
        adv_estimation_method: str = 'gae',
        alg: str = 'iwpg',
        check_freq: int = 25,
        entropy_coef: float = 0.01,
        gamma: float = 0.99,
        lam: float = 0.95,  # GAE scalar
        lam_c: float = 0.95,  # GAE scalar for cost estimation
        max_ep_len: int = 1000,
        max_grad_norm: float = 0.5,
        num_mini_batches: int = 16,  # used for value network training
        optimizer: str = 'Adam',  # policy optimizer
        pi_lr: float = 3e-4,
        steps_per_epoch: int = 32 * 1000,  # number of global steps per epoch
        target_kl: float = 0.01,
        train_pi_iterations: int = 80,
        train_v_iterations: int = 5,
        trust_region: str = 'plain',  # used for easy filtering in plot utils
        use_cost_value_function: bool = False,
        use_entropy: bool = False,
        use_exploration_noise_anneal: bool = False,
        use_kl_early_stopping: bool = False,
        use_linear_lr_decay: bool = True,
        use_max_grad_norm: bool = False,
        use_reward_scaling: bool = True,
        use_reward_penalty: bool = False,
        use_shared_weights: bool = False,
        use_standardized_advantages: bool = False,
        use_standardized_obs: bool = True,
        verbose: bool = True,
        vf_lr: float = 1e-3,
        weight_initialization: str = 'kaiming_uniform',
        save_freq: int = 10,
        seed: int = 0,
        video_freq: int = -1,  # set to positive integer for video recording
        **kwargs  # use to log parameters from child classes
    ):
        """

        Parameters
        ----------
        actor
        ac_kwargs
        env_id
        epochs
        logger_kwargs
        adv_estimation_method
        alg
        check_freq
        entropy_coef
        gamma
        lam
        lam_c
        max_ep_len
        max_grad_norm
        num_mini_batches
        optimizer
        pi_lr
        steps_per_epoch
        target_kl
        train_pi_iterations
        train_v_iterations
        trust_region
        use_cost_value_function
        use_entropy
        use_exploration_noise_anneal
        use_kl_early_stopping
        use_linear_lr_decay
        use_max_grad_norm
        use_reward_scaling
        use_reward_penalty
        use_shared_weights
        use_standardized_advantages
        use_standardized_obs
        verbose
        vf_lr
        weight_initialization
        save_freq
        seed
        video_freq
        kwargs
        """

        # Environment calls
        # TODO: call gym.make with **kwargs (to allow customization)
        self.env = env = gym.make(env_id) if isinstance(env_id, str) else env_id
        # Collect information from the environment if it has a time limit wrapper
        if hasattr(self.env, '_max_episode_steps'):
            max_ep_len = self.env._max_episode_steps

        self.adv_estimation_method = adv_estimation_method
        self.alg = alg
        self.check_freq = check_freq
        self.entropy_coef = entropy_coef if use_entropy else 0.0
        self.epoch = 0  # iterated in learn method
        self.epochs = epochs
        self.lam = lam
        self.local_steps_per_epoch = steps_per_epoch // mpi_tools.num_procs()
        self.logger_kwargs = logger_kwargs
        self.max_ep_len = max_ep_len
        self.max_grad_norm = max_grad_norm
        self.num_mini_batches = num_mini_batches
        self.pi_lr = pi_lr
        self.save_freq = save_freq
        self.seed = seed
        self.steps_per_epoch = steps_per_epoch
        self.target_kl = target_kl
        self.train_pi_iterations = train_pi_iterations
        self.train_v_iterations = train_v_iterations
        self.use_cost_value_function = use_cost_value_function
        self.use_exploration_noise_anneal = use_exploration_noise_anneal
        self.use_kl_early_stopping = use_kl_early_stopping
        self.use_linear_lr_decay = use_linear_lr_decay
        self.use_max_grad_norm = use_max_grad_norm
        self.use_reward_penalty = use_reward_penalty
        self.use_reward_scaling = use_reward_scaling
        self.use_standardized_obs = use_standardized_obs
        self.use_standardized_advantages = use_standardized_advantages
        self.video_freq = video_freq
        self.vf_lr = vf_lr

        # ==== Call assertions....
        self._sanity_checks()

        # === Set up logger and save configuration to disk
        # Get local parameters before creating the logger instance to avoid unnecessary prints
        self.params = locals()
        self.logger = self._init_logger()
        self.logger.save_config(self.params)

        # === Seeding
        seed += 10000 * mpi_tools.proc_id()
        torch.manual_seed(seed)
        np.random.seed(seed)
        self.env.seed(seed=seed)

        # === Setup actor-critic module
        self.ac = core.ActorCriticWithCosts(
            actor_type=actor,
            observation_space=env.observation_space,
            action_space=env.action_space,
            use_standardized_obs=use_standardized_obs,
            use_scaled_rewards=use_reward_scaling,
            use_shared_weights=use_shared_weights,
            weight_initialization=weight_initialization,
            ac_kwargs=ac_kwargs)

        # === set up MPI specifics
        self._init_mpi()

        # === Set up experience buffer
        self.buf = core.Buffer(
            actor_critic=self.ac,
            obs_dim=env.observation_space.shape,
            act_dim=env.action_space.shape,
            size=self.local_steps_per_epoch,
            gamma=gamma,
            lam=lam,
            adv_estimation_method=adv_estimation_method,
            use_scaled_rewards=use_reward_scaling,
            standardize_env_obs=use_standardized_obs,
            standardize_advantages=use_standardized_advantages,
            lam_c=lam_c,
            use_reward_penalty=use_reward_penalty,
        )

        # Set up optimizers for policy and value function
        self.pi_optimizer = core.get_optimizer(optimizer,
                                               module=self.ac.pi,
                                               lr=pi_lr)
        self.vf_optimizer = core.get_optimizer('Adam',
                                               module=self.ac.v,
                                               lr=vf_lr)
        if use_cost_value_function:
            self.cf_optimizer = core.get_optimizer('Adam',
                                                   module=self.ac.c,
                                                   lr=self.vf_lr)
        # Set up video recorder
        self.recorder = self._init_video_recorder()
        # setup scheduler for policy learning rate decay
        self.scheduler = self._init_learning_rate_scheduler()

        # Set up model saving
        self.logger.setup_torch_saver(self.ac)
        self.logger.torch_save()

        # setup statistics
        self.start_time = time.time()
        self.epoch_time = time.time()
        self.loss_pi_before = 0.0
        self.loss_v_before = 0.0
        self.loss_c_before = 0.0
        self.logger.log('Starting training.')
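
One constructor detail worth highlighting: each process offsets the base seed by 10000 * proc_id(), so parallel workers collect decorrelated trajectories while any given (seed, rank) pair stays reproducible. A standalone sketch of the scheme with simulated ranks (no MPI):

    import numpy as np
    import torch

    base_seed = 0
    for rank in range(3):                 # stand-in for mpi_tools.proc_id()
        seed = base_seed + 10000 * rank
        torch.manual_seed(seed)
        np.random.seed(seed)
        # Each rank draws a different, yet reproducible, random stream.
        print(rank, torch.rand(1).item(), np.random.rand())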