Code example #1
class BC:
    def __init__(self, expert_paths,
                 policy,
                 epochs=5,
                 batch_size=64,
                 lr=1e-3,
                 optimizer=None):

        self.policy = policy
        self.expert_paths = expert_paths
        self.epochs = epochs
        self.mb_size = batch_size
        self.logger = DataLog()

        # get transformations
        observations = np.concatenate([path["observations"] for path in expert_paths])
        actions = np.concatenate([path["actions"] for path in expert_paths])
        in_shift, in_scale = np.mean(observations, axis=0), np.std(observations, axis=0)
        out_shift, out_scale = np.mean(actions, axis=0), np.std(actions, axis=0)

        # set scalings in the target policy
        self.policy.model.set_transformations(in_shift, in_scale, out_shift, out_scale)
        self.policy.old_model.set_transformations(in_shift, in_scale, out_shift, out_scale)

        # construct optimizer
        self.optimizer = torch.optim.Adam(self.policy.trainable_params, lr=lr) if optimizer is None else optimizer

    def loss(self, obs, act):
        LL, mu, log_std = self.policy.new_dist_info(obs, act)
        # minimize negative log likelihood
        return -torch.mean(LL)

    def train(self):
        observations = np.concatenate([path["observations"] for path in self.expert_paths])
        actions = np.concatenate([path["actions"] for path in self.expert_paths])

        params_before_opt = self.policy.get_param_values()
        ts = timer.time()
        num_samples = observations.shape[0]
        for ep in tqdm(range(self.epochs)):
            self.logger.log_kv('epoch', ep)
            loss_val = self.loss(observations, actions).data.numpy().ravel()[0]
            self.logger.log_kv('loss', loss_val)
            self.logger.log_kv('time', (timer.time()-ts))
            for mb in range(int(num_samples / self.mb_size)):
                rand_idx = np.random.choice(num_samples, size=self.mb_size)
                obs = observations[rand_idx]
                act = actions[rand_idx]
                self.optimizer.zero_grad()
                loss = self.loss(obs, act)
                loss.backward()
                self.optimizer.step()
        params_after_opt = self.policy.get_param_values()
        self.policy.set_param_values(params_after_opt, set_new=True, set_old=True)
        self.logger.log_kv('epoch', self.epochs)
        loss_val = self.loss(observations, actions).data.numpy().ravel()[0]
        self.logger.log_kv('loss', loss_val)
        self.logger.log_kv('time', (timer.time()-ts))
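
The `loss` method above minimizes the negative log-likelihood of the expert actions under the policy's action distribution (via `new_dist_info`). As a minimal, self-contained sketch of that objective, the toy `GaussianPolicy` below is an illustration in plain PyTorch, not pybRL's policy class:

import torch
import torch.nn as nn

class GaussianPolicy(nn.Module):
    """Toy diagonal-Gaussian policy: MLP mean, state-independent log-std."""
    def __init__(self, obs_dim, act_dim):
        super().__init__()
        self.mu = nn.Sequential(nn.Linear(obs_dim, 64), nn.Tanh(), nn.Linear(64, act_dim))
        self.log_std = nn.Parameter(torch.zeros(act_dim))

    def log_prob(self, obs, act):
        dist = torch.distributions.Normal(self.mu(obs), self.log_std.exp())
        return dist.log_prob(act).sum(dim=-1)  # sum over action dimensions

def bc_loss(policy, obs, act):
    # negative mean log-likelihood of the expert actions, as in BC.loss above
    return -policy.log_prob(obs, act).mean()

# toy usage on random stand-in data
obs, act = torch.randn(256, 10), torch.randn(256, 4)
policy = GaussianPolicy(10, 4)
optimizer = torch.optim.Adam(policy.parameters(), lr=1e-3)
for _ in range(5):
    optimizer.zero_grad()
    loss = bc_loss(policy, obs, act)
    loss.backward()
    optimizer.step()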
Code example #2
def train(env, policy, normalizer, hp, parentPipes, args):
    logger = DataLog()
    total_steps = 0
    best_return = -99999999
    if not os.path.isdir(args.logdir):
        os.mkdir(args.logdir)
    previous_dir = os.getcwd()
    os.chdir(args.logdir)
    if not os.path.isdir('iterations'): os.mkdir('iterations')
    if not os.path.isdir('logs'): os.mkdir('logs')
    hp.to_text('hyperparameters')

    for step in range(hp.nb_steps):

        # Initializing the perturbations deltas and the positive/negative rewards
        deltas = policy.sample_deltas()
        positive_rewards = [0] * hp.nb_directions
        negative_rewards = [0] * hp.nb_directions
        if parentPipes:
            process_count = len(parentPipes)
            p = 0
            while p < hp.nb_directions:
                temp_p = p
                n_left = hp.nb_directions - p  # number of directions still to be evaluated
                for k in range(min([process_count, n_left])):
                    parentPipe = parentPipes[k]
                    parentPipe.send([
                        _EXPLORE,
                        [normalizer, policy, hp, "positive", deltas[temp_p]]
                    ])
                    temp_p = temp_p + 1
                temp_p = p
                for k in range(min([process_count, n_left])):
                    positive_rewards[temp_p], step_count = parentPipes[k].recv()
                    total_steps = total_steps + step_count
                    temp_p = temp_p + 1
                temp_p = p

                for k in range(min([process_count, n_left])):
                    parentPipe = parentPipes[k]
                    parentPipe.send([
                        _EXPLORE,
                        [normalizer, policy, hp, "negative", deltas[temp_p]]
                    ])
                    temp_p = temp_p + 1
                temp_p = p

                for k in range(min([process_count, n_left])):
                    negative_rewards[temp_p], step_count = parentPipes[k].recv()
                    total_steps = total_steps + step_count
                    temp_p = temp_p + 1
                p = p + process_count

                # print('mp step has worked, ', p)
                print('total steps till now: ', total_steps,
                      'Processes done: ', p)

        else:
            # Getting the positive rewards in the positive directions
            for k in range(hp.nb_directions):
                positive_rewards[k] = explore(env, policy, "positive",
                                              deltas[k], hp)

            # Getting the negative rewards in the negative/opposite directions
            for k in range(hp.nb_directions):
                negative_rewards[k] = explore(env, policy, "negative",
                                              deltas[k], hp)

        # Sorting the rollouts by the max(r_pos, r_neg) and selecting the best directions
        scores = {
            k: max(r_pos, r_neg)
            for k, (
                r_pos,
                r_neg) in enumerate(zip(positive_rewards, negative_rewards))
        }
        order = sorted(scores.keys(),
                       key=lambda x: -scores[x])[:int(hp.nb_best_directions)]
        rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k])
                    for k in order]

        # Gathering all the positive/negative rewards to compute the standard deviation of these rewards
        all_rewards = np.array([x[0] for x in rollouts] + [x[1] for x in rollouts])
        sigma_r = all_rewards.std()  # standard deviation of the rewards from the best directions only
        # Updating our policy
        policy.update(rollouts, sigma_r, args)

        # Printing the final reward of the policy after the update
        reward_evaluation = explore(env, policy, None, None, hp)
        logger.log_kv('steps', total_steps)
        logger.log_kv('return', reward_evaluation)
        if (reward_evaluation > best_return):
            best_policy = policy.theta
            best_return = reward_evaluation
            np.save("iterations/best_policy.npy", best_policy)
        print('Step:', step, 'Reward:', reward_evaluation)
        policy_path = "iterations/" + "policy_" + str(step)
        np.save(policy_path, policy.theta)
        logger.save_log('logs/')
        make_train_plots_ars(log=logger.log,
                             keys=['steps', 'return'],
                             save_loc='logs/')
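
`policy.update(rollouts, sigma_r, args)` is not shown in this snippet. Assuming the standard Augmented Random Search rule, the update is a reward-weighted sum of the best perturbation directions, scaled by a step size and the reward standard deviation; the NumPy sketch below illustrates that assumption with made-up shapes and a hypothetical `learning_rate`:

import numpy as np

def ars_update(theta, rollouts, sigma_r, learning_rate=0.02):
    """ARS-style update: theta += lr / (b * sigma_r) * sum_k (r_pos_k - r_neg_k) * delta_k,
    where rollouts is a list of (r_pos, r_neg, delta) for the b best directions."""
    step = np.zeros_like(theta)
    for r_pos, r_neg, delta in rollouts:
        step += (r_pos - r_neg) * delta
    return theta + learning_rate / (len(rollouts) * sigma_r) * step

# toy usage: 3 best directions for a 5-dimensional parameter vector
rng = np.random.default_rng(0)
theta = np.zeros(5)
rollouts = [(rng.normal(), rng.normal(), rng.normal(size=5)) for _ in range(3)]
sigma_r = np.array([r for ro in rollouts for r in ro[:2]]).std()
theta = ars_update(theta, rollouts, sigma_r)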
Code example #3
File: trpo.py  Project: yaoxinyuan/pybRL
class TRPO(NPG):
    def __init__(self, env, policy, baseline,
                 kl_dist=0.01,
                 FIM_invert_args={'iters': 10, 'damping': 1e-4},
                 hvp_sample_frac=1.0,
                 seed=None,
                 save_logs=False,
                 normalized_step_size=0.01):
        """
        All inputs are expected in pybRL's format unless specified
        :param normalized_step_size: Normalized step size (under the KL metric). Twice the desired KL distance
        :param kl_dist: desired KL distance between steps. Overrides normalized_step_size.
        :param const_learn_rate: A constant learn rate under the L2 metric (won't work very well)
        :param FIM_invert_args: {'iters': # cg iters, 'damping': regularization amount when solving with CG
        :param hvp_sample_frac: fraction of samples (>0 and <=1) to use for the Fisher metric (start with 1 and reduce if code too slow)
        :param seed: random seed
        """

        self.env = env
        self.policy = policy
        self.baseline = baseline
        self.kl_dist = kl_dist if kl_dist is not None else 0.5*normalized_step_size
        self.seed = seed
        self.save_logs = save_logs
        self.FIM_invert_args = FIM_invert_args
        self.hvp_subsample = hvp_sample_frac
        self.running_score = None
        if save_logs: self.logger = DataLog()

    def train_from_paths(self, paths):

        # Concatenate from all the trajectories
        observations = np.concatenate([path["observations"] for path in paths])
        actions = np.concatenate([path["actions"] for path in paths])
        advantages = np.concatenate([path["advantages"] for path in paths])
        # Advantage whitening
        advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6)
        # NOTE : advantage should be zero mean in expectation
        # normalized step size invariant to advantage scaling,
        # but scaling can help with least squares

        # cache return distributions for the paths
        path_returns = [sum(p["rewards"]) for p in paths]
        mean_return = np.mean(path_returns)
        std_return = np.std(path_returns)
        min_return = np.amin(path_returns)
        max_return = np.amax(path_returns)
        base_stats = [mean_return, std_return, min_return, max_return]
        self.running_score = mean_return if self.running_score is None else \
                             0.9*self.running_score + 0.1*mean_return  # approx avg of last 10 iters
        if self.save_logs: self.log_rollout_statistics(paths)

        # Keep track of times for various computations
        t_gLL = 0.0
        t_FIM = 0.0

        # Optimization algorithm
        # --------------------------
        surr_before = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]

        # VPG
        ts = timer.time()
        vpg_grad = self.flat_vpg(observations, actions, advantages)
        t_gLL += timer.time() - ts

        # NPG
        ts = timer.time()
        hvp = self.build_Hvp_eval([observations, actions],
                                  regu_coef=self.FIM_invert_args['damping'])
        npg_grad = cg_solve(hvp, vpg_grad, x_0=vpg_grad.copy(),
                            cg_iters=self.FIM_invert_args['iters'])
        t_FIM += timer.time() - ts

        # Step size computation
        # --------------------------
        n_step_size = 2.0*self.kl_dist
        alpha = np.sqrt(np.abs(n_step_size / (np.dot(vpg_grad.T, npg_grad) + 1e-20)))

        # Policy update
        # --------------------------
        curr_params = self.policy.get_param_values()
        for k in range(100):
            new_params = curr_params + alpha * npg_grad
            self.policy.set_param_values(new_params, set_new=True, set_old=False)
            kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0]
            surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]
            if kl_dist < self.kl_dist:
                break
            else:
                alpha = 0.9*alpha # backtrack
                print("Step size too high. Backtracking. | kl = %f | surr diff = %f" % \
                      (kl_dist, surr_after-surr_before) )
            if k == 99:
                alpha = 0.0  # give up: no step satisfying the KL constraint was found

        new_params = curr_params + alpha * npg_grad
        self.policy.set_param_values(new_params, set_new=True, set_old=False)
        kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0]
        surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]
        self.policy.set_param_values(new_params, set_new=True, set_old=True)

        # Log information
        if self.save_logs:
            self.logger.log_kv('alpha', alpha)
            self.logger.log_kv('delta', n_step_size)
            self.logger.log_kv('time_vpg', t_gLL)
            self.logger.log_kv('time_npg', t_FIM)
            self.logger.log_kv('kl_dist', kl_dist)
            self.logger.log_kv('surr_improvement', surr_after - surr_before)
            self.logger.log_kv('running_score', self.running_score)
            try:
                self.env.env.env.evaluate_success(paths, self.logger)
            except:
                # nested logic for backwards compatibility. TODO: clean this up.
                try:
                    success_rate = self.env.env.env.evaluate_success(paths)
                    self.logger.log_kv('success_rate', success_rate)
                except:
                    pass

        return base_stats
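
`cg_solve` is not part of this snippet; it solves F x = g for the natural gradient using only Hessian-vector products. The following is a minimal conjugate-gradient sketch of what such a routine typically looks like, not pybRL's implementation:

import numpy as np

def cg_solve(hvp, g, x_0=None, cg_iters=10, tol=1e-10):
    """Solve F x = g via conjugate gradients, given only a function hvp(v) = F v."""
    x = x_0.copy() if x_0 is not None else np.zeros_like(g)
    r = g - hvp(x)            # residual
    p = r.copy()              # search direction
    r_dot = r.dot(r)
    for _ in range(cg_iters):
        Fp = hvp(p)
        alpha = r_dot / (p.dot(Fp) + 1e-20)
        x = x + alpha * p
        r = r - alpha * Fp
        new_r_dot = r.dot(r)
        if new_r_dot < tol:
            break
        p = r + (new_r_dot / r_dot) * p
        r_dot = new_r_dot
    return x

# sanity check on a small symmetric positive-definite system
A = np.array([[4.0, 1.0], [1.0, 3.0]])
g = np.array([1.0, 2.0])
x = cg_solve(lambda v: A @ v, g, cg_iters=25)
assert np.allclose(A @ x, g, atol=1e-6)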
Code example #4
File: ars_new.py  Project: yaoxinyuan/pybRL
def train(env, policy, normalizer, hp, job_name="default_exp"):
    """
    Training using Augmented Random Search
    :param env : OpenAI gym environment
    :param policy : Object of class Policy
    :param normalizer : Object of class normalizer
    :param hp : Object of class hp
    :param job_name : Name of the directory where you want to save data
    :returns : Nothing, trains the agent  
    """
    logger = DataLog()
    total_steps = 0
    best_return = -99999999
    if not os.path.isdir(job_name):
        os.mkdir(job_name)
    previous_dir = os.getcwd()
    os.chdir(job_name)
    if not os.path.isdir('iterations'): os.mkdir('iterations')
    if not os.path.isdir('logs'): os.mkdir('logs')
    hp.to_text('hyperparameters')
    for step in range(hp.nb_steps):
        # Initializing the perturbations deltas and the positive/negative rewards
        deltas = policy.sample_deltas(hp)
        positive_rewards = [0] * hp.nb_directions
        negative_rewards = [0] * hp.nb_directions

        # Getting the positive rewards in the positive directions
        for k in range(hp.nb_directions):
            positive_rewards[k], step_count_positive = explore(
                env, normalizer, policy, "positive", deltas[k], hp)
            total_steps += step_count_positive
        # Getting the negative rewards in the negative/opposite directions
        for k in range(hp.nb_directions):
            negative_rewards[k], step_count_negative = explore(
                env, normalizer, policy, "negative", deltas[k], hp)
            total_steps += step_count_negative

        # Sorting the rollouts by the max(r_pos, r_neg) and selecting the best directions
        scores = {
            k: max(r_pos, r_neg)
            for k, (
                r_pos,
                r_neg) in enumerate(zip(positive_rewards, negative_rewards))
        }
        order = sorted(scores.keys(),
                       key=lambda x: -scores[x])[:hp.nb_best_directions]
        rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k])
                    for k in order]

        # Gathering all the positive/negative rewards to compute the standard deviation of these rewards
        all_rewards = np.array([x[0] for x in rollouts] + [x[1] for x in rollouts])
        sigma_r = all_rewards.std()  # standard deviation of the rewards from the best directions only

        # Updating our policy
        policy.update(rollouts, sigma_r, args)  # NOTE: args is not defined in this function; it is assumed to come from module scope
        # Printing the final reward of the policy after the update
        reward_evaluation, _ = explore(env, normalizer, policy, None, None, hp)
        logger.log_kv('steps', total_steps)
        logger.log_kv('return', reward_evaluation)
        if (reward_evaluation > best_return):
            best_policy = policy.theta
            best_return = reward_evaluation
            np.save("iterations/best_policy.npy", best_policy)
        print('Step:', step, 'Reward:', reward_evaluation)
        policy_path = "iterations/" + "policy_" + str(step)
        np.save(policy_path, policy.theta)
        logger.save_log('logs/')
        make_train_plots_ars(log=logger.log,
                             keys=['steps', 'return'],
                             save_loc='logs/')
Code example #5
class DAPG(NPG):
    def __init__(
            self,
            env,
            policy,
            baseline,
            demo_paths=None,
            normalized_step_size=0.01,
            FIM_invert_args={
                'iters': 10,
                'damping': 1e-4
            },
            hvp_sample_frac=1.0,
            seed=None,
            save_logs=False,
            kl_dist=None,
            lam_0=1.0,  # demo coef
            lam_1=0.95,  # decay coef
    ):

        self.env = env
        self.policy = policy
        self.baseline = baseline
        self.kl_dist = kl_dist if kl_dist is not None else 0.5 * normalized_step_size
        self.seed = seed
        self.save_logs = save_logs
        self.FIM_invert_args = FIM_invert_args
        self.hvp_subsample = hvp_sample_frac
        self.running_score = None
        self.demo_paths = demo_paths
        self.lam_0 = lam_0
        self.lam_1 = lam_1
        self.iter_count = 0.0
        if save_logs: self.logger = DataLog()

    def train_from_paths(self, paths):

        # Concatenate from all the trajectories
        observations = np.concatenate([path["observations"] for path in paths])
        actions = np.concatenate([path["actions"] for path in paths])
        advantages = np.concatenate([path["advantages"] for path in paths])
        advantages = (advantages - np.mean(advantages)) / (np.std(advantages) +
                                                           1e-6)

        if self.demo_paths is not None and self.lam_0 > 0.0:
            demo_obs = np.concatenate(
                [path["observations"] for path in self.demo_paths])
            demo_act = np.concatenate(
                [path["actions"] for path in self.demo_paths])
            demo_adv = self.lam_0 * (self.lam_1**self.iter_count) * np.ones(
                demo_obs.shape[0])
            self.iter_count += 1
            # concatenate all
            all_obs = np.concatenate([observations, demo_obs])
            all_act = np.concatenate([actions, demo_act])
            all_adv = 1e-2 * np.concatenate(
                [advantages / (np.std(advantages) + 1e-8), demo_adv])
        else:
            all_obs = observations
            all_act = actions
            all_adv = advantages

        # cache return distributions for the paths
        path_returns = [sum(p["rewards"]) for p in paths]
        mean_return = np.mean(path_returns)
        std_return = np.std(path_returns)
        min_return = np.amin(path_returns)
        max_return = np.amax(path_returns)
        base_stats = [mean_return, std_return, min_return, max_return]
        self.running_score = mean_return if self.running_score is None else \
                             0.9*self.running_score + 0.1*mean_return  # approx avg of last 10 iters
        if self.save_logs: self.log_rollout_statistics(paths)

        # Keep track of times for various computations
        t_gLL = 0.0
        t_FIM = 0.0

        # Optimization algorithm
        # --------------------------
        surr_before = self.CPI_surrogate(observations, actions,
                                         advantages).data.numpy().ravel()[0]

        # DAPG
        ts = timer.time()
        sample_coef = all_adv.shape[0] / advantages.shape[0]
        dapg_grad = sample_coef * self.flat_vpg(all_obs, all_act, all_adv)
        t_gLL += timer.time() - ts

        # NPG
        ts = timer.time()
        hvp = self.build_Hvp_eval([observations, actions],
                                  regu_coef=self.FIM_invert_args['damping'])
        npg_grad = cg_solve(hvp,
                            dapg_grad,
                            x_0=dapg_grad.copy(),
                            cg_iters=self.FIM_invert_args['iters'])
        t_FIM += timer.time() - ts

        # Step size computation
        # --------------------------
        n_step_size = 2.0 * self.kl_dist
        alpha = np.sqrt(
            np.abs(n_step_size / (np.dot(dapg_grad.T, npg_grad) + 1e-20)))

        # Policy update
        # --------------------------
        curr_params = self.policy.get_param_values()
        new_params = curr_params + alpha * npg_grad
        self.policy.set_param_values(new_params, set_new=True, set_old=False)
        surr_after = self.CPI_surrogate(observations, actions,
                                        advantages).data.numpy().ravel()[0]
        kl_dist = self.kl_old_new(observations,
                                  actions).data.numpy().ravel()[0]
        self.policy.set_param_values(new_params, set_new=True, set_old=True)

        # Log information
        if self.save_logs:
            self.logger.log_kv('alpha', alpha)
            self.logger.log_kv('delta', n_step_size)
            self.logger.log_kv('time_vpg', t_gLL)
            self.logger.log_kv('time_npg', t_FIM)
            self.logger.log_kv('kl_dist', kl_dist)
            self.logger.log_kv('surr_improvement', surr_after - surr_before)
            self.logger.log_kv('running_score', self.running_score)
            try:
                self.env.env.env.evaluate_success(paths, self.logger)
            except:
                # nested logic for backwards compatibility. TODO: clean this up.
                try:
                    success_rate = self.env.env.env.evaluate_success(paths)
                    self.logger.log_kv('success_rate', success_rate)
                except:
                    pass
        return base_stats
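
The piece that distinguishes DAPG from plain NPG is the advantage assigned to demonstration transitions: a constant lam_0 * lam_1**iter_count that decays over iterations, concatenated with the rescaled on-policy advantages. A small NumPy sketch of just that weighting, mirroring the code above on stand-in data:

import numpy as np

def augment_with_demos(obs, act, adv, demo_obs, demo_act, lam_0=1.0, lam_1=0.95, iter_count=0):
    """Mirror of the concatenation in DAPG.train_from_paths:
    demonstration transitions receive a constant advantage that decays with the iteration count."""
    demo_adv = lam_0 * (lam_1 ** iter_count) * np.ones(demo_obs.shape[0])
    all_obs = np.concatenate([obs, demo_obs])
    all_act = np.concatenate([act, demo_act])
    all_adv = 1e-2 * np.concatenate([adv / (np.std(adv) + 1e-8), demo_adv])
    return all_obs, all_act, all_adv

# stand-in data: 100 on-policy transitions, 20 demonstration transitions
rng = np.random.default_rng(0)
obs, act, adv = rng.normal(size=(100, 8)), rng.normal(size=(100, 2)), rng.normal(size=100)
demo_obs, demo_act = rng.normal(size=(20, 8)), rng.normal(size=(20, 2))
all_obs, all_act, all_adv = augment_with_demos(obs, act, adv, demo_obs, demo_act, iter_count=3)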
Code example #6
File: batch_reinforce.py  Project: yaoxinyuan/pybRL
class BatchREINFORCE:
    def __init__(self,
                 env,
                 policy,
                 baseline,
                 learn_rate=0.01,
                 seed=None,
                 save_logs=False):

        self.env = env
        self.policy = policy
        self.baseline = baseline
        self.alpha = learn_rate
        self.seed = seed
        self.save_logs = save_logs
        self.running_score = None
        if save_logs: self.logger = DataLog()

    def compute_reinforce_loss(self, observations, actions, advantages):
        """
        Computes the REINFORCE function loss. 
        :param observations : A list containing observations of all paths together
        :param actions : A list containing actions of all paths together
        :param advantages : A list containing advantages of all paths together
        :return : Loss of all the paths combined (Loss according to reinforce algorithm log(policy) * advantage)
        """
        adv_var = Variable(torch.from_numpy(advantages).float(),
                           requires_grad=False)
        old_dist_info = self.policy.old_dist_info(observations, actions)
        new_dist_info = self.policy.new_dist_info(observations, actions)
        log_policy = self.policy.likelihood_ratio(new_dist_info, old_dist_info)
        loss = torch.mean(log_policy * adv_var)
        return loss

    def kl_old_new(self, observations, actions):
        old_dist_info = self.policy.old_dist_info(observations, actions)
        new_dist_info = self.policy.new_dist_info(observations, actions)
        mean_kl = self.policy.mean_kl(new_dist_info, old_dist_info)
        return mean_kl

    def flat_vpg(self, observations, actions, advantages):
        """
        Finds the gradients with respect to the REINFORCE loss
        :param observations : A list containing observations of all paths together
        :param actions : A list containing actions of all paths together
        :param advantages : A list containing advantages of all paths together
        :return : A flat list containing gradients of the weights (as calculated by PyTorch)
        """
        loss = self.compute_reinforce_loss(observations, actions, advantages)
        if np.isinf(loss.data.numpy()):
            pdb.set_trace()  # drop into the debugger if the loss diverges
        vpg_grad = torch.autograd.grad(loss, self.policy.trainable_params)
        vpg_grad = np.concatenate(
            [g.contiguous().view(-1).data.numpy() for g in vpg_grad])
        return vpg_grad

    def train_step(self,
                   N,
                   sample_mode='trajectories',
                   env_name=None,
                   T=1e6,
                   gamma=0.995,
                   gae_lambda=0.98,
                   num_cpu='max'):
        """
        The agent performs a single complete step. It samples trajectories, finds gradients and updates it's weights based on those gradients
        :param N : Number of runs to get
        :param sample_mode : If trajectories, it uses trajectory sampler (N= 5, implies 5 different trajectories)
                             If samples, it uses batch smapler (N = 5, implies 5 different samples only)
        :param env_name : Name of env
        :param T : Maximum length of trajectory
        :param gamma : Discount Factor
        :param gae_lambda : Eligibility trace
        :param num_cpu : Number of cores to use (On real systems has to be 1)
        :return : A set of statistics regarding the current step. It returns [Mean_Return, Standard_Return, Min_Return, Max_Return]
        """
        # Clean up input arguments
        if env_name is None: env_name = self.env.env_id
        if sample_mode != 'trajectories' and sample_mode != 'samples':
            print(
                "sample_mode in NPG must be either 'trajectories' or 'samples'"
            )
            quit()

        ts = timer.time()

        if sample_mode == 'trajectories':
            paths = trajectory_sampler.sample_paths_parallel(
                N, self.policy, T, env_name, self.seed, num_cpu)
        elif sample_mode == 'samples':
            paths = batch_sampler.sample_paths(N,
                                               self.policy,
                                               T,
                                               env_name=env_name,
                                               pegasus_seed=self.seed,
                                               num_cpu=num_cpu)

        if self.save_logs:
            self.logger.log_kv('time_sampling', timer.time() - ts)

        self.seed = self.seed + N if self.seed is not None else self.seed

        # compute returns
        process_samples.compute_returns(paths, gamma)
        # compute advantages
        process_samples.compute_advantages(paths, self.baseline, gamma,
                                           gae_lambda)
        # train from paths
        eval_statistics = self.train_from_paths(paths)
        eval_statistics.append(N)
        # fit baseline
        if self.save_logs:
            ts = timer.time()
            error_before, error_after = self.baseline.fit(paths,
                                                          return_errors=True)
            self.logger.log_kv('time_VF', timer.time() - ts)
            self.logger.log_kv('VF_error_before', error_before)
            self.logger.log_kv('VF_error_after', error_after)
        else:
            self.baseline.fit(paths)

        return eval_statistics

    # ----------------------------------------------------------
    def train_from_paths(self, paths):
        """ Performs the gradient step for a given set of paths
        :param paths : Paths refers to a list of dictionaries as output by samplers
        :return : Returns a list that contains statistics for the paths. [Mean_Return, Standard_Return, Min_Return, Max_Return]
        The gradient update step is performed internally
        """
        # Concatenate from all the trajectories
        observations = np.concatenate([path["observations"] for path in paths])
        actions = np.concatenate([path["actions"] for path in paths])
        advantages = np.concatenate([path["advantages"] for path in paths])
        # Advantage whitening
        advantages = (advantages - np.mean(advantages)) / (np.std(advantages) +
                                                           1e-6)

        # cache return distributions for the paths
        path_returns = [sum(p["rewards"]) for p in paths]
        mean_return = np.mean(path_returns)
        std_return = np.std(path_returns)
        min_return = np.amin(path_returns)
        max_return = np.amax(path_returns)
        base_stats = [mean_return, std_return, min_return, max_return]
        self.running_score = mean_return if self.running_score is None else \
                             0.9*self.running_score + 0.1*mean_return  # approx avg of last 10 iters
        if self.save_logs: self.log_rollout_statistics(paths)

        # Keep track of times for various computations
        t_gLL = 0.0

        # Optimization algorithm
        # --------------------------
        loss_before_training = self.compute_reinforce_loss(
            observations, actions, advantages).data.numpy().ravel()[0]

        # VPG
        ts = timer.time()
        vpg_grad = self.flat_vpg(observations, actions, advantages)
        t_gLL += timer.time() - ts

        # Policy update
        # --------------------------
        curr_params = self.policy.get_param_values()
        new_params = curr_params + self.alpha * vpg_grad
        self.policy.set_param_values(new_params, set_new=True, set_old=False)
        loss_after_training = self.compute_reinforce_loss(
            observations, actions, advantages).data.numpy().ravel()[0]
        kl_dist = self.kl_old_new(observations,
                                  actions).data.numpy().ravel()[0]
        self.policy.set_param_values(new_params, set_new=True, set_old=True)

        # Log information
        if self.save_logs:
            self.logger.log_kv('alpha', self.alpha)
            self.logger.log_kv('time_vpg', t_gLL)
            self.logger.log_kv('kl_dist', kl_dist)
            self.logger.log_kv('loss_improvement',
                               loss_after_training - loss_before_training)
            self.logger.log_kv('running_score', self.running_score)
            try:
                self.env.env.env.evaluate_success(paths, self.logger)
            except:
                # nested logic for backwards compatibility. TODO: clean this up.
                try:
                    success_rate = self.env.env.env.evaluate_success(paths)
                    self.logger.log_kv('success_rate', success_rate)
                except:
                    pass

        return base_stats

    def log_rollout_statistics(self, paths):
        path_returns = [sum(p["rewards"]) for p in paths]
        mean_return = np.mean(path_returns)
        std_return = np.std(path_returns)
        min_return = np.amin(path_returns)
        max_return = np.amax(path_returns)
        self.logger.log_kv('stoc_pol_mean', mean_return)
        self.logger.log_kv('stoc_pol_std', std_return)
        self.logger.log_kv('stoc_pol_max', max_return)
        self.logger.log_kv('stoc_pol_min', min_return)
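
`compute_reinforce_loss` evaluates a likelihood-ratio surrogate against a frozen copy of the policy, so its gradient at the old parameters matches the familiar log-probability-times-advantage REINFORCE gradient. A compact stand-alone illustration of that surrogate on raw log-probabilities (not pybRL's policy API):

import torch

def reinforce_surrogate(logp_new, logp_old, advantages):
    """Likelihood-ratio surrogate: mean( exp(logp_new - logp_old) * A ).
    At logp_new == logp_old the ratio is 1, and the gradient with respect to the new
    policy equals the classic REINFORCE estimate mean( grad logp * A )."""
    ratio = torch.exp(logp_new - logp_old.detach())
    return torch.mean(ratio * advantages)

# toy check: treating the log-probabilities themselves as the "parameters"
logp = torch.randn(64, requires_grad=True)
adv = torch.randn(64)
surrogate = reinforce_surrogate(logp, logp.detach(), adv)
surrogate.backward()
# gradient of the surrogate w.r.t. logp is advantage-weighted: advantages / 64
assert torch.allclose(logp.grad, adv / 64)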
Code example #7
class BC:
    def __init__(self,
                 expert_paths,
                 policy,
                 epochs=5,
                 batch_size=64,
                 lr=1e-3,
                 optimizer=None):

        self.policy = policy
        self.expert_paths = expert_paths
        self.epochs = epochs
        self.mb_size = batch_size
        self.logger = DataLog()

        # get transformations
        observations = np.concatenate(
            [path["observations"] for path in expert_paths])
        actions = np.concatenate([path["actions"] for path in expert_paths])
        in_shift, in_scale = np.mean(observations,
                                     axis=0), np.std(observations, axis=0)
        out_shift, out_scale = np.mean(actions, axis=0), np.std(actions,
                                                                axis=0)

        # set scalings in the target policy
        self.policy.model.set_transformations(in_shift, in_scale, out_shift,
                                              out_scale)
        self.policy.old_model.set_transformations(in_shift, in_scale,
                                                  out_shift, out_scale)

        # set the variance of gaussian policy based on out_scale
        params = self.policy.get_param_values()
        params[-self.policy.m:] = np.log(out_scale + 1e-12)
        self.policy.set_param_values(params)

        # construct optimizer
        self.optimizer = torch.optim.Adam(
            self.policy.model.parameters(),
            lr=lr) if optimizer is None else optimizer

        # loss criterion is MSE for maximum likelihood estimation
        self.loss_function = torch.nn.MSELoss()

    def loss(self, obs, act):
        obs_var = Variable(torch.from_numpy(obs).float(), requires_grad=False)
        act_var = Variable(torch.from_numpy(act).float(), requires_grad=False)
        act_hat = self.policy.model(obs_var)
        return self.loss_function(act_hat, act_var.detach())

    def train(self):
        observations = np.concatenate(
            [path["observations"] for path in self.expert_paths])
        actions = np.concatenate(
            [path["actions"] for path in self.expert_paths])

        params_before_opt = self.policy.get_param_values()
        ts = timer.time()
        num_samples = observations.shape[0]
        for ep in tqdm(range(self.epochs)):
            self.logger.log_kv('epoch', ep)
            loss_val = self.loss(observations, actions).data.numpy().ravel()[0]
            self.logger.log_kv('loss', loss_val)
            self.logger.log_kv('time', (timer.time() - ts))
            for mb in range(int(num_samples / self.mb_size)):
                rand_idx = np.random.choice(num_samples, size=self.mb_size)
                obs = observations[rand_idx]
                act = actions[rand_idx]
                self.optimizer.zero_grad()
                loss = self.loss(obs, act)
                loss.backward()
                self.optimizer.step()
        params_after_opt = self.policy.get_param_values()
        self.policy.set_param_values(params_after_opt,
                                     set_new=True,
                                     set_old=True)
        self.logger.log_kv('epoch', self.epochs)
        loss_val = self.loss(observations, actions).data.numpy().ravel()[0]
        self.logger.log_kv('loss', loss_val)
        self.logger.log_kv('time', (timer.time() - ts))
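
This BC variant regresses the policy mean onto the expert actions with an MSE loss; for a Gaussian policy with fixed variance, minimizing MSE maximizes the log-likelihood up to a constant. A tiny numerical check of that equivalence (the `sigma` value is arbitrary):

import torch

mu = torch.randn(128, 4, requires_grad=True)   # predicted action means
act = torch.randn(128, 4)                      # expert actions
sigma = 0.5                                    # assumed fixed action std (arbitrary)

mse = torch.mean((mu - act) ** 2)
nll = -torch.distributions.Normal(mu, sigma).log_prob(act).mean()

# the two gradients w.r.t. mu differ only by the constant factor 1 / (2 * sigma**2)
g_mse, = torch.autograd.grad(mse, mu)
g_nll, = torch.autograd.grad(nll, mu)
assert torch.allclose(g_nll, g_mse / (2 * sigma ** 2))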
Code example #8
class PPO(BatchREINFORCE):
    def __init__(self,
                 env,
                 policy,
                 baseline,
                 clip_coef=0.2,
                 epochs=10,
                 mb_size=64,
                 learn_rate=3e-4,
                 seed=0,
                 save_logs=False):

        self.env = env
        self.policy = policy
        self.baseline = baseline
        self.learn_rate = learn_rate
        self.seed = seed
        self.save_logs = save_logs
        self.clip_coef = clip_coef
        self.epochs = epochs
        self.mb_size = mb_size
        self.running_score = None
        if save_logs: self.logger = DataLog()

        self.optimizer = torch.optim.Adam(self.policy.trainable_params,
                                          lr=learn_rate)

    def PPO_surrogate(self, observations, actions, advantages):
        adv_var = Variable(torch.from_numpy(advantages).float(),
                           requires_grad=False)
        old_dist_info = self.policy.old_dist_info(observations, actions)
        new_dist_info = self.policy.new_dist_info(observations, actions)
        LR = self.policy.likelihood_ratio(new_dist_info, old_dist_info)
        LR_clip = torch.clamp(LR,
                              min=1 - self.clip_coef,
                              max=1 + self.clip_coef)
        ppo_surr = torch.mean(torch.min(LR * adv_var, LR_clip * adv_var))
        return ppo_surr

    # ----------------------------------------------------------
    def train_from_paths(self, paths):

        # Concatenate from all the trajectories
        observations = np.concatenate([path["observations"] for path in paths])
        actions = np.concatenate([path["actions"] for path in paths])
        advantages = np.concatenate([path["advantages"] for path in paths])
        # Advantage whitening
        advantages = (advantages - np.mean(advantages)) / (np.std(advantages) +
                                                           1e-6)
        # NOTE : advantage should be zero mean in expectation
        # normalized step size invariant to advantage scaling,
        # but scaling can help with least squares

        # cache return distributions for the paths
        path_returns = [sum(p["rewards"]) for p in paths]
        mean_return = np.mean(path_returns)
        std_return = np.std(path_returns)
        min_return = np.amin(path_returns)
        max_return = np.amax(path_returns)
        base_stats = [mean_return, std_return, min_return, max_return]
        self.running_score = mean_return if self.running_score is None else \
                             0.9*self.running_score + 0.1*mean_return  # approx avg of last 10 iters
        if self.save_logs: self.log_rollout_statistics(paths)

        # Optimization algorithm
        # --------------------------
        surr_before = self.compute_reinforce_loss(
            observations, actions, advantages).data.numpy().ravel()[0]
        params_before_opt = self.policy.get_param_values()

        ts = timer.time()
        num_samples = observations.shape[0]
        for ep in range(self.epochs):
            for mb in range(int(num_samples / self.mb_size)):
                rand_idx = np.random.choice(num_samples, size=self.mb_size)
                obs = observations[rand_idx]
                act = actions[rand_idx]
                adv = advantages[rand_idx]
                self.optimizer.zero_grad()
                loss = -self.PPO_surrogate(obs, act, adv)
                loss.backward()
                self.optimizer.step()

        params_after_opt = self.policy.get_param_values()
        surr_after = self.compute_reinforce_loss(
            observations, actions, advantages).data.numpy().ravel()[0]
        kl_dist = self.kl_old_new(observations,
                                  actions).data.numpy().ravel()[0]
        self.policy.set_param_values(params_after_opt,
                                     set_new=True,
                                     set_old=True)
        t_opt = timer.time() - ts

        # Log information
        if self.save_logs:
            self.logger.log_kv('t_opt', t_opt)
            self.logger.log_kv('kl_dist', kl_dist)
            self.logger.log_kv('surr_improvement', surr_after - surr_before)
            self.logger.log_kv('running_score', self.running_score)
            try:
                self.env.env.env.evaluate_success(paths, self.logger)
            except:
                # nested logic for backwards compatibility. TODO: clean this up.
                try:
                    success_rate = self.env.env.env.evaluate_success(paths)
                    self.logger.log_kv('success_rate', success_rate)
                except:
                    pass

        return base_stats
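
The clipped objective in `PPO_surrogate` can be exercised in isolation. A minimal stand-alone version on raw log-probabilities follows; it mirrors the clamping and `torch.min` above but is illustrative, not the repository's API:

import torch

def ppo_clip_objective(logp_new, logp_old, adv, clip_coef=0.2):
    """min( r*A, clip(r, 1-eps, 1+eps)*A ) with r = exp(logp_new - logp_old)."""
    ratio = torch.exp(logp_new - logp_old)
    clipped = torch.clamp(ratio, 1.0 - clip_coef, 1.0 + clip_coef)
    return torch.mean(torch.min(ratio * adv, clipped * adv))

# maximizing the objective is done by minimizing its negation, as in train_from_paths above
logp_new = torch.randn(64, requires_grad=True)
logp_old = (logp_new + 0.01 * torch.randn(64)).detach()
adv = torch.randn(64)
loss = -ppo_clip_objective(logp_new, logp_old, adv)
loss.backward()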
Code example #9
class NPG(BatchREINFORCE):
    def __init__(self,
                 env,
                 policy,
                 baseline,
                 normalized_step_size=0.01,
                 const_learn_rate=None,
                 FIM_invert_args={
                     'iters': 10,
                     'damping': 1e-4
                 },
                 hvp_sample_frac=1.0,
                 seed=None,
                 save_logs=False,
                 kl_dist=None):
        """
        All inputs are expected in pybRL's format unless specified
        :param normalized_step_size: Normalized step size (under the KL metric). Twice the desired KL distance
        :param kl_dist: desired KL distance between steps. Overrides normalized_step_size.
        :param const_learn_rate: A constant learn rate under the L2 metric (won't work very well)
        :param FIM_invert_args: {'iters': # cg iters, 'damping': regularization amount when solving with CG
        :param hvp_sample_frac: fraction of samples (>0 and <=1) to use for the Fisher metric (start with 1 and reduce if code too slow)
        :param seed: random seed
        """

        self.env = env
        self.policy = policy
        self.baseline = baseline
        self.alpha = const_learn_rate
        self.n_step_size = normalized_step_size if kl_dist is None else 2.0 * kl_dist
        self.seed = seed
        self.save_logs = save_logs
        self.FIM_invert_args = FIM_invert_args
        self.hvp_subsample = hvp_sample_frac
        self.running_score = None
        if save_logs: self.logger = DataLog()

    def HVP(self, observations, actions, vector, regu_coef=None):
        """
        Computes the hessian vector product for a set of observations and actions
        :param observations : A numpy nd array of observations 
        :param actions : A numpy nd array of actions
        :param vector : Input vector to multiply the gradients by
        :param regu_coef : It is just a coefficient that makes the Vector Product non singular (positive definite)
        :return : The hessian vector product as a 1d np array
        """
        regu_coef = self.FIM_invert_args[
            'damping'] if regu_coef is None else regu_coef
        vec = Variable(torch.from_numpy(vector).float(), requires_grad=False)
        if self.hvp_subsample is not None and self.hvp_subsample < 0.99:
            num_samples = observations.shape[0]
            rand_idx = np.random.choice(num_samples,
                                        size=int(self.hvp_subsample *
                                                 num_samples))
            obs = observations[rand_idx]
            act = actions[rand_idx]
        else:
            obs = observations
            act = actions
        old_dist_info = self.policy.old_dist_info(obs, act)
        new_dist_info = self.policy.new_dist_info(obs, act)
        mean_kl = self.policy.mean_kl(new_dist_info, old_dist_info)
        grad_fo = torch.autograd.grad(mean_kl,
                                      self.policy.trainable_params,
                                      create_graph=True)
        flat_grad = torch.cat([g.contiguous().view(-1) for g in grad_fo])
        h = torch.sum(flat_grad * vec)
        hvp = torch.autograd.grad(h, self.policy.trainable_params)
        hvp_flat = np.concatenate(
            [g.contiguous().view(-1).data.numpy() for g in hvp])
        return hvp_flat + regu_coef * vector

    def build_Hvp_eval(self, inputs, regu_coef=None):
        """
        Returns a function that computes A*v for a vector v. This is basically needed to use conjugate gradients algorithm
        :param inputs : A list containing [observations, actions]
        :param regu_coef : A coefficient that makes it positive definite/ non singular not sure
        :return : A function that calculates A*v. It will be used in the cg_solve method
        """
        def eval(v):
            full_inp = inputs + [v] + [regu_coef]
            Hvp = self.HVP(*full_inp)
            return Hvp

        return eval

    # ----------------------------------------------------------
    def train_from_paths(self, paths):
        """
        Trains the agent from paths using Natural Policy Gradients
        :param paths: List of dictionaries as taken by the sampler
        :return : Performs the gradient step internally. Returns a list containing [mean_return, std_return, min_return, max_return]
        for the given paths.
        """
        # Concatenate from all the trajectories
        observations = np.concatenate([path["observations"] for path in paths])
        actions = np.concatenate([path["actions"] for path in paths])
        advantages = np.concatenate([path["advantages"] for path in paths])
        # Advantage whitening
        advantages = (advantages - np.mean(advantages)) / (np.std(advantages) +
                                                           1e-6)
        # NOTE : advantage should be zero mean in expectation
        # normalized step size invariant to advantage scaling,
        # but scaling can help with least squares

        # cache return distributions for the paths
        path_returns = [sum(p["rewards"]) for p in paths]
        mean_return = np.mean(path_returns)
        std_return = np.std(path_returns)
        min_return = np.amin(path_returns)
        max_return = np.amax(path_returns)
        base_stats = [mean_return, std_return, min_return, max_return]
        self.running_score = mean_return if self.running_score is None else \
                             0.9*self.running_score + 0.1*mean_return  # approx avg of last 10 iters
        if self.save_logs: self.log_rollout_statistics(paths)

        # Keep track of times for various computations
        t_gLL = 0.0
        t_FIM = 0.0

        # Optimization algorithm
        # --------------------------
        reinforce_loss_before = self.compute_reinforce_loss(
            observations, actions, advantages).data.numpy().ravel()[0]

        # VPG
        ts = timer.time()
        vpg_grad = self.flat_vpg(observations, actions, advantages)
        t_gLL += timer.time() - ts

        # NPG
        ts = timer.time()
        hvp = self.build_Hvp_eval([observations, actions],
                                  regu_coef=self.FIM_invert_args['damping'])
        npg_grad = cg_solve(hvp,
                            vpg_grad,
                            x_0=vpg_grad.copy(),
                            cg_iters=self.FIM_invert_args['iters'])
        t_FIM += timer.time() - ts

        # Step size computation
        # --------------------------
        if self.alpha is not None:
            alpha = self.alpha
            n_step_size = (alpha**2) * np.dot(vpg_grad.T, npg_grad)
        else:
            n_step_size = self.n_step_size
            alpha = np.sqrt(
                np.abs(self.n_step_size /
                       (np.dot(vpg_grad.T, npg_grad) + 1e-20)))

        # Policy update
        # --------------------------
        curr_params = self.policy.get_param_values()
        new_params = curr_params + alpha * npg_grad
        self.policy.set_param_values(new_params, set_new=True, set_old=False)
        reinforce_loss_after = self.compute_reinforce_loss(
            observations, actions, advantages).data.numpy().ravel()[0]
        kl_dist = self.kl_old_new(observations,
                                  actions).data.numpy().ravel()[0]
        self.policy.set_param_values(new_params, set_new=True, set_old=True)

        # Log information
        if self.save_logs:
            self.logger.log_kv('alpha', alpha)
            self.logger.log_kv('delta', n_step_size)
            self.logger.log_kv('time_vpg', t_gLL)
            self.logger.log_kv('time_npg', t_FIM)
            self.logger.log_kv('kl_dist', kl_dist)
            self.logger.log_kv('surr_improvement',
                               reinforce_loss_after - reinforce_loss_before)
            self.logger.log_kv('running_score', self.running_score)
            try:
                self.env.env.env.evaluate_success(paths, self.logger)
            except:
                # nested logic for backwards compatibility. TODO: clean this up.
                try:
                    success_rate = self.env.env.env.evaluate_success(paths)
                    self.logger.log_kv('success_rate', success_rate)
                except:
                    pass

        return base_stats
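
`HVP` uses the standard double-backward trick: differentiate the mean KL once with create_graph=True, dot the flattened gradient with the input vector, and differentiate again. A self-contained sketch of the same trick on a generic scalar function, with a small analytic check:

import torch

def hvp(f_scalar, params, vector, damping=1e-4):
    """Hessian-vector product of a scalar function w.r.t. params via double backward."""
    grads = torch.autograd.grad(f_scalar, params, create_graph=True)
    flat_grad = torch.cat([g.reshape(-1) for g in grads])
    grad_dot_v = torch.sum(flat_grad * vector)
    hv = torch.autograd.grad(grad_dot_v, params)
    flat_hv = torch.cat([g.reshape(-1) for g in hv])
    return flat_hv + damping * vector

# analytic check: for f = 0.5 * x^T A x with symmetric A, the HVP equals A @ v
A = torch.tensor([[3.0, 1.0], [1.0, 2.0]])
x = torch.tensor([1.0, -2.0], requires_grad=True)
v = torch.tensor([0.5, 1.5])
f = 0.5 * x @ A @ x
assert torch.allclose(hvp(f, [x], v, damping=0.0), A @ v)   # [3.0, 3.5]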