Example 1
    def __init__(self, env, policy, baseline, max_kl):
        """ env     = only structural info of env is used here; 
                      you need to pass the 'mode' to functions of this class
            max_kl  = constraint for determining step-size (suggested: 1e-2 or 5e-3)
        """

        self.policy = policy
        self.env = env
        self.baseline = baseline

        self.optimizer = ConjugateGradientOptimizer(**dict())

        # Define symbolic variables
        self.observations_var = self.env.observation_space.new_tensor_variable(
            'observations', extra_dims=1)
        self.actions_var = self.env.action_space.new_tensor_variable(
            'actions', extra_dims=1)
        self.advantages_var = TT.vector('advantages')

        self.dist = self.policy.distribution

        self.old_dist_info_vars = {
            k: ext.new_tensor('old_%s' % k, ndim=2, dtype=theano.config.floatX)
            for k in self.dist.dist_info_keys
        }
        self.old_dist_info_vars_list = [
            self.old_dist_info_vars[k] for k in self.dist.dist_info_keys
        ]

        self.state_info_vars = {
            k: ext.new_tensor(k, ndim=2, dtype=theano.config.floatX)
            for k in self.policy.state_info_keys
        }
        self.state_info_vars_list = [
            self.state_info_vars[k] for k in self.policy.state_info_keys
        ]

        self.dist_info_vars = self.policy.dist_info_sym(
            self.observations_var, self.state_info_vars)
        # distribution info variable (symbolic) -- interpret as pi
        self.KL = self.dist.kl_sym(self.old_dist_info_vars,
                                   self.dist_info_vars)
        self.LR = self.dist.likelihood_ratio_sym(self.actions_var,
                                                 self.old_dist_info_vars,
                                                 self.dist_info_vars)
        self.mean_KL = TT.mean(self.KL)

        self.surr = -TT.mean(self.LR * self.advantages_var)

        self.input_list = [self.observations_var, self.actions_var, self.advantages_var] + \
                          self.state_info_vars_list + self.old_dist_info_vars_list
        self.optimizer.update_opt(loss=self.surr, target=self.policy, \
                                  leq_constraint=(self.mean_KL, max_kl), \
                                  inputs=self.input_list, constraint_name="mean_kl")
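For orientation, the input_list order fixed above (observations, actions, advantages, then state info, then old distribution info) is the order in which sampled data must later be fed back to the optimizer. A minimal sketch of that step, modelled on the train_from_paths method shown in Example 25 (samples_data, policy and optimizer are assumed to already exist):

from rllab.misc import ext

# samples_data is assumed to hold flat arrays concatenated over sampled paths,
# plus an "agent_infos" dict, exactly as assembled in Example 25.
all_inputs = tuple(ext.extract(samples_data, "observations", "actions", "advantages"))
agent_infos = samples_data["agent_infos"]
all_inputs += tuple(agent_infos[k] for k in policy.state_info_keys)
all_inputs += tuple(agent_infos[k] for k in policy.distribution.dist_info_keys)

# One constrained conjugate-gradient step: minimize the surrogate loss
# subject to mean KL <= max_kl.
optimizer.optimize(all_inputs)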
Example 2
 def __init__(self,
              optimizer=None,
              optimizer_args=None,
              optimizer_low=None,
              **kwargs):
     if optimizer is None:
         if optimizer_args is None:
             optimizer_args = dict()
         optimizer = ConjugateGradientOptimizer(**optimizer_args)
         optimizer_low = ConjugateGradientOptimizer(**optimizer_args)
     super(TRPO, self).__init__(optimizer=optimizer,
                                optimizer_low=optimizer_low,
                                **kwargs)
Example 3
 def __init__(self, optimizer=None, optimizer_args=None, **kwargs):
     print("init in TRPO_snn")
     if optimizer is None:
         if optimizer_args is None:
             optimizer_args = dict()
         optimizer = ConjugateGradientOptimizer(**optimizer_args)
     super(TRPO_snn, self).__init__(optimizer=optimizer, **kwargs)
Example 4
    def __init__(
            self,
            optimizer=None,
            optimizer_args=None,
            observation_permutation = None,
            action_permutation = None,
            sym_loss_weight = 0.0001,
            action_reg_weight = 0.0,
            **kwargs):
        if optimizer is None:
            if optimizer_args is None:
                optimizer_args = dict()
            optimizer = ConjugateGradientOptimizer(**optimizer_args)
        super(TRPO_Symmetry, self).__init__(optimizer=optimizer, **kwargs)

        self.observation_permutation = observation_permutation
        self.action_permutation = action_permutation
        self.sym_loss_weight = sym_loss_weight
        self.action_reg_weight = action_reg_weight

        self.obs_perm_mat = np.zeros((len(observation_permutation), len(observation_permutation)))
        self.act_per_mat = np.zeros((len(action_permutation), len(action_permutation)))
        for i, perm in enumerate(self.observation_permutation):
            self.obs_perm_mat[i][int(np.abs(perm))] = np.sign(perm)
        for i, perm in enumerate(self.action_permutation):
            self.act_per_mat[i][int(np.abs(perm))] = np.sign(perm)
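The loops above build signed permutation matrices: row i of obs_perm_mat holds a single +1 or -1 in column |observation_permutation[i]|. A self-contained sketch of how such a matrix mirrors a batch of observations (the matrix values and dimensions here are hypothetical, not taken from the source project):

import numpy as np

# Hypothetical 4-D observation mirroring: swap dims 0/1, swap-and-negate dims 2/3.
obs_perm_mat = np.array([[0., 1., 0., 0.],
                         [1., 0., 0., 0.],
                         [0., 0., 0., -1.],
                         [0., 0., -1., 0.]])

obs_batch = np.random.randn(5, 4)          # (batch, obs_dim)
mirrored = obs_batch.dot(obs_perm_mat.T)   # row i selects the signed source dimension

assert np.allclose(mirrored[:, 0], obs_batch[:, 1])
assert np.allclose(mirrored[:, 2], -obs_batch[:, 3])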
Example 5
File: trpo.py Project: zhmz90/rllab
 def __init__(self, optimizer=None, optimizer_args=None, **kwargs):
     Serializable.quick_init(self, locals())
     if optimizer is None:
         if optimizer_args is None:
             optimizer_args = dict()
         optimizer = ConjugateGradientOptimizer(**optimizer_args)
     super(TRPO, self).__init__(optimizer=optimizer, **kwargs)
Example 6
def run_task(*_):
    # Please note that different environments with different action spaces may
    # require different policies. For example with a Discrete action space, a
    # CategoricalMLPPolicy works, but for a Box action space you may need to use
    # a GaussianMLPPolicy (see the trpo_gym_pendulum.py example)
    env = normalize(GymEnv("CartPole-v0", record_video=False, force_reset=True))

    policy = CategoricalMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32)
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
        optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5, symmetric=False))
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
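In rllab, a run_task function like the one above is normally handed to run_experiment_lite to launch the experiment; a sketch of that launcher, following the pattern of Examples 20, 21 and 24 (parameter values are illustrative):

from rllab.misc.instrument import run_experiment_lite

run_experiment_lite(
    run_task,
    n_parallel=1,
    snapshot_mode="last",
    seed=1,
    # plot=True,  # enable together with plot=True inside the TRPO(...) call above
)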
Example 7
 def __init__(
         self,
         optimizer=None,
         optimizer_args=None,
         guiding_policies=[],  # guiding policies for training up
         guiding_policy_mps=[],
         guiding_policy_weight=0.0,
         guiding_policy_batch_sizes=[],
         guiding_policy_pool_size=0,
         guiding_policy_sample_size=0,
         **kwargs):
     if optimizer is None:
         if optimizer_args is None:
             optimizer_args = dict()
         optimizer = ConjugateGradientOptimizer(**optimizer_args)
     super(TRPOGuide, self).__init__(optimizer=optimizer, **kwargs)
     self.guiding_policies = guiding_policies
     if len(self.guiding_policies) != 0:
         self.guiding_policy_mps = guiding_policy_mps
         self.guiding_policy_weight = guiding_policy_weight
         self.guiding_policy_batch_sizes = guiding_policy_batch_sizes
         self.guiding_policy_sample_pool = SimpleGuidingSamplePool(
             guiding_policy_pool_size, self.env.observation_space.shape[0],
             self.env.action_space.shape[0])
         self.guiding_policy_sample_size = guiding_policy_sample_size
Example 8
 def __init__(self, optimizer=None, optimizer_args=None, **kwargs):
     if optimizer is None:
         if optimizer_args is None:
             optimizer_args = dict()
         optimizer = ConjugateGradientOptimizer(**optimizer_args)
         #optimizer = PenaltyLbfgsOptimizer(**optimizer_args)
     super(TRPO, self).__init__(optimizer=optimizer, **kwargs)
Example 9
 def __init__(self, optimizer=None, optimizer_args=None, **kwargs):
     n = len(kwargs['policy'].policies)
     if optimizer is None:
         if optimizer_args is None:
             optimizer_args = dict()
         optimizers = [
             ConjugateGradientOptimizer(**optimizer_args) for _ in range(n)
         ]
      else:
          # assumed: a caller-supplied optimizer (or list of optimizers) is used as-is,
          # avoiding a NameError when optimizer is not None
          optimizers = optimizer
      super(MultiTRPO, self).__init__(optimizer=optimizers, **kwargs)
Example 10
 def __init__(self, optimizer=None, optimizer_args=None, **kwargs):
     if optimizer is None:
         default_args = dict(max_backtracks=1)
         if optimizer_args is None:
             optimizer_args = default_args
         else:
             optimizer_args = dict(default_args, **optimizer_args)
         optimizer = ConjugateGradientOptimizer(**optimizer_args)
     super(TNPG, self).__init__(optimizer=optimizer, **kwargs)
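The dict(default_args, **optimizer_args) idiom above merges caller-supplied settings on top of the single-backtrack default. A small stand-alone illustration (cg_iters is just one example of a ConjugateGradientOptimizer argument):

# Caller overrides are merged on top of the TNPG default of one backtrack.
default_args = dict(max_backtracks=1)
user_args = dict(cg_iters=20)
merged = dict(default_args, **user_args)
print(merged)   # {'max_backtracks': 1, 'cg_iters': 20}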
Example 11
 def __init__(self,
              optimizer=None,
              optimizer_args=None,
              task_num=1,
              **kwargs):
     if optimizer is None:
         if optimizer_args is None:
             optimizer_args = dict()
         optimizer = ConjugateGradientOptimizer(**optimizer_args)
     self.task_num = task_num
     self.kl_weights = np.ones(task_num)
     super(TRPO_MultiTask, self).__init__(optimizer=optimizer, **kwargs)
Example 12
 def __init__(
         self,
         optimizers=None,     #dictionary of optimizers
         optimizer_args=None, #dictionary of optimizer parameters
         agent_names = ['hide', 'seek'], #names of agents
         **kwargs):
     if optimizers is None:
         optimizers = {}
         for name in agent_names:
             if optimizer_args is None:
                 optimizer_args = dict()
             optimizers[name] = ConjugateGradientOptimizer(**optimizer_args)
     super(TRPO, self).__init__(optimizers=optimizers, **kwargs)
Example 13
    def __init__(self, env, policy, baseline, max_kl):
        """ env     = only structural info of env is used here; 
                      you need to pass the 'mode' to functions of this class
            max_kl  = constraint for determining step-size (suggested: 1e-2 or 5e-3)
        """
        
        self.policy     = policy
        self.env        = env
        self.baseline   = baseline

        self.optimizer  = ConjugateGradientOptimizer(**dict())

        # Define symbolic variables
        self.observations_var = self.env.observation_space.new_tensor_variable('observations', extra_dims=1)
        self.actions_var      = self.env.action_space.new_tensor_variable('actions', extra_dims=1)
        self.advantages_var   = TT.vector('advantages')

        self.dist = self.policy.distribution  

        self.old_dist_info_vars = {
            k: ext.new_tensor(
                'old_%s' % k,
                ndim=2,
                dtype=theano.config.floatX
            ) for k in self.dist.dist_info_keys
            }
        self.old_dist_info_vars_list = [self.old_dist_info_vars[k] for k in self.dist.dist_info_keys]

        self.state_info_vars = {
            k: ext.new_tensor(
                k,
                ndim=2,
                dtype=theano.config.floatX
            ) for k in self.policy.state_info_keys
        }
        self.state_info_vars_list = [self.state_info_vars[k] for k in self.policy.state_info_keys]

        self.dist_info_vars = self.policy.dist_info_sym(self.observations_var, self.state_info_vars)   
        # distribution info variable (symbolic) -- interpret as pi
        self.KL = self.dist.kl_sym(self.old_dist_info_vars, self.dist_info_vars)
        self.LR = self.dist.likelihood_ratio_sym(self.actions_var, self.old_dist_info_vars, self.dist_info_vars)
        self.mean_KL = TT.mean(self.KL)
        
        self.surr = - TT.mean(self.LR * self.advantages_var)

        self.input_list = [self.observations_var, self.actions_var, self.advantages_var] + \
                          self.state_info_vars_list + self.old_dist_info_vars_list
        self.optimizer.update_opt(loss=self.surr, target=self.policy, \
                                  leq_constraint=(self.mean_KL, max_kl), \
                                  inputs=self.input_list, constraint_name="mean_kl")
Example 14
def run_task(*_):
    env = normalize(CartpoleEnv())

    policy = GaussianGRUPolicy(env_spec=env.spec, )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=4000,
                max_path_length=100,
                n_itr=10,
                discount=0.99,
                step_size=0.01,
                optimizer=ConjugateGradientOptimizer(
                    hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)))
    algo.train()
Example 15
    def __init__(self,
                 optimizer=None,
                 optimizer_args=None,
                 aux_pred_step=3,
                 aux_pred_dim=4,
                 pool_batch_size=50000,
                 **kwargs):
        if optimizer is None:
            if optimizer_args is None:
                optimizer_args = dict()
            optimizer = ConjugateGradientOptimizer(**optimizer_args)
        self.pool_batch_size = pool_batch_size
        self.aux_pred_step = aux_pred_step
        super(TRPOAux, self).__init__(optimizer=optimizer, **kwargs)

        self.aux_pred_dim = aux_pred_dim
        self.aux_pred_pool = SimpleReplayPoolAux(
            50000, self.env.observation_space.shape[0] * self.aux_pred_step,
            aux_pred_dim)
Example 16
    def _buildRLAlg(useCG, algDict, optimizerArgs=None):
        #RL Algorithm
        if optimizerArgs is None:
            optimizerArgs = dict()

        if useCG:
            #either use CG optimizer == TRPO
            optimizer = ConjugateGradientOptimizer(**optimizerArgs)
        #or use BFGS optimizer == penalized policy optimization. TODO: can this be an avenue to PPO? Doesn't it also require likelihood truncation?
        else:
            optimizer = PenaltyLbfgsOptimizer(**optimizerArgs)
        #NPO's constructor expects:
        #self.optimizer = optimizer - need to specify this or else it defaults to PenaltyLbfgsOptimizer
        #self.step_size = step_size : defaults to 0.01
        #truncate_local_is_ratio means to truncate the distribution likelihood ratio, which is defined as
        #  lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars, dist_info_vars)
        # if truncation is not None : lr = TT.minimum(self.truncate_local_is_ratio, lr)
        #self.truncate_local_is_ratio = truncate_local_is_ratio

        algo = NPO(optimizer=optimizer, **algDict)
        return algo
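A hypothetical call to the helper above; algDict is assumed to carry the usual NPO constructor arguments (compare the NPO(...) call in Example 23), and env, policy and baseline are assumed to be constructed elsewhere:

algDict = dict(env=env, policy=policy, baseline=baseline,
               batch_size=4000, max_path_length=500, n_itr=500,
               discount=0.99, step_size=0.01)
algo = _buildRLAlg(useCG=True, algDict=algDict)   # CG optimizer == TRPO-style update
algo.train()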
Example 17
    def __init__(self,
                 optimizer=None,
                 optimizer_args=None,
                 mp_dim=2,
                 **kwargs):
        if optimizer is None:
            if optimizer_args is None:
                optimizer_args = dict()
            optimizer = ConjugateGradientOptimizer(**optimizer_args)

        self.mp_dim = mp_dim

        # generate data for weight-entropy-related objective terms
        self.base = np.zeros(4)
        ent_input = []
        for i in range(1000):
            ent_input.append(
                np.concatenate([self.base,
                                np.random.random(self.mp_dim)]).tolist())
        self.ent_input = [np.array(ent_input)]

        super(TRPOMPSel, self).__init__(optimizer=optimizer, **kwargs)
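The loop above collects 1000 rows of [base, random mp weights]; a statistically equivalent vectorized construction, shown stand-alone with placeholder values for base and mp_dim:

import numpy as np

base = np.zeros(4)
mp_dim = 2
# 1000 rows, each the concatenation of the fixed base and fresh random mp weights.
ent_input = np.hstack([np.tile(base, (1000, 1)), np.random.random((1000, mp_dim))])
print(ent_input.shape)   # (1000, 6)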
Example 18
def run_task(*_):
    # Non-registration of this custom environment is an rllab bug
    # See https://github.com/openai/rllab/issues/68
    # At the moment I'm bypassing this problem by adding the
    # import statement in gym_env.py
    import gym_follower_2d
    import lasagne.nonlinearities as NL

    gymenv = GymEnv(args.env,
                    force_reset=True,
                    record_video=False,
                    record_log=True)
    env = normalize(gymenv)

    logger.log("Training Policy on %s" % args.env)

    policy = GaussianMLPPolicy(env_spec=env.spec,
                               hidden_sizes=(100, 50, 25),
                               hidden_nonlinearity=NL.tanh)

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=args.batch_size,
        max_path_length=100,
        n_itr=args.num_epochs,
        discount=0.99,
        step_size=args.step_size,
        optimizer=ConjugateGradientOptimizer(
            reg_coeff=args.reg_coeff,
            hvp_approach=FiniteDifferenceHvp(base_eps=args.reg_coeff)),
        plot=False,
    )

    algo.train()
Example 19
                                  decent_portion=0.8,
                                  disc_window=2,
                                  iteration=3000,
                                  disc_joints_dim=16,
                                  hidden_sizes=(128, 64, 32))

# baseline
#env = normalize(HumanEnv_v2(discriminator=None), normalize_obs=True)
# GAN imitation
env = normalize(HumanEnv_v2(discriminator=discriminator), normalize_obs=True)
# print(env.action_space.bounds)
# print(env.observation_space.bounds)

policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(100, 50, 25))

base_line_optimizer = ConjugateGradientOptimizer()
baseline = GaussianMLPBaseline(env.spec,
                               regressor_args={
                                   "mean_network": None,
                                   "hidden_sizes": (100, 50, 25),
                                   "hidden_nonlinearity": NL.tanh,
                                   "optimizer": base_line_optimizer,
                                   "use_trust_region": True,
                                   "step_size": 0.01,
                                   "learn_std": True,
                                   "init_std": 1.0,
                                   "adaptive_std": False,
                                   "std_share_network": False,
                                   "std_hidden_sizes": (32, 32),
                                   "std_nonlinearity": None,
                                   "normalize_inputs": True,
Example 20
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize
from rllab.policies.gaussian_gru_policy import GaussianGRUPolicy
from rllab.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer, FiniteDifferenceHvp
from rllab.misc.instrument import stub, run_experiment_lite

stub(globals())

env = normalize(CartpoleEnv())

policy = GaussianGRUPolicy(env_spec=env.spec, )

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(env=env,
            policy=policy,
            baseline=baseline,
            batch_size=4000,
            max_path_length=100,
            n_itr=10,
            discount=0.99,
            step_size=0.01,
            optimizer=ConjugateGradientOptimizer(
                hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)))
run_experiment_lite(
    algo.train(),
    n_parallel=1,
    seed=1,
)
Example 21
def run_trpo_vase(env,nRuns = 20,seed_base=0, sigma_c=0.5, ablation_mode=False):

    now = datetime.datetime.now(dateutil.tz.tzlocal())
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S')

    for seed in range(seed_base,nRuns):

        if env == 'mountaincar':
            mdp = MountainCarEnvX()
            n_itr = 50
            max_path_length = 500
            type = 'classic'
        elif env == 'cartpole':
            mdp = NormalizedEnv(env=CartpoleSwingupEnvX())
            n_itr = 400
            max_path_length = 500
            type = 'classic'
        elif env == 'doublependulum':
            mdp = NormalizedEnv(env=DoublePendulumEnvX())
            n_itr = 400
            max_path_length = 500
            type = 'classic'
        elif env == 'halfcheetah':
            mdp = NormalizedEnv(env=HalfCheetahEnvX())
            n_itr = 600
            max_path_length = 500
            type = 'locomotion'
        elif env == 'ant':
            mdp = NormalizedEnv(env=AntEnv())
            n_itr = 600
            max_path_length = 500
            type = 'locomotion'
        elif env == 'lunarlander':
            mdp = NormalizedEnv(env=LunarLanderContinuous())
            n_itr = 100
            max_path_length = 1000
            type = 'classic'
        else:
            sys.stderr.write("Error! Environment '%s' not recognised\n" % env)
            sys.exit(-1)

        if type == 'classic':
            step_size = 0.01
            replay_pool_size = 100000
            policy_hidden_sizes = (32,)
            unn_n_hidden = [32]
            unn_layers_type=[1, 1]

            baseline = GaussianMLPBaseline(
                env_spec=mdp.spec,
                regressor_args={
                    'hidden_sizes': (32,),
                    'learn_std': False,
                    'hidden_nonlinearity': NL.rectify,
                    'optimizer': ConjugateGradientOptimizer(subsample_factor=1.0)
                }
            )
        else:
            step_size = 0.05
            replay_pool_size = 5000000
            policy_hidden_sizes = (64, 32)
            unn_n_hidden = [64, 64]
            unn_layers_type=[1, 1, 1]

            baseline = LinearFeatureBaseline(
                mdp.spec,
            )

        policy = GaussianMLPPolicy(
            env_spec=mdp.spec,
            hidden_sizes=policy_hidden_sizes,
            hidden_nonlinearity=NL.tanh
        )


        algo = TRPO(
            env=mdp,
            policy=policy,
            baseline=baseline,
            n_itr=n_itr,
            batch_size=5000,
            max_path_length = max_path_length,
            discount = 0.995,
            gae_lambda = 0.95,
            whole_paths=True,
            step_size=step_size,
            eta=1e-4,
            snn_n_samples=10,
            prior_sd=0.5,
            likelihood_sd=sigma_c,
            subsample_factor=1.0,
            use_replay_pool=True,
            replay_pool_size=replay_pool_size,
            n_updates_per_sample=500,
            unn_n_hidden=unn_n_hidden,
            unn_layers_type=unn_layers_type,
            unn_learning_rate=0.001
        )

        exp_name = "trpo-vase_%s_%04d" % (timestamp, seed + 1)
        if ablation_mode:
            cwd = os.getcwd()
            log_dir = cwd + "/data/local/sigmas/" + env + ("/%.3f/" % sigma_c) + exp_name
        else:
            log_dir = config.LOG_DIR + "/local/" + env +  "/" + exp_name

        run_experiment_lite(
            algo.train(),
            exp_name = exp_name,
            log_dir= log_dir,
            n_parallel=0,
            snapshot_mode="last",
            seed=seed,
            mode="local",
            script="sandbox/vase/experiments/run_experiment_lite.py"
        )
Example 22
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--resume_from', type=str)
    parser.add_argument('--encoding_levels', type=int, nargs='+')
    parser.add_argument('--num_encoding_levels', type=int, default=5)
    parser.add_argument('--conv_filters',
                        nargs='*',
                        type=int,
                        default=[16, 16])
    parser.add_argument('--conv_filter_sizes',
                        nargs='*',
                        type=int,
                        default=[4, 4])
    parser.add_argument('--conv_strides', nargs='*', type=int, default=[2, 2])
    parser.add_argument('--hidden_sizes',
                        nargs='*',
                        type=int,
                        default=[32, 32])
    parser.add_argument('--init_std', type=float, default=1.0)
    parser.add_argument('--n_itr', type=int, default=500)
    parser.add_argument('--step_size', type=float, default=0.01)
    parser.add_argument('--batch_size', type=int, default=4000)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--custom_local_flags', type=str, default=None)
    args = parser.parse_args()

    np.random.seed(args.seed)

    env = SimpleQuadPanda3dEnv(action_space=TranslationAxisAngleSpace(
        low=[-10., -10., -10., -1.5707963267948966],
        high=[10., 10., 10., 1.5707963267948966],
        axis=[0., 0., 1.]),
                               sensor_names=['image'],
                               camera_size=[256, 256],
                               camera_hfov=26.007823885645635,
                               car_env_class=GeometricCarPanda3dEnv,
                               car_action_space=BoxSpace(low=[0., 0.],
                                                         high=[0., 0.]),
                               car_model_names=[
                                   'mazda6', 'chevrolet_camaro',
                                   'nissan_gt_r_nismo',
                                   'lamborghini_aventador', 'golf5'
                               ],
                               dt=0.1)
    env = ServoingEnv(env)
    transformers = {
        'image':
        CompositionTransformer([
            ImageTransformer(scale_size=0.5),
            OpsTransformer(transpose=(2, 0, 1))
        ]),
        'action':
        NormalizerTransformer(space=env.action_space)
    }
    env = RllabEnv(env, transformers=transformers)
    env = normalize(env)

    assert len(args.conv_filters) == len(args.conv_filter_sizes)
    assert len(args.conv_filters) == len(args.conv_strides)
    network_kwargs = dict(encoding_levels=args.encoding_levels,
                          num_encoding_levels=args.num_encoding_levels,
                          conv_filters=args.conv_filters,
                          conv_filter_sizes=args.conv_filter_sizes,
                          conv_strides=args.conv_strides,
                          conv_pads=[0] * len(args.conv_filters),
                          hidden_sizes=args.hidden_sizes,
                          hidden_nonlinearity=LN.rectify,
                          output_nonlinearity=None,
                          name="mean_network")
    mean_network = VggConvNetwork(input_shape=env.observation_space.shape,
                                  output_dim=env.action_space.flat_dim,
                                  **network_kwargs)

    policy = GaussianConvPolicy(
        env_spec=env.spec,
        init_std=args.init_std,
        mean_network=mean_network,
    )

    conv_baseline_kwargs = dict(
        env_spec=env.spec,
        regressor_args=dict(
            mean_network=VggConvNetwork(
                input_shape=env.observation_space.shape,
                output_dim=1,
                **network_kwargs),
            use_trust_region=True,
            step_size=args.step_size,
            normalize_inputs=True,
            normalize_outputs=True,
            hidden_sizes=None,
            conv_filters=None,
            conv_filter_sizes=None,
            conv_strides=None,
            conv_pads=None,
            batchsize=200,
            optimizer=PenaltyLbfgsOptimizer(n_slices=50),
        ))
    baseline = GaussianConvBaseline(**conv_baseline_kwargs)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=args.batch_size,
        max_path_length=100,
        n_itr=args.n_itr,
        discount=0.9,
        step_size=args.step_size,
        optimizer=ConjugateGradientOptimizer(num_slices=50),
    )

    if args.resume_from:
        run_experiment_lite(algo.train(),
                            snapshot_mode='gap',
                            snapshot_gap=10,
                            seed=args.seed,
                            custom_local_flags=args.custom_local_flags,
                            resume_from=args.resume_from)
    else:
        run_experiment_lite(algo.train(),
                            snapshot_mode='gap',
                            snapshot_gap=10,
                            seed=args.seed,
                            custom_local_flags=args.custom_local_flags)
Example 23
def run_task(v):
    expDict = v
    ###############################
    #Env
    if (expDict['isNormalized']):
        if (expDict['isGymEnv']):
            env = normalize(
                GymEnv(expDict['envName'],
                       record_video=False,
                       record_log=False))
        else:
            env = normalize(expDict['envName'])
        #if env is normalized then it is wrapped
        #dartEnv = env.wrapped_env.env.unwrapped
    else:  #if not normalized, needs to be gym environment
        env = GymEnv(expDict['envName'], record_video=False, record_log=False)

    if (expDict['blType'] == 'linear'):
        bl = LinearFeatureBaseline(env_spec=env.spec)
    elif (expDict['blType'] == 'MLP'):
        #use regressor_args as dict to define regressor arguments like layers
        regArgs = dict()
        regArgs['hidden_sizes'] = expDict['blMlpArch']
        #only used if adaptive_std == True
        regArgs['std_hidden_sizes'] = expDict['blMlpArch']
        #defaults to normalizing
        regArgs['normalize_inputs'] = False
        regArgs['normalize_outputs'] = False
        #regArgs['adaptive_std'] = True
        #regArgs['learn_std']= False  #ignored if adaptive_std == true - sets global value which is required for all thread instances
        bl = GaussianMLPBaseline(env_spec=env.spec, regressor_args=regArgs)
    else:
        print('unknown baseline type : ' + expDict['blType'])
        bl = None

    ###############################
    #Policy
    pol = GaussianMLPPolicy(
        env_spec=env.spec,
        # must be a tuple - a single value needs a trailing comma, i.e. (8,)
        hidden_sizes=expDict['polNetArch']
    )

    ###############################
    #RL Algorithm

    #allow for either trpo or ppo
    optimizerArgs = expDict['optimizerArgs']
    if optimizerArgs is None: optimizerArgs = dict()

    if expDict['useCG']:
        #either use CG optimizer == TRPO
        optimizer = ConjugateGradientOptimizer(**optimizerArgs)
        print('Using CG optimizer (TRPO)')
    #or use BFGS optimizer -> PPO? not really
    else:
        optimizer = PenaltyLbfgsOptimizer(**optimizerArgs)
        print('Using LBFGS optimizer (PPO-like ?)')
    #NPO's constructor expects:
    #self.optimizer = optimizer - need to specify this or else it defaults to PenaltyLbfgsOptimizer
    #self.step_size = step_size : defaults to 0.01
    #truncate_local_is_ratio means to truncate the distribution likelihood ratio, which is defined as
    #  lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars, dist_info_vars)
    # if truncation is not None : lr = TT.minimum(self.truncate_local_is_ratio, lr)
    #self.truncate_local_is_ratio = truncate_local_is_ratio
    algo = NPO(optimizer=optimizer,
               env=env,
               policy=pol,
               baseline=bl,
               batch_size=int(expDict['numBatches']),
               whole_paths=True,
               gae_lambda=float(expDict['gae_lambda']),
               max_path_length=int(expDict['maxPathLength']),
               n_itr=int(expDict['numIters']),
               discount=0.99,
               step_size=0.01,
               start_itr=1)

    algo.train()
Example 24
def run_trpo(env, nRuns=20, seed_base=0):

    now = datetime.datetime.now(dateutil.tz.tzlocal())
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S')

    for seed in range(seed_base, nRuns):

        if env == 'mountaincar':
            mdp = MountainCarEnvX()
            n_itr = 50
            max_path_length = 500
            type = 'classic'
        elif env == 'cartpole':
            mdp = NormalizedEnv(env=CartpoleSwingupEnvX())
            n_itr = 400
            max_path_length = 500
            type = 'classic'
        elif env == 'doublependulum':
            mdp = NormalizedEnv(env=DoublePendulumEnvX())
            n_itr = 400
            max_path_length = 500
            type = 'classic'
        elif env == 'halfcheetah':
            mdp = NormalizedEnv(env=HalfCheetahEnvX())
            n_itr = 600
            max_path_length = 500
            type = 'locomotion'
        elif env == 'ant':
            mdp = NormalizedEnv(env=AntEnv())
            n_itr = 600
            max_path_length = 500
            type = 'locomotion'
        elif env == 'lunarlander':
            mdp = NormalizedEnv(env=LunarLanderContinuous())
            n_itr = 100
            max_path_length = 1000
            type = 'classic'
        else:
            sys.stderr.write("Error! Environment '%s' not recognised\n" % env)
            sys.exit(-1)

        if type == 'classic':
            step_size = 0.01
            policy_hidden_sizes = (32, )

            baseline = GaussianMLPBaseline(
                env_spec=mdp.spec,
                regressor_args={
                    'hidden_sizes': (32, ),
                    'learn_std': False,
                    'hidden_nonlinearity': NL.rectify,
                    'optimizer': ConjugateGradientOptimizer(subsample_factor=1.0)
                })
        else:
            step_size = 0.05
            policy_hidden_sizes = (64, 32)

            baseline = LinearFeatureBaseline(mdp.spec, )

        policy = GaussianMLPPolicy(env_spec=mdp.spec,
                                   hidden_sizes=policy_hidden_sizes,
                                   hidden_nonlinearity=NL.tanh)

        algo = TRPO(
            env=mdp,
            policy=policy,
            baseline=baseline,
            batch_size=5000,
            whole_paths=True,
            max_path_length=max_path_length,
            n_itr=n_itr,
            step_size=step_size,
            subsample_factor=1.0,
        )

        exp_name = "trpo_%s_%04d" % (timestamp, seed + 1)
        log_dir = config.LOG_DIR + "/local/" + env + "/" + exp_name

        run_experiment_lite(algo.train(),
                            exp_name=exp_name,
                            log_dir=log_dir,
                            n_parallel=0,
                            snapshot_mode="last",
                            seed=seed,
                            mode="local")
Example 25
class TRPO:
   
    def __init__(self, env, policy, baseline, max_kl):
        """ env     = only structural info of env is used here; 
                      you need to pass the 'mode' to functions of this class
            max_kl  = constraint for determining step-size (suggested: 1e-2 or 5e-3)
        """
        
        self.policy     = policy
        self.env        = env
        self.baseline   = baseline

        self.optimizer  = ConjugateGradientOptimizer(**dict())

        # Define symbolic variables
        self.observations_var = self.env.observation_space.new_tensor_variable('observations', extra_dims=1)
        self.actions_var      = self.env.action_space.new_tensor_variable('actions', extra_dims=1)
        self.advantages_var   = TT.vector('advantages')

        self.dist = self.policy.distribution  

        self.old_dist_info_vars = {
            k: ext.new_tensor(
                'old_%s' % k,
                ndim=2,
                dtype=theano.config.floatX
            ) for k in self.dist.dist_info_keys
            }
        self.old_dist_info_vars_list = [self.old_dist_info_vars[k] for k in self.dist.dist_info_keys]

        self.state_info_vars = {
            k: ext.new_tensor(
                k,
                ndim=2,
                dtype=theano.config.floatX
            ) for k in self.policy.state_info_keys
        }
        self.state_info_vars_list = [self.state_info_vars[k] for k in self.policy.state_info_keys]

        self.dist_info_vars = self.policy.dist_info_sym(self.observations_var, self.state_info_vars)   
        # distribution info variable (symbolic) -- interpret as pi
        self.KL = self.dist.kl_sym(self.old_dist_info_vars, self.dist_info_vars)
        self.LR = self.dist.likelihood_ratio_sym(self.actions_var, self.old_dist_info_vars, self.dist_info_vars)
        self.mean_KL = TT.mean(self.KL)
        
        self.surr = - TT.mean(self.LR * self.advantages_var)

        self.input_list = [self.observations_var, self.actions_var, self.advantages_var] + \
                          self.state_info_vars_list + self.old_dist_info_vars_list
        self.optimizer.update_opt(loss=self.surr, target=self.policy, \
                                  leq_constraint=(self.mean_KL, max_kl), \
                                  inputs=self.input_list, constraint_name="mean_kl")


    def train(self, N, T, gamma, niter, env_mode='train'):
        """    N = number of trajectories
               T = horizon
               niter = number of iterations to update the policy
               env_mode = can be 'train', 'test' or something else. 
                          You need to write the appropriate function in MDP_funcs
        """        

        eval_statistics = []
        for iter in range(niter):
            curr_iter_stats = self.train_step(N, T, gamma, env_mode)
            eval_statistics.append(curr_iter_stats)

        return eval_statistics


    def train_step(self, N, T, gamma, env_mode='train', 
        num_cpu='max',
        save_paths=False,
        idx=None,
        mujoco_env=True, 
        normalized_env=False,
        sub_sample=None,
        train_env=None):
        """    N = number of trajectories
               T = horizon
               env_mode = can be 'train', 'test' or something else. 
                          You need to write the appropriate function in MDP_funcs
        """
        
        
        if train_env is None:
            paths = sample_paths_parallel(N, self.policy, self.baseline, env_mode, 
                T, gamma, num_cpu=num_cpu, mujoco_env=mujoco_env, normalized_env=normalized_env)
        else:
            paths = sample_paths(N, self.policy, self.baseline, env=train_env, T=T, gamma=gamma,
                mujoco_env=mujoco_env, normalized_env=normalized_env)

        # save the paths used to make the policy update
        if save_paths and idx is not None:
            robustRL.utils.save_paths(paths, idx)

        eval_statistics = self.train_from_paths(paths, sub_sample=sub_sample)
        eval_statistics[0].append(N)

        return eval_statistics


    def train_from_paths(self, paths, sub_sample=None, path_percentile=[10,15,33,50,66,85,90]):
        
        if sub_sample is not None:
            # Pick subset of paths whose returns are in the sub_sample percentile range
            path_returns = [sum(p["rewards"]) for p in paths]
            sub_range = [np.percentile(path_returns, sub_sample[i]) for i in range(2)]
            # Find paths which satisfy criteria
            idx = [i for i, ret in enumerate(path_returns) if sub_range[0] <= ret <= sub_range[1]]
            chosen_paths = [paths[i] for i in idx]
        else:
            chosen_paths = paths

        self.baseline.fit(paths)
        # concatenate from all the trajectories
        observations = tensor_utils.concat_tensor_list([path["observations"] for path in chosen_paths])
        actions      = tensor_utils.concat_tensor_list([path["actions"] for path in chosen_paths])
        rewards      = tensor_utils.concat_tensor_list([path["rewards"] for path in chosen_paths])
        advantages   = tensor_utils.concat_tensor_list([path["advantages"] for path in chosen_paths])
        env_infos    = tensor_utils.concat_tensor_dict_list([path["env_infos"] for path in chosen_paths])
        agent_infos  = tensor_utils.concat_tensor_dict_list([path["agent_infos"] for path in chosen_paths])

        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
        )
        
        all_input_values = tuple(ext.extract(
            samples_data,
            "observations", "actions", "advantages"
        ))
        agent_infos = samples_data["agent_infos"]
        state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
        dist_info_list = [agent_infos[k] for k in self.policy.distribution.dist_info_keys]
        all_input_values += tuple(state_info_list) + tuple(dist_info_list)
        
        # Take a step with optimizer
        self.optimizer.optimize(all_input_values)

        # cache return distributions for the paths
        path_returns = [sum(p["rewards"]) for p in paths]
        mean_return  = np.mean(path_returns)
        std_return   = np.std(path_returns)
        min_return   = np.amin(path_returns)
        max_return   = np.amax(path_returns)
        sub_mean     = np.mean([sum(p["rewards"]) for p in chosen_paths])

        base_stats = [mean_return, std_return, min_return, max_return, sub_mean]
        percentile_stats = []
        for p in path_percentile:
            percentile_stats.append(np.percentile(path_returns, p))

        return [base_stats, percentile_stats]
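A hypothetical driver for the class above; env, policy and baseline are assumed to be rllab-compatible objects constructed elsewhere, and the numbers are placeholders:

# Build the agent and run the update loop defined by train()/train_step() above.
agent = TRPO(env=env, policy=policy, baseline=baseline, max_kl=1e-2)
stats = agent.train(N=10, T=500, gamma=0.995, niter=100, env_mode='train')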
Example 26
            env_spec=env.spec,
            hidden_sizes=(32, ),
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args={
                'hidden_sizes': (32, ),
                'hidden_nonlinearity': NL.tanh,
                'learn_std': False,
                'step_size': 0.01,
                'optimizer': ConjugateGradientOptimizer(
                    subsample_factor=trpo_subsample_factor)
            })

    elif task_type == 'locomotion':

        trpo_max_path_length = 500
        trpo_batch_size = 5000
        trpo_subsample_factor = 1
        trpo_step_size = 0.05
        expl_lambda = 0.001

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 32),
        )