Example #1
 def __init__(self, desc='two-state', map_id=None):
     self._map_id = map_id
     Serializable.quick_init(self, locals())
     if isinstance(desc, str):
         desc = MAPS[desc]
     self.desc_choices = desc
     self.reset()
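This and most of the examples below call `Serializable.quick_init(self, locals())` during `__init__`. A minimal sketch of what that rllab-style pattern provides, assuming a simplified `Serializable` (the real class also handles *args/**kwargs and nested constructors):

import inspect

class Serializable(object):
    # Simplified sketch: capture the constructor arguments so the object
    # can be pickled by re-invoking __init__ with the same arguments.
    def quick_init(self, locals_):
        if getattr(self, "_serializable_initialized", False):
            return
        sig = inspect.signature(self.__init__)
        self.__args = {name: locals_[name]
                       for name in sig.parameters if name in locals_}
        self._serializable_initialized = True

    def __getstate__(self):
        return {"__args": self.__args}

    def __setstate__(self, d):
        # Rebuild the object by re-running __init__ on unpickling.
        self.__init__(**d["__args"])

This is also why the examples call `quick_init` before mutating any of the constructor arguments.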
Example #2
    def __init__(
            self,
            update_method=lasagne.updates.adam,
            learning_rate=1e-3,
            max_epochs=1000,
            tolerance=1e-6,
            batch_size=32,
            callback=None,
            verbose=False,
            **kwargs):
        """

        :param max_epochs:
        :param tolerance:
        :param update_method:
        :param batch_size: None or an integer. If None, the whole dataset will be used.
        :param callback:
        :param kwargs:
        :return:
        """
        Serializable.quick_init(self, locals())
        self._opt_fun = None
        self._target = None
        self._callback = callback
        update_method = partial(update_method, learning_rate=learning_rate)
        self._update_method = update_method
        self._max_epochs = max_epochs
        self._tolerance = tolerance
        self._batch_size = batch_size
        self._verbose = verbose
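The `batch_size` semantics documented above can be made concrete with a small helper; a sketch, assuming the dataset is a tuple of equally sized numpy arrays (the helper name is illustrative, not part of the optimizer):

import numpy as np

def iterate_minibatches(inputs, batch_size):
    # batch_size=None: yield the whole dataset once per epoch.
    # Otherwise: yield shuffled minibatches of the given size.
    n = len(inputs[0])
    if batch_size is None:
        yield inputs
        return
    idx = np.random.permutation(n)
    for start in range(0, n, batch_size):
        batch = idx[start:start + batch_size]
        yield tuple(x[batch] for x in inputs)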
Example #3
    def __init__(
            self,
            epsilon=0.5,
            L2_reg_dual=0.,  # 1e-5,
            L2_reg_loss=0.,
            max_opt_itr=50,
            optimizer=scipy.optimize.fmin_l_bfgs_b,
            **kwargs):
        """

        :param epsilon: Max KL divergence between new policy and old policy.
        :param L2_reg_dual: Dual regularization
        :param L2_reg_loss: Loss regularization
        :param max_opt_itr: Maximum number of batch optimization iterations.
        :param optimizer: Module path to the optimizer. It must support the same interface as
        scipy.optimize.fmin_l_bfgs_b.
        :return:
        """
        Serializable.quick_init(self, locals())
        super(REPS, self).__init__(**kwargs)
        self.epsilon = epsilon
        self.L2_reg_dual = L2_reg_dual
        self.L2_reg_loss = L2_reg_loss
        self.max_opt_itr = max_opt_itr
        self.optimizer = optimizer
        self.opt_info = None
Example #4
    def __init__(self, env_spec, hidden_sizes=(32, 32), hidden_nonlinearity=NL.tanh, prob_network=None):
        """
        :param env_spec: A spec for the mdp.
        :param hidden_sizes: list of sizes for the fully connected hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param prob_network: manually specified network for this policy, other network params
        are ignored
        :return:
        """
        Serializable.quick_init(self, locals())

        assert isinstance(env_spec.action_space, Discrete)

        if prob_network is None:
            prob_network = MLP(
                input_shape=(env_spec.observation_space.flat_dim,),
                output_dim=env_spec.action_space.n,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.softmax,
            )

        self._l_prob = prob_network.output_layer
        self._l_obs = prob_network.input_layer
        self._f_prob = ext.compile_function(
            [prob_network.input_layer.input_var], L.get_output(prob_network.output_layer)
        )

        self._dist = Categorical(env_spec.action_space.n)

        super(CategoricalMLPPolicy, self).__init__(env_spec)
        LasagnePowered.__init__(self, [prob_network.output_layer])
Example #5
    def __init__(self, env, ma_mode):
        Serializable.quick_init(self, locals())

        self.env = env
        if hasattr(env, 'id'):
            self.env_id = env.id
        else:
            self.env_id = 'MA-Wrapper-v0'

        if ma_mode == 'centralized':
            obsfeat_space = convert_gym_space(env.agents[0].observation_space,
                                              n_agents=len(env.agents))
            action_space = convert_gym_space(env.agents[0].action_space, n_agents=len(env.agents))
        elif ma_mode in ['decentralized', 'concurrent']:
            obsfeat_space = convert_gym_space(env.agents[0].observation_space, n_agents=1)
            action_space = convert_gym_space(env.agents[0].action_space, n_agents=1)

        else:
            raise NotImplementedError

        self._observation_space = obsfeat_space
        self._action_space = action_space
        if hasattr(env, 'timestep_limit'):
            self._horizon = env.timestep_limit
        else:
            self._horizon = 250
Example #6
    def __init__(
        self, cg_iters=10, reg_coeff=1e-5, subsample_factor=0.1, backtrack_ratio=0.8, max_backtracks=15, debug_nan=False
    ):
        """

        :param cg_iters: The number of CG iterations used to calculate A^-1 g
        :param reg_coeff: A small value so that A -> A + reg*I
        :param subsample_factor: Subsampling factor to reduce samples when using conjugate gradient. Since the
        computation time for the descent direction dominates, this can greatly reduce the overall computation time.
        :param debug_nan: if set to True, NanGuard will be added to the compilation, and ipdb will be invoked when
        nan is detected
        :return:
        """
        Serializable.quick_init(self, locals())
        self._cg_iters = cg_iters
        self._reg_coeff = reg_coeff
        self._subsample_factor = subsample_factor
        self._backtrack_ratio = backtrack_ratio
        self._max_backtracks = max_backtracks

        self._opt_fun = None
        self._target = None
        self._max_constraint_val = None
        self._constraint_name = None
        self._debug_nan = debug_nan
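The docstring above refers to estimating A^-1 g with `cg_iters` conjugate gradient iterations, where A is only available through matrix-vector products (e.g. regularized Hessian-vector products A + reg*I). A self-contained sketch of the standard CG loop this kind of optimizer relies on:

import numpy as np

def cg(f_Ax, b, cg_iters=10, residual_tol=1e-10):
    # Solve A x = b given only the matrix-vector product f_Ax(v).
    x = np.zeros_like(b)
    r = b.copy()          # residual b - A x (x starts at 0)
    p = b.copy()          # search direction
    rdotr = r.dot(r)
    for _ in range(cg_iters):
        z = f_Ax(p)
        alpha = rdotr / p.dot(z)
        x += alpha * p
        r -= alpha * z
        new_rdotr = r.dot(r)
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
        if rdotr < residual_tol:
            break
    return x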
Example #7
 def __init__(self, regressors):
     """
     :param regressors: List of individual regressors
     """
     Serializable.quick_init(self, locals())
     self.regressors = regressors
     self.output_dims = [x.output_dim for x in regressors]
Example #8
 def __init__(self, mdp_cls, mdp_args):
     Serializable.quick_init(self, locals())
     self.mdp_cls = mdp_cls
     self.mdp_args = dict(mdp_args)
     self.mdp_args["template_args"] = dict(noise=True)
     mdp = self.gen_mdp()
     super(IdentificationEnv, self).__init__(mdp)
Example #9
 def __init__(
         self,
         env,
         policy,
         n_itr=500,
         max_path_length=500,
         discount=0.99,
         sigma0=1.,
         batch_size=None,
         plot=False,
         **kwargs
 ):
     """
     :param n_itr: Number of iterations.
     :param max_path_length: Maximum length of a single rollout.
     :param batch_size: Number of sample trajectories drawn from the parameter distribution; when this
     is set, n_samples is ignored
     :param discount: Discount.
     :param plot: Plot evaluation run after each iteration.
     :param sigma0: Initial std for param dist
     :return:
     """
     Serializable.quick_init(self, locals())
     self.env = env
     self.policy = policy
     self.plot = plot
     self.sigma0 = sigma0
     self.discount = discount
     self.max_path_length = max_path_length
     self.n_itr = n_itr
     self.batch_size = batch_size
Example #10
 def __init__(self, obj, method_name, args, kwargs):
     self._serializable_initialized = False
     Serializable.quick_init(self, locals())
     self.obj = obj
     self.method_name = method_name
     self.args = args
     self.kwargs = kwargs
Example #11
    def __init__(self, env_name, record_video=True, video_schedule=None, log_dir=None, record_log=True,
                 force_reset=False):
        if log_dir is None:
            if logger.get_snapshot_dir() is None:
                logger.log("Warning: skipping Gym environment monitoring since snapshot_dir not configured.")
            else:
                log_dir = os.path.join(logger.get_snapshot_dir(), "gym_log")
        Serializable.quick_init(self, locals())

        env = gym.envs.make(env_name)
        self.env = env
        self.env_id = env.spec.id

        monitor_manager.logger.setLevel(logging.WARNING)

        assert not (not record_log and record_video)

        if log_dir is None or record_log is False:
            self.monitoring = False
        else:
            if not record_video:
                video_schedule = NoVideoSchedule()
            else:
                if video_schedule is None:
                    video_schedule = CappedCubicVideoSchedule()
            self.env = gym.wrappers.Monitor(self.env, log_dir, video_callable=video_schedule, force=True)
            self.monitoring = True

        self._observation_space = convert_gym_space(env.observation_space)
        self._action_space = convert_gym_space(env.action_space)
        self._horizon = env.spec.timestep_limit
        self._log_dir = log_dir
        self._force_reset = force_reset
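`NoVideoSchedule` and `CappedCubicVideoSchedule` above are callables handed to `gym.wrappers.Monitor` as `video_callable`, which receives an episode count and returns whether to record. One plausible implementation, matching gym's default capped-cubic behavior:

class NoVideoSchedule(object):
    def __call__(self, count):
        return False

class CappedCubicVideoSchedule(object):
    # Record episodes whose index is a perfect cube (0, 1, 8, 27, ...)
    # until 1000, then every 1000 episodes.
    def __call__(self, count):
        if count < 1000:
            return int(round(count ** (1. / 3))) ** 3 == count
        return count % 1000 == 0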
Example #12
 def __init__(self, name, max_opt_itr=20, callback=None):
     Serializable.quick_init(self, locals())
     self._name = name
     self._max_opt_itr = max_opt_itr
     self._opt_fun = None
     self._target = None
     self._callback = callback
Example #13
 def __init__(
         self,
         ctrl_cost_coeff=1e-2,
         *args, **kwargs):
     self.ctrl_cost_coeff = ctrl_cost_coeff
     super(SwimmerEnv, self).__init__(*args, **kwargs)
     Serializable.quick_init(self, locals())
Example #14
    def __init__(
            self,
            name,
            max_opt_itr=20,
            initial_penalty=1.0,
            min_penalty=1e-2,
            max_penalty=1e6,
            increase_penalty_factor=2,
            decrease_penalty_factor=0.5,
            max_penalty_itr=10,
            adapt_penalty=True):
        Serializable.quick_init(self, locals())
        self._name = name
        self._max_opt_itr = max_opt_itr
        self._penalty = initial_penalty
        self._initial_penalty = initial_penalty
        self._min_penalty = min_penalty
        self._max_penalty = max_penalty
        self._increase_penalty_factor = increase_penalty_factor
        self._decrease_penalty_factor = decrease_penalty_factor
        self._max_penalty_itr = max_penalty_itr
        self._adapt_penalty = adapt_penalty

        self._opt_fun = None
        self._target = None
        self._max_constraint_val = None
        self._constraint_name = None
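A sketch of the adaptive-penalty rule these parameters suggest (hypothetical helper name; the actual optimizer interleaves this with LBFGS steps and constraint checks):

def _adapt_penalty(self, penalty, constraint_violated):
    # Scale the penalty up while the constraint is violated, down once it
    # is satisfied, and keep it inside [min_penalty, max_penalty].
    if constraint_violated:
        penalty *= self._increase_penalty_factor
    else:
        penalty *= self._decrease_penalty_factor
    return min(max(penalty, self._min_penalty), self._max_penalty)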
Example #15
 def __init__(self,
              env,
              obs_noise=1e-1,
              ):
     super(NoisyObservationEnv, self).__init__(env)
     Serializable.quick_init(self, locals())
     self.obs_noise = obs_noise
Example #16
    def __init__(self, goal_reward=10, actuation_cost_coeff=30,
                 distance_cost_coeff=1, init_sigma=0.1):
        super().__init__()
        Serializable.quick_init(self, locals())

        self.dynamics = PointDynamics(dim=2, sigma=0)
        self.init_mu = np.zeros(2, dtype=np.float32)
        self.init_sigma = init_sigma
        self.goal_positions = np.array(
            [
                [5, 0],
                [-5, 0],
                [0, 5],
                [0, -5]
            ],
            dtype=np.float32
        )
        self.goal_threshold = 1.
        self.goal_reward = goal_reward
        self.action_cost_coeff = actuation_cost_coeff
        self.distance_cost_coeff = distance_cost_coeff
        self.xlim = (-7, 7)
        self.ylim = (-7, 7)
        self.vel_bound = 1.
        self.reset()
        self.observation = None

        self._ax = None
        self._env_lines = []
        self.fixed_plots = None
        self.dynamic_plots = []
Example #17
 def __init__(
         self,
         observation_space,
         action_space):
     Serializable.quick_init(self, locals())
     self._observation_space = observation_space
     self._action_space = action_space
Example #18
    def __init__(self, env_spec, obs_pl, action, scope_name=None):
        Serializable.quick_init(self, locals())

        self._obs_pl = obs_pl
        self._action = action
        self._scope_name = (tf.get_variable_scope().name
                            if not scope_name else scope_name)
        super(NNPolicy, self).__init__(env_spec)
Example #19
 def __init__(
         self,
         ctrl_cost_coeff=1e-2,
         *args, **kwargs):
     self.ctrl_cost_coeff = ctrl_cost_coeff
     self._goal_vel = None
     super(SwimmerRandGoalEnv, self).__init__(*args, **kwargs)
     Serializable.quick_init(self, locals())
Example #20
 def __init__(self, env_spec, max_sigma=1.0, min_sigma=0.1, decay_period=1000000):
     assert isinstance(env_spec.action_space, Box)
     assert len(env_spec.action_space.shape) == 1
     Serializable.quick_init(self, locals())
     self._max_sigma = max_sigma
     self._min_sigma = min_sigma
     self._decay_period = decay_period
     self._action_space = env_spec.action_space
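A sketch of how such an exploration strategy is typically used at action-selection time, assuming a `get_action` method (illustrative, not confirmed by the source) that anneals the noise std linearly from `max_sigma` to `min_sigma` over `decay_period` steps:

import numpy as np

def get_action(self, t, observation, policy, **kwargs):
    action, agent_info = policy.get_action(observation)
    # Linearly anneal the exploration std over the decay period.
    sigma = self._max_sigma - (self._max_sigma - self._min_sigma) * \
        min(1.0, t * 1.0 / self._decay_period)
    noisy = action + np.random.normal(size=len(action)) * sigma
    # Keep the perturbed action inside the Box bounds.
    return np.clip(noisy, self._action_space.low, self._action_space.high)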
Example #21
    def __init__(self, *inputs, name, hidden_layer_sizes):
        Parameterized.__init__(self)
        Serializable.quick_init(self, locals())

        self._name = name
        self._inputs = inputs
        self._layer_sizes = list(hidden_layer_sizes) + [1]

        self._output = self._output_for(*self._inputs)
Example #22
 def __init__(self,
              env,
              action_delay=3,
              ):
     assert action_delay > 0, "Should not use this env transformer"
     super(DelayedActionEnv, self).__init__(env)
     Serializable.quick_init(self, locals())
     self.action_delay = action_delay
     self._queued_actions = None
Example #23
 def __init__(self, max_opt_itr=20, batch_size=32, cg_batch_size=100, callback=None):
     Serializable.quick_init(self, locals())
     self._max_opt_itr = max_opt_itr
     self._opt_fun = None
     self._target = None
     self._batch_size = batch_size
     self._cg_batch_size = cg_batch_size
     self._hf_optimizer = None
     self._callback = callback
Example #24
 def __init__(
         self,
         alive_coeff=1,
         ctrl_cost_coeff=0.01,
         *args, **kwargs):
     self.alive_coeff = alive_coeff
     self.ctrl_cost_coeff = ctrl_cost_coeff
     super(HopperEnv, self).__init__(*args, **kwargs)
     Serializable.quick_init(self, locals())
Example #25
    def __init__(self, desc_str='4x4', max_traj_length=10, goal_reward=10.0):
        Serializable.quick_init(self, locals())
        self.desc_str = desc_str # Map will be loaded in `self.reset`
        self.max_traj_length = max_traj_length

        self.n_row, self.n_col = np.array(list(map(list, self._fetch_map()))).shape

        self.state = None
        self.goal_reward = goal_reward
Example #26
    def __init__(
            self,
            env_spec,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=tf.nn.relu,
            action_merge_layer=-2,
            output_nonlinearity=None,
            bn=False):
        Serializable.quick_init(self, locals())

        l_obs = L.InputLayer(shape=(None, env_spec.observation_space.flat_dim), name="obs")
        l_action = L.InputLayer(shape=(None, env_spec.action_space.flat_dim), name="actions")

        n_layers = len(hidden_sizes) + 1

        if n_layers > 1:
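            # Map a possibly negative merge index (e.g. -2) into [0, n_layers).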
            action_merge_layer = \
                (action_merge_layer % n_layers + n_layers) % n_layers
        else:
            action_merge_layer = 1

        l_hidden = l_obs

        for idx, size in enumerate(hidden_sizes):
            if bn:
                l_hidden = batch_norm(l_hidden)

            if idx == action_merge_layer:
                l_hidden = L.ConcatLayer([l_hidden, l_action])

            l_hidden = L.DenseLayer(
                l_hidden,
                num_units=size,
                nonlinearity=hidden_nonlinearity,
                name="h%d" % (idx + 1)
            )

        if action_merge_layer == n_layers:
            l_hidden = L.ConcatLayer([l_hidden, l_action])

        l_output = L.DenseLayer(
            l_hidden,
            num_units=1,
            nonlinearity=output_nonlinearity,
            name="output"
        )

        output_var = L.get_output(l_output, deterministic=True)

        self._f_qval = tensor_utils.compile_function([l_obs.input_var, l_action.input_var], output_var)
        self._output_layer = l_output
        self._obs_layer = l_obs
        self._action_layer = l_action
        self._output_nonlinearity = output_nonlinearity

        LayersPowered.__init__(self, [l_output])
Example #27
 def __init__(self, env_spec, mu=0, theta=0.15, sigma=0.3, **kwargs):
     assert isinstance(env_spec.action_space, Box)
     assert len(env_spec.action_space.shape) == 1
     Serializable.quick_init(self, locals())
     self.mu = mu
     self.theta = theta
     self.sigma = sigma
     self.action_space = env_spec.action_space
     self.state = np.ones(self.action_space.flat_dim) * self.mu
     self.reset()
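`mu`, `theta`, and `sigma` are the parameters of an Ornstein-Uhlenbeck process, and the state initialized above is typically evolved once per step. A sketch of that update, assuming a method like:

def evolve_state(self):
    # OU update: relax toward mu at rate theta, perturbed by Gaussian
    # noise scaled by sigma.
    x = self.state
    dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
    self.state = x + dx
    return self.state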
Example #28
 def __init__(self, desc='4x4'):
     Serializable.quick_init(self, locals())
     if isinstance(desc, str):
         desc = MAPS[desc]
     self.desc = desc = np.array(list(map(list, desc)))
     self.n_row, self.n_col = desc.shape
     (start_x,), (start_y,) = np.nonzero(desc == 'S')
     self.start_state = start_x * self.n_col + start_y
     self.state = None
     self.domain_fig = None
Example #29
 def __init__(
         self,
         optimizer=None,
         optimizer_args=None,
         **kwargs):
     Serializable.quick_init(self, locals())
     if optimizer is None:
         if optimizer_args is None:
             optimizer_args = dict()
         optimizer = ConjugateGradientOptimizer(**optimizer_args)
     super(TRPO, self).__init__(optimizer=optimizer, **kwargs)
Example #30
 def __init__(
         self,
         observation_space,
         action_space):
     """
     :type observation_space: Space
     :type action_space: Space
     """
     Serializable.quick_init(self, locals())
     self._observation_space = observation_space
     self._action_space = action_space
Example #31
    def __init__(
        self,
        base_kwargs,
        env,
        policy,
        initial_exploration_policy,
        qf1,
        qf2,
        vf,
        pool,
        plotter=None,
        lr=3e-3,
        scale_reward=1,
        discount=0.99,
        tau=0.01,
        target_update_interval=1,
        action_prior='uniform',
        reparameterize=False,
        save_full_state=False,
    ):
        """
        Args:
            base_kwargs (dict): dictionary of base arguments that are directly
                passed to the base `RLAlgorithm` constructor.

            env (`rllab.Env`): rllab environment object.
            policy (`rllab.NNPolicy`): A policy function approximator.
            initial_exploration_policy (`Policy`): A policy used for initial
                exploration that is not trained by the algorithm.

            qf1 (`valuefunction`): First Q-function approximator.
            qf2 (`valuefunction`): Second Q-function approximator. Usage of two
                Q-functions improves performance by reducing overestimation
                bias.
            vf (`ValueFunction`): Soft value function approximator.

            pool (`PoolBase`): Replay buffer to add gathered samples to.
            plotter (`QFPolicyPlotter`): Plotter instance to be used for
                visualizing Q-function during training.

            lr (`float`): Learning rate used for the function approximators.
            discount (`float`): Discount factor for Q-function updates.
            tau (`float`): Soft value function target update weight.
            target_update_interval ('int'): Frequency at which target network
                updates occur in iterations.

            reparameterize ('bool'): If True, we use a gradient estimator for
                the policy derived using the reparameterization trick. We use
                a likelihood ratio based estimator otherwise.
            save_full_state (`bool`): If True, save the full class in the
                snapshot. See `self.get_snapshot` for more information.
        """

        Serializable.quick_init(self, locals())
        super(SAC, self).__init__(**base_kwargs)

        self._env = env
        self._policy = policy
        self._initial_exploration_policy = initial_exploration_policy
        self._qf1 = qf1
        self._qf2 = qf2
        self._vf = vf
        self._pool = pool
        self._plotter = plotter

        self._policy_lr = lr
        self._qf_lr = lr
        self._vf_lr = lr
        self._scale_reward = scale_reward
        self._discount = discount
        self._tau = tau
        self._target_update_interval = target_update_interval
        self._action_prior = action_prior

        # The reparameterize flag must match between the algorithm and the
        # policy that actions are sampled from.
        assert reparameterize == self._policy._reparameterize
        self._reparameterize = reparameterize

        self._save_full_state = save_full_state

        self._Da = self._env.action_space.flat_dim
        self._Do = self._env.observation_space.flat_dim

        self._training_ops = list()

        self._init_placeholders()
        self._init_actor_update()
        self._init_critic_update()
        self._init_target_ops()

        # Initialize all uninitialized variables. This prevents initializing
        # pre-trained policy and qf and vf variables.
        uninit_vars = []
        for var in tf.global_variables():
            try:
                self._sess.run(var)
            except tf.errors.FailedPreconditionError:
                uninit_vars.append(var)
        self._sess.run(tf.variables_initializer(uninit_vars))
Example #32
 def __init__(self, max_opt_itr=20, callback=None):
     Serializable.quick_init(self, locals())
     self._max_opt_itr = max_opt_itr
     self._opt_fun = None
     self._target = None
     self._callback = callback
Example #33
    def __init__(
        self,
        env_spec,
        policy,
        recurrent=False,
        predict_all=True,
        obs_regressed='all',
        act_regressed='all',
        use_only_sign=False,
        noisify_traj_coef=0,
        optimizer=None,  # defaults to LBFGS
        regressor_args=None,  # all args passed straight to the regressor: hidden_sizes, TR, step_size, ...
    ):
        """
        :param predict_all: only used in the recurrent case; whether to use all hidden states as predictions
        :param obs_regressed: list of indices of the obs variables used to fit the regressor, or the string 'all' (default)
        :param act_regressed: list of indices of the act variables used to fit the regressor, or the string 'all' (default)
        :param regressor_args:
        """
        self.env_spec = env_spec
        self.policy = policy
        self.latent_dim = policy.latent_dim
        self.recurrent = recurrent
        self.predict_all = predict_all
        self.use_only_sign = use_only_sign
        self.noisify_traj_coef = noisify_traj_coef
        self.regressor_args = regressor_args
        # decide what obs variables will be regressed upon
        if obs_regressed == 'all':
            self.obs_regressed = list(
                range(env_spec.observation_space.flat_dim))
        else:
            self.obs_regressed = obs_regressed
        # decide what action variables will be regressed upon
        if act_regressed == 'all':
            self.act_regressed = list(range(env_spec.action_space.flat_dim))
        else:
            self.act_regressed = act_regressed
        # shape the input dimension of the NN for the above decisions.
        self.obs_act_dim = len(self.obs_regressed) + len(self.act_regressed)

        Serializable.quick_init(self, locals())  # ??

        if regressor_args is None:
            regressor_args = dict()

        if optimizer == 'first_order':
            self.optimizer = FirstOrderOptimizer(
                max_epochs=10,  # both of these are to match Rocky's 10
                batch_size=128,
            )
        elif optimizer is None:
            self.optimizer = None
        else:
            raise NotImplementedError

        if policy.latent_name == 'bernoulli':
            if self.recurrent:
                self._regressor = BernoulliRecurrentRegressor(
                    input_shape=(self.obs_act_dim, ),
                    output_dim=policy.latent_dim,
                    optimizer=self.optimizer,
                    predict_all=self.predict_all,
                    **regressor_args)
            else:
                self._regressor = BernoulliMLPRegressor(
                    input_shape=(self.obs_act_dim, ),
                    output_dim=policy.latent_dim,
                    optimizer=self.optimizer,
                    **regressor_args)
        elif policy.latent_name == 'categorical':
            if self.recurrent:
                self._regressor = CategoricalRecurrentRegressor(  # not implemented
                    input_shape=(self.obs_act_dim, ),
                    output_dim=policy.latent_dim,
                    optimizer=self.optimizer,
                    # predict_all=self.predict_all,
                    **regressor_args)
            else:
                self._regressor = CategoricalMLPRegressor(
                    input_shape=(self.obs_act_dim, ),
                    output_dim=policy.latent_dim,
                    optimizer=self.optimizer,
                    **regressor_args)
        elif policy.latent_name == 'normal':
            self._regressor = GaussianMLPRegressor(
                input_shape=(self.obs_act_dim, ),
                output_dim=policy.latent_dim,
                optimizer=self.optimizer,
                **regressor_args)
        else:
            raise NotImplementedError
Example #34
    def __init__(
            self,
            base_kwargs,
            env,
            policy,
            qf,
            vf,
            pool,
            plotter=None,
            lr=3e-3,
            scale_reward=1,
            discount=0.99,
            tau=0.01,
            target_update_interval=1,
            action_prior='uniform',
            save_full_state=False,
            #my
            entropy_coeff=1.,
            dynamic_coeff=False,
            clip_norm=None,
            resolution=25,
            test_N=1000,  # the number of action samples to estimate Q variance
    ):
        """
        Args:
            base_kwargs (dict): dictionary of base arguments that are directly
                passed to the base `RLAlgorithm` constructor.

            env (`rllab.Env`): rllab environment object.
            policy (`rllab.NNPolicy`): A policy function approximator.
            qf (`ValueFunction`): Q-function approximator.
            vf (`ValueFunction`): Soft value function approximator.
            pool (`PoolBase`): Replay buffer to add gathered samples to.
            plotter (`QFPolicyPlotter`): Plotter instance to be used for
                visualizing Q-function during training.

            lr (`float`): Learning rate used for the function approximators.
            scale_reward (`float`): Scaling factor for raw reward.
            discount (`float`): Discount factor for Q-function updates.
            tau (`float`): Soft value function target update weight.

            save_full_state (`bool`): If True, save the full class in the
                snapshot. See `self.get_snapshot` for more information.
        """

        Serializable.quick_init(self, locals())
        super(SAC, self).__init__(**base_kwargs)

        self._env = env
        self._policy = policy
        self._qf = qf
        self._vf = vf
        self._pool = pool
        self._plotter = plotter

        self._policy_lr = lr
        self._qf_lr = lr
        self._vf_lr = lr
        self._scale_reward = scale_reward
        self._discount = discount
        self._tau = tau
        self._target_update_interval = target_update_interval
        self._action_prior = action_prior

        self._save_full_state = save_full_state

        self._Da = self._env.spec.action_space.flat_dim
        self._Do = self._env.spec.observation_space.flat_dim

        self._training_ops = list()

        # my
        self._loss_ops = []
        self._ec = tf.Variable(entropy_coeff, name='entropy_coeff')
        self.dynamic_ec = dynamic_coeff
        self.clip_norm = clip_norm

        self._init_placeholders()
        self._init_actor_update()
        self._init_critic_update()
        self._init_target_ops()

        # my
        self.resolution = resolution
        self.test_N = test_N
        if self._env.observation_space.flat_dim <= 2:
            self._init_state_importance()

        # Initialize all uninitialized variables. This prevents initializing
        # pre-trained policy and qf and vf variables.
        uninit_vars = []
        for var in tf.global_variables():
            try:
                self._sess.run(var)
            except tf.errors.FailedPreconditionError:
                uninit_vars.append(var)
        self._sess.run(tf.variables_initializer(uninit_vars))

        # my
        self._saver = tf.train.Saver()
Example #35
    def __init__(
            self,
            env_spec,
            hidden_sizes=(32, 32),
            learn_std=True,
            init_std=1.0,
            adaptive_std=False,
            std_share_network=False,
            std_hidden_sizes=(32, 32),
            min_std=1e-6,
            std_hidden_nonlinearity=NL.tanh,
            hidden_nonlinearity=NL.tanh,
            output_nonlinearity=None,
            mean_network=None,
            std_network=None,
            dist_cls=DiagonalGaussian,
            output_gain=1,
    ):
        """
        :param env_spec:
        :param hidden_sizes: list of sizes for the fully-connected hidden layers
        :param learn_std: Is std trainable
        :param init_std: Initial std
        :param adaptive_std:
        :param std_share_network:
        :param std_hidden_sizes: list of sizes for the fully-connected layers for std
        :param min_std: minimum threshold for the std, to avoid numerical issues
        :param std_hidden_nonlinearity:
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param output_nonlinearity: nonlinearity for the output layer
        :param mean_network: custom network for the output mean
        :param std_network: custom network for the output log std
        :return:
        """
        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)

        obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim

        # create network
        if mean_network is None:
            mean_network = MLP(
                input_shape=(obs_dim,),
                output_dim=action_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=output_nonlinearity,
                output_W_init=LI.GlorotUniform(gain=output_gain)
            )
        self._mean_network = mean_network

        l_mean = mean_network.output_layer
        obs_var = mean_network.input_layer.input_var

        if std_network is not None:
            l_log_std = std_network.output_layer
        else:
            if adaptive_std:
                std_network = MLP(
                    input_shape=(obs_dim,),
                    input_layer=mean_network.input_layer,
                    output_dim=action_dim,
                    hidden_sizes=std_hidden_sizes,
                    hidden_nonlinearity=std_hidden_nonlinearity,
                    output_nonlinearity=None,
                )
                l_log_std = std_network.output_layer
            else:
                l_log_std = ParamLayer(
                    mean_network.input_layer,
                    num_units=action_dim,
                    param=lasagne.init.Constant(np.log(init_std)),
                    name="output_log_std",
                    trainable=learn_std,
                )

        self.min_std = min_std
        self._set_std_to_0 = False

        mean_var, log_std_var = L.get_output([l_mean, l_log_std])

        if self.min_std is not None:
            log_std_var = TT.maximum(log_std_var, np.log(min_std))

        self._mean_var, self._log_std_var = mean_var, log_std_var

        self._l_mean = l_mean
        self._l_log_std = l_log_std

        self._dist = dist_cls(action_dim)

        LasagnePowered.__init__(self, [l_mean, l_log_std])
        super(GaussianMLPPolicy, self).__init__(env_spec)

        self._f_dist = ext.compile_function(
            inputs=[obs_var],
            outputs=[mean_var, log_std_var],
        )
Example #36
    def __init__(self, env_params, sumo_params, scenario):
        # Invoke serializable if using rllab

        if serializable_flag:
            Serializable.quick_init(self, locals())

        self.env_params = env_params
        self.scenario = scenario
        self.sumo_params = sumo_params
        time_stamp = ''.join(str(time.time()).split('.'))
        if os.environ.get("TEST_FLAG", 0):
            # 1.0 works with stress_test_start 10k times
            time.sleep(1.0 * int(time_stamp[-6:]) / 1e6)
        self.sumo_params.port = sumolib.miscutils.getFreeSocketPort()
        self.vehicles = scenario.vehicles
        self.traffic_lights = scenario.traffic_lights
        # time_counter: number of steps taken since the start of a rollout
        self.time_counter = 0
        # step_counter: number of total steps taken
        self.step_counter = 0
        # initial_state:
        #   Key = Vehicle ID,
        #   Entry = (type_id, route_id, lane_index, lane_pos, speed, pos)
        self.initial_state = {}
        self.state = None
        self.obs_var_labels = []

        # simulation step size
        self.sim_step = sumo_params.sim_step

        self.vehicle_arrangement_shuffle = \
            env_params.vehicle_arrangement_shuffle
        self.starting_position_shuffle = env_params.starting_position_shuffle

        # the available_routes variable contains a dictionary of routes
        # vehicles can traverse; to be used when routes need to be chosen
        # dynamically
        self.available_routes = self.scenario.rts

        # TraCI connection used to communicate with sumo
        self.traci_connection = None

        # dictionary of initial observations used while resetting vehicles
        # after each rollout
        self.initial_observations = dict.fromkeys(self.vehicles.get_ids())

        # store the initial vehicle ids
        self.initial_ids = deepcopy(self.vehicles.get_ids())

        # store the initial state of the vehicles class (for restarting sumo)
        self.initial_vehicles = deepcopy(self.vehicles)

        # colors used to distinguish between types of vehicles in the network
        self.colors = {}

        # contains the subprocess.Popen instance used to start traci
        self.sumo_proc = None

        self.start_sumo()
        self.setup_initial_state()

        # use pyglet to render the simulation
        if self.sumo_params.render in ['gray', 'dgray', 'rgb', 'drgb']:
            save_render = self.sumo_params.save_render
            sight_radius = self.sumo_params.sight_radius
            pxpm = self.sumo_params.pxpm
            show_radius = self.sumo_params.show_radius

            # get network polygons
            network = []
            for lane_id in self.traci_connection.lane.getIDList():
                _lane_poly = self.traci_connection.lane.getShape(lane_id)
                lane_poly = [i for pt in _lane_poly for i in pt]
                network.append(lane_poly)

            # instantiate a pyglet renderer
            self.renderer = Renderer(network,
                                     self.sumo_params.render,
                                     save_render,
                                     sight_radius=sight_radius,
                                     pxpm=pxpm,
                                     show_radius=show_radius)

            # render a frame
            self.render(reset=True)
        elif self.sumo_params.render in [True, False]:
            pass  # default to sumo-gui (if True) or sumo (if False)
        else:
            raise ValueError("Mode %s is not supported!" %
                             self.sumo_params.render)
Example #37
    def __init__(
            self,
            input_shape,
            output_dim,
            prob_network=None,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=NL.rectify,
            optimizer=None,
            use_trust_region=True,
            step_size=0.01,
            normalize_inputs=True,
            name=None,
    ):
        """
        :param input_shape: Shape of the input data.
        :param output_dim: Dimension of output.
        :param hidden_sizes: Number of hidden units of each layer of the mean network.
        :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
        :param optimizer: Optimizer for minimizing the negative log-likelihood.
        :param use_trust_region: Whether to use trust region constraint.
        :param step_size: KL divergence constraint for each iteration
        """
        Serializable.quick_init(self, locals())

        if optimizer is None:
            if use_trust_region:
                optimizer = PenaltyLbfgsOptimizer()
            else:
                optimizer = LbfgsOptimizer()

        self.output_dim = output_dim
        self._optimizer = optimizer

        if prob_network is None:
            prob_network = MLP(
                input_shape=input_shape,
                output_dim=output_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.softmax,
            )

        l_prob = prob_network.output_layer

        LasagnePowered.__init__(self, [l_prob])

        xs_var = prob_network.input_layer.input_var
        ys_var = TT.imatrix("ys")
        old_prob_var = TT.matrix("old_prob")

        x_mean_var = theano.shared(
            np.zeros((1,) + input_shape),
            name="x_mean",
            broadcastable=(True,) + (False, ) * len(input_shape)
        )
        x_std_var = theano.shared(
            np.ones((1,) + input_shape),
            name="x_std",
            broadcastable=(True,) + (False, ) * len(input_shape)
        )

        normalized_xs_var = (xs_var - x_mean_var) / x_std_var

        prob_var = L.get_output(l_prob, {prob_network.input_layer: normalized_xs_var})

        old_info_vars = dict(prob=old_prob_var)
        info_vars = dict(prob=prob_var)

        dist = self._dist = Categorical()

        mean_kl = TT.mean(dist.kl_sym(old_info_vars, info_vars))

        loss = - TT.mean(dist.log_likelihood_sym(ys_var, info_vars))

        predicted = special.to_onehot_sym(TT.argmax(prob_var, axis=1), output_dim)

        self._f_predict = ext.compile_function([xs_var], predicted)
        self._f_prob = ext.compile_function([xs_var], prob_var)
        self._l_prob = l_prob

        optimizer_args = dict(
            loss=loss,
            target=self,
            network_outputs=[prob_var],
        )

        if use_trust_region:
            optimizer_args["leq_constraint"] = (mean_kl, step_size)
            optimizer_args["inputs"] = [xs_var, ys_var, old_prob_var]
        else:
            optimizer_args["inputs"] = [xs_var, ys_var]

        self._optimizer.update_opt(**optimizer_args)

        self._use_trust_region = use_trust_region
        self._name = name

        self._normalize_inputs = normalize_inputs
        self._x_mean_var = x_mean_var
        self._x_std_var = x_std_var
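When `normalize_inputs` is set, the shared `x_mean` and `x_std` variables above are typically refreshed from the training data at fit time. A sketch of that update (hypothetical helper name):

import numpy as np

def _update_normalization(self, xs):
    if self._normalize_inputs:
        # Recompute normalization statistics from the latest inputs;
        # the small constant keeps the std strictly positive.
        self._x_mean_var.set_value(np.mean(xs, axis=0, keepdims=True))
        self._x_std_var.set_value(np.std(xs, axis=0, keepdims=True) + 1e-8)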
Example #38
    def __init__(
            self,
            env_spec,
            hidden_sizes=(32,),
            state_include_action=True,
            hidden_nonlinearity=NL.tanh,
            learn_std=True,
            init_std=1.0,
            output_nonlinearity=None,
            trunc_steps=20,
    ):
        """
        :param env_spec: A spec for the env.
        :param hidden_sizes: list of sizes for the fully connected hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :return:
        """
        Serializable.quick_init(self, locals())
        super(GaussianGRUPolicy, self).__init__(env_spec)

        assert len(hidden_sizes) == 1

        if state_include_action:
            obs_dim = env_spec.observation_space.flat_dim + env_spec.action_space.flat_dim
        else:
            obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim

        mean_network = GRUNetwork(
            input_shape=(obs_dim,),
            output_dim=action_dim,
            hidden_dim=hidden_sizes[0],
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
            trunc_steps=trunc_steps
        )

        l_mean = mean_network.output_layer
        obs_var = mean_network.input_var

        l_log_std = ParamLayer(
            mean_network.input_layer,
            num_units=action_dim,
            param=lasagne.init.Constant(np.log(init_std)),
            name="output_log_std",
            trainable=learn_std,
        )

        l_step_log_std = ParamLayer(
            mean_network.step_input_layer,
            num_units=action_dim,
            param=l_log_std.param,
            name="step_output_log_std",
            trainable=learn_std,
        )

        self._mean_network = mean_network
        self._l_log_std = l_log_std
        self._state_include_action = state_include_action

        self._f_step_mean_std = ext.compile_function(
            [
                mean_network.step_input_layer.input_var,
                mean_network.step_prev_hidden_layer.input_var
            ],
            L.get_output([
                mean_network.step_output_layer,
                l_step_log_std,
                mean_network.step_hidden_layer
            ])
        )

        self._prev_action = None
        self._prev_hidden = None
        self._hidden_sizes = hidden_sizes
        self._dist = RecurrentDiagonalGaussian(action_dim)

        self.reset()

        LasagnePowered.__init__(self, [mean_network.output_layer, l_log_std])
Example #39
    def __init__(
        self,
        name,
        env_spec,
        hidden_sizes=(32, 32),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        min_std=1e-6,
        max_std=1000.0,
        std_modifier=1.0,
        std_hidden_nonlinearity=tf.nn.tanh,
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=tf.identity,
        mean_network=None,
        std_network=None,
        std_parametrization='exp',
        grad_step_size=1.0,
        stop_grad=False,
        extra_input_dim=0,
        # metalearn_baseline=False,
    ):
        """
        :param env_spec:
        :param hidden_sizes: list of sizes for the fully-connected hidden layers
        :param learn_std: Is std trainable
        :param init_std: Initial std
        :param adaptive_std:
        :param std_share_network:
        :param std_hidden_sizes: list of sizes for the fully-connected layers for std
        :param min_std: minimum threshold for the std, to avoid numerical issues
        :param std_hidden_nonlinearity:
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param output_nonlinearity: nonlinearity for the output layer
        :param mean_network: custom network for the output mean
        :param std_network: custom network for the output log std
        :param std_parametrization: how the std should be parametrized. There are a few options:
            - exp: the logarithm of the std will be stored, and an exponential transformation applied
            - softplus: the std will be computed as log(1+exp(x))
        :param grad_step_size: the step size taken in the learner's gradient update; sampled uniformly if it is a range, e.g. [0.1, 1]
        :param stop_grad: whether to stop the gradient flowing through the inner gradient update.
        :return:
        """
        Serializable.quick_init(self, locals())
        #assert isinstance(env_spec.action_space, Box)

        obs_dim = env_spec.observation_space.flat_dim
        self.action_dim = env_spec.action_space.flat_dim
        self.n_hidden = len(hidden_sizes)
        self.hidden_nonlinearity = hidden_nonlinearity
        self.output_nonlinearity = output_nonlinearity
        self.input_shape = (
            None,
            obs_dim + extra_input_dim,
        )
        self.step_size = grad_step_size
        self.stop_grad = stop_grad
        # self.metalearn_baseline = metalearn_baseline
        if type(self.step_size) == list:
            raise NotImplementedError("removing this since it didn't work well")

        # create network
        if mean_network is None:
            self.all_params = self.create_MLP(  # TODO: this should not be a method of the policy! --> helper
                name="mean_network",
                output_dim=self.action_dim,
                hidden_sizes=hidden_sizes,
            )
            self.input_tensor, _ = self.forward_MLP(
                'mean_network',
                self.all_params,
                reuse=None  # Need to run this for batch norm
            )
            forward_mean = lambda x, params, is_train: self.forward_MLP(
                'mean_network',
                all_params=params,
                input_tensor=x,
                is_training=is_train)[1]
        else:
            raise NotImplementedError('Not supported.')

        if std_network is not None:
            raise NotImplementedError('Not supported.')
        else:
            if adaptive_std:
                raise NotImplementedError('Not supported.')
            else:
                if std_parametrization == 'exp':
                    init_std_param = np.log(init_std)
                elif std_parametrization == 'softplus':
                    init_std_param = np.log(np.exp(init_std) - 1)
                else:
                    raise NotImplementedError
                self.all_params['std_param'] = make_param_layer(
                    num_units=self.action_dim,
                    param=tf.constant_initializer(init_std_param),
                    name="output_std_param",
                    trainable=learn_std,
                )
                forward_std = lambda x, params: forward_param_layer(
                    x, params['std_param'])
            self.all_param_vals = None

            # unify forward mean and forward std into a single function
            self._forward = lambda obs, params, is_train: (forward_mean(
                obs, params, is_train), forward_std(obs, params))

            self.std_parametrization = std_parametrization

            if std_parametrization == 'exp':
                min_std_param = np.log(min_std)
                max_std_param = np.log(max_std)
            elif std_parametrization == 'softplus':
                min_std_param = np.log(np.exp(min_std) - 1)
                max_std_param = np.log(np.exp(max_std) - 1)
            else:
                raise NotImplementedError

            self.min_std_param = min_std_param  # TODO: change these to min_std_param_raw
            self.max_std_param = max_std_param
            self.std_modifier = np.float64(std_modifier)
            # print(self.std_modifier)
            # self.std_modifier = 0.00001 #np.float64(std_modifier)
            #print("initializing max_std debug4", self.min_std_param, self.max_std_param)

            self._dist = DiagonalGaussian(self.action_dim)

            self._cached_params = {}

            super(MAMLGaussianMLPPolicy, self).__init__(env_spec)

            dist_info_sym = self.dist_info_sym(self.input_tensor,
                                               dict(),
                                               is_training=False)
            mean_var = dist_info_sym["mean"]
            log_std_var = dist_info_sym["log_std"]

            # pre-update policy
            self._init_f_dist = tensor_utils.compile_function(
                inputs=[self.input_tensor],
                outputs=[mean_var, log_std_var],
            )
            self._cur_f_dist = self._init_f_dist
Example #40
 def __init__(self, ctrl_cost_coeff=1e-2, *args, **kwargs):
     self.ctrl_cost_coeff = ctrl_cost_coeff
     super(SwimmerEnv, self).__init__(*args, **kwargs)
     Serializable.quick_init(self, locals())
Example #41
    def __init__(
            self,
            name,
            env_spec,
            hidden_sizes=(32, 32),
            learn_std=True,
            init_std=1.0,
            adaptive_std=False,
            std_share_network=False,
            std_hidden_sizes=(32, 32),
            min_std=1e-6,
            std_hidden_nonlinearity=tf.nn.tanh,
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
            mean_network=None,
            std_network=None,
            std_parametrization='exp'
    ):
        """
        :param env_spec:
        :param hidden_sizes: list of sizes for the fully-connected hidden layers
        :param learn_std: Is std trainable
        :param init_std: Initial std
        :param adaptive_std:
        :param std_share_network:
        :param std_hidden_sizes: list of sizes for the fully-connected layers for std
        :param min_std: minimum threshold for the std, to avoid numerical issues
        :param std_hidden_nonlinearity:
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param output_nonlinearity: nonlinearity for the output layer
        :param mean_network: custom network for the output mean
        :param std_network: custom network for the output log std
        :param std_parametrization: how the std should be parametrized. There are a few options:
            - exp: the logarithm of the std will be stored, and an exponential transformation applied
            - softplus: the std will be computed as log(1+exp(x))
        :return:
        """
        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)

        with tf.variable_scope(name):

            obs_dim = env_spec.observation_space.flat_dim
            action_dim = env_spec.action_space.flat_dim

            # create network
            if mean_network is None:
                mean_network = MLP(
                    name="mean_network",
                    input_shape=(obs_dim*2,),
                    output_dim=action_dim,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=output_nonlinearity,
                )
            self._mean_network = mean_network

            l_mean = mean_network.output_layer
            obs_var = mean_network.input_layer.input_var

            if std_network is not None:
                l_std_param = std_network.output_layer
            else:
                if adaptive_std:
                    std_network = MLP(
                        name="std_network",
                        input_shape=(obs_dim*2,),
                        input_layer=mean_network.input_layer,
                        output_dim=action_dim,
                        hidden_sizes=std_hidden_sizes,
                        hidden_nonlinearity=std_hidden_nonlinearity,
                        output_nonlinearity=None,
                    )
                    l_std_param = std_network.output_layer
                else:
                    if std_parametrization == 'exp':
                        init_std_param = np.log(init_std)
                    elif std_parametrization == 'softplus':
                        init_std_param = np.log(np.exp(init_std) - 1)
                    else:
                        raise NotImplementedError
                    l_std_param = L.ParamLayer(
                        mean_network.input_layer,
                        num_units=action_dim,
                        param=tf.constant_initializer(init_std_param),
                        name="output_std_param",
                        trainable=learn_std,
                    )

            self.std_parametrization = std_parametrization

            if std_parametrization == 'exp':
                min_std_param = np.log(min_std)
            elif std_parametrization == 'softplus':
                min_std_param = np.log(np.exp(min_std) - 1)
            else:
                raise NotImplementedError

            self.min_std_param = min_std_param

            # mean_var, log_std_var = L.get_output([l_mean, l_std_param])
            #
            # if self.min_std_param is not None:
            #     log_std_var = tf.maximum(log_std_var, np.log(min_std))
            #
            # self._mean_var, self._log_std_var = mean_var, log_std_var

            self._l_mean = l_mean
            self._l_std_param = l_std_param

            self._dist = DiagonalGaussian(action_dim)

            LayersPowered.__init__(self, [l_mean, l_std_param])
            super(GaussianMLPInversePolicy, self).__init__(env_spec)

            dist_info_sym = self.dist_info_sym(mean_network.input_layer.input_var, dict())
            mean_var = dist_info_sym["mean"]
            log_std_var = dist_info_sym["log_std"]

            self._f_dist = tensor_utils.compile_function(
                inputs=[obs_var],
                outputs=[mean_var, log_std_var],
            )
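The two `std_parametrization` options above store the inverse transform of the desired std, so applying the forward transform recovers `init_std`; a quick numeric check:

import numpy as np

init_std = 1.0
exp_param = np.log(init_std)                    # forward: std = exp(param)
softplus_param = np.log(np.exp(init_std) - 1)   # forward: std = log(1 + exp(param))
assert np.isclose(np.exp(exp_param), init_std)
assert np.isclose(np.log(1 + np.exp(softplus_param)), init_std)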
Example #42
    def __init__(
            self,
            env_spec,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=NL.rectify,
            hidden_W_init=lasagne.init.HeUniform(),
            hidden_b_init=lasagne.init.Constant(0.),
            action_merge_layer=-2,
            output_nonlinearity=None,
            output_W_init=lasagne.init.Uniform(-3e-3, 3e-3),
            output_b_init=lasagne.init.Uniform(-3e-3, 3e-3),
            bn=False):
        Serializable.quick_init(self, locals())

        l_obs = L.InputLayer(shape=(None, env_spec.observation_space.flat_dim), name="obs")
        l_action = L.InputLayer(shape=(None, env_spec.action_space.flat_dim), name="actions")

        n_layers = len(hidden_sizes) + 1

        if n_layers > 1:
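            # Map a possibly negative merge index (e.g. -2) into [0, n_layers).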
            action_merge_layer = \
                (action_merge_layer % n_layers + n_layers) % n_layers
        else:
            action_merge_layer = 1

        l_hidden = l_obs

        for idx, size in enumerate(hidden_sizes):
            if bn:
                l_hidden = batch_norm(l_hidden)

            if idx == action_merge_layer:
                l_hidden = L.ConcatLayer([l_hidden, l_action])

            l_hidden = L.DenseLayer(
                l_hidden,
                num_units=size,
                W=hidden_W_init,
                b=hidden_b_init,
                nonlinearity=hidden_nonlinearity,
                name="h%d" % (idx + 1)
            )

        if action_merge_layer == n_layers:
            l_hidden = L.ConcatLayer([l_hidden, l_action])

        l_output = L.DenseLayer(
            l_hidden,
            num_units=1,
            W=output_W_init,
            b=output_b_init,
            nonlinearity=output_nonlinearity,
            name="output"
        )

        output_var = L.get_output(l_output, deterministic=True).flatten()

        self._f_qval = ext.compile_function([l_obs.input_var, l_action.input_var], output_var)
        self._output_layer = l_output
        self._obs_layer = l_obs
        self._action_layer = l_action
        self._output_nonlinearity = output_nonlinearity

        LasagnePowered.__init__(self, [l_output])
Example #43
    def __init__(
            self,
            base_kwargs,
            env,
            arr_actor,
            best_actor,
            dict_ph,
            arr_initial_exploration_policy,
            with_best=False,
            initial_beta_t=1,
            plotter=None,
            specific_type=0,

            target_noise_scale=0.2,
            target_noise_clip=0.5,
            target_ratio=2,
            target_range=0.04,
            lr=3e-3,
            discount=0.99,
            tau=0.01,
            policy_update_interval=2,
            best_update_interval=2,
            reparameterize=False,

            save_full_state=False,
    ):
        """
        Args:
            base_kwargs (dict): dictionary of base arguments that are directly
                passed to the base `RLAlgorithm` constructor.

            env (`rllab.Env`): rllab environment object. 
            policy (`rllab.NNPolicy`): A policy function approximator.
            initial_exploration_policy (`Policy`): A policy used for initial
                exploration that is not trained by the algorithm.

            qf1 (`valuefunction`): First Q-function approximator.
            qf2 (`valuefunction`): Second Q-function approximator. Usage of two
                Q-functions improves performance by reducing overestimation
                bias.
            vf (`ValueFunction`): Soft value function approximator.

            pool (`PoolBase`): Replay buffer to add gathered samples to.
            plotter (`QFPolicyPlotter`): Plotter instance to be used for
                visualizing Q-function during training.

            lr (`float`): Learning rate used for the function approximators.
            discount (`float`): Discount factor for Q-function updates.
            tau (`float`): Soft value function target update weight.
            target_update_interval ('int'): Frequency at which target network
                updates occur in iterations.

            reparameterize ('bool'): If True, we use a gradient estimator for
                the policy derived using the reparameterization trick. We use
                a likelihood ratio based estimator otherwise. 
            save_full_state (`bool`): If True, save the full class in the
                snapshot. See `self.get_snapshot` for more information.
        """

        Serializable.quick_init(self, locals())
        super(P3S_TD3, self).__init__(**base_kwargs)

        self._env = env
        self._max_actions = int(self._env.action_space.high[0])

        self._arr_actor = arr_actor
        self._best_actor = best_actor
        self._best_actor_num = -1
        self._num_iter_select_best = 1

        assert len(self._env.envs) == len(self._arr_actor)
        self._num_actor = len(self._arr_actor)
        self._n_train_repeat = self._num_actor
        self._dict_ph = dict_ph

        self._arr_initial_exploration_policy = arr_initial_exploration_policy
        self._with_best = with_best
        self._best_flag = np.ones(self._num_actor)
        self._beta_t = initial_beta_t
        self._plotter = plotter

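        # TD3-style target policy smoothing: Gaussian noise with stddev
        # target_noise_scale is added to target actions and clipped to
        # [-target_noise_clip, target_noise_clip].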
        self._target_noise_scale = target_noise_scale
        self._target_noise_clip = target_noise_clip

        self._target_ratio = target_ratio
        self._target_range = target_range
        self._policy_lr = lr
        self._qf_lr = lr
        self._vf_lr = lr
        self._discount = discount
        self._tau = tau
        self._policy_update_interval = policy_update_interval
        self._best_update_interval = best_update_interval

        # The `reparameterize` flag must match between the algorithm and the
        # policy that actions are sampled from.

        self._save_full_state = save_full_state
        self._saver = tf.train.Saver(max_to_keep=1000)
        self._save_dir = '/home/wisrl/wyjung/Result/log/Mujoco/ant_delay20/test_IPE_TD3_NA4_TRatio2_Trange0.03_update1_ver3_new_201906/iter6/'
        # '/test_IPE_TD3_NA' + str(NUM_ACTORS) + '_TRatio' + str(TARGET_RATIO) + '_TRange' + str(
        #     TARGET_RANGE) + '_update' + str(UPDATE_BEST_ITER) + '_ver' + str(VERSION) + '_new_201906'
        self._save_iter_num = 40000

        self._Da = self._env.action_space.flat_dim
        self._Do = self._env.observation_space.flat_dim

        if self._best_actor is not None:
            self._init_critic_update(actor=self._best_actor)
            self._init_actor_update(actor=self._best_actor)
            self._init_target_ops(actor=self._best_actor)

        for actor in self._arr_actor:
            self._init_critic_update(actor=actor)
            self._init_actor_update(actor=actor)
            self._init_target_ops(actor=actor)
            self._init_update_old_new_ops(actor=actor)

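        # Initialize every TF variable except those named under
        # 'low_level_policy', so a pre-loaded low-level policy keeps its weights.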
        self._sess.run(tf.variables_initializer([
            variable for variable in tf.global_variables()
            if 'low_level_policy' not in variable.name
        ]))

        self._update_old_new()

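        # Hard-copy current weights into each actor's target network so both
        # start identical; the tau-weighted soft updates take over afterwards.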
        for actor in self._arr_actor:
            source_params = actor.current_params()
            target_params = actor.target_params()
            copy_ops = [
                tf.assign(target, source)
                for target, source in zip(target_params, source_params)
            ]

            self._sess.run(copy_ops)

        if self._best_actor is not None:
            source_params = self._best_actor.current_params()
            target_params = self._best_actor.target_params()
            copy_ops = [
                tf.assign(target, source)
                for target, source in zip(target_params, source_params)
            ]

            self._sess.run(copy_ops)

            for actor in self._arr_actor:
                source_params = self._best_actor.trainable_params()
                target_params = actor.trainable_params()

                copy_ops = [
                    tf.assign(target, source)
                    for target, source in zip(target_params, source_params)
                ]

                self._sess.run(copy_ops)

        print("Initialization is finished!")
    def __init__(
        self,
        name,
        env_spec,
        hidden_sizes=(32, 32),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        min_std=1e-6,
        std_hidden_nonlinearity=tf.nn.tanh,
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=tf.identity,
        mean_network=None,
        std_network=None,
        std_parametrization='exp',
        std_modifier=1.0,
        extra_input_dim=0,
    ):
        """
        :param env_spec:
        :param hidden_sizes: list of sizes for the fully-connected hidden layers
        :param learn_std: Is std trainable
        :param init_std: Initial std
        :param adaptive_std:
        :param std_share_network:
        :param std_hidden_sizes: list of sizes for the fully-connected layers for std
        :param min_std: minimum value for the std, used to avoid numerical issues
        :param std_hidden_nonlinearity:
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param output_nonlinearity: nonlinearity for the output layer
        :param mean_network: custom network for the output mean
        :param std_network: custom network for the output log std
        :param std_parametrization: how the std should be parametrized. There are a few options:
            - exp: the logarithm of the std is stored, and an exponential transformation is applied
            - softplus: the std is computed as log(1 + exp(x))
        :return:
        """
        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)

        obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim

        # create network
        if mean_network is None:
            self.mean_params = mean_params = self.create_MLP(
                name="mean_network",
                input_shape=(
                    None,
                    obs_dim + extra_input_dim,
                ),
                output_dim=action_dim,
                hidden_sizes=hidden_sizes,
            )
            self.input_tensor, mean_tensor = self.forward_MLP(
                'mean_network',
                mean_params,
                n_hidden=len(hidden_sizes),
                input_shape=(obs_dim, ),
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=output_nonlinearity,
                reuse=None  # Needed for batch norm
            )
            # if you want to input your own thing.
            self._forward_mean = lambda x, is_train: self.forward_MLP(
                'mean_network',
                mean_params,
                n_hidden=len(hidden_sizes),
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=output_nonlinearity,
                input_tensor=x,
                is_training=is_train)[1]
        else:
            raise NotImplementedError('Chelsea does not support this.')

        if std_network is not None:
            raise NotImplementedError(
                'Minimal Gaussian MLP does not support this.')
        else:
            if adaptive_std:
                # NOTE - this branch isn't tested
                raise NotImplementedError(
                    "Minimal Gaussian MLP doesn't have a tested version of this."
                )
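                # NOTE: the code below in this branch is unreachable because of
                # the raise above; it is kept for reference only.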
                self.std_params = std_params = self.create_MLP(
                    name="std_network",
                    input_shape=(
                        None,
                        obs_dim,
                    ),
                    output_dim=action_dim,
                    hidden_sizes=std_hidden_sizes,
                )
                # if you want to input your own thing.
                self._forward_std = lambda x: self.forward_MLP(
                    'std_network',
                    std_params,
                    n_hidden=len(hidden_sizes),
                    hidden_nonlinearity=std_hidden_nonlinearity,
                    output_nonlinearity=tf.identity,
                    input_tensor=x)[1]
            else:
                if std_parametrization == 'exp':
                    init_std_param = np.log(init_std)
                elif std_parametrization == 'softplus':
                    init_std_param = np.log(np.exp(init_std) - 1)
                else:
                    raise NotImplementedError
                self.std_params = make_param_layer(
                    num_units=action_dim,
                    param=tf.constant_initializer(init_std_param),
                    name="output_std_param",
                    trainable=learn_std,
                )
                self._forward_std = lambda x: forward_param_layer(
                    x, self.std_params)

        self.std_parametrization = std_parametrization

        if std_parametrization == 'exp':
            min_std_param = np.log(min_std)
        elif std_parametrization == 'softplus':
            min_std_param = np.log(np.exp(min_std) - 1)
        else:
            raise NotImplementedError

        self.min_std_param = min_std_param
        self.std_modifier = std_modifier

        self._dist = DiagonalGaussian(action_dim)

        self._cached_params = {}

        super(GaussianMLPPolicy, self).__init__(env_spec)

        dist_info_sym = self.dist_info_sym(self.input_tensor,
                                           dict(),
                                           is_training=False)
        mean_var = dist_info_sym["mean"]
        log_std_var = dist_info_sym["log_std"]

        self._init_f_dist = tensor_utils.compile_function(
            inputs=[self.input_tensor],
            outputs=[mean_var, log_std_var],
        )
        self._cur_f_dist = self._init_f_dist
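A short illustration of the two std parametrizations referenced above; the values are illustrative, and the transforms mirror the inverses computed in the constructor:

import numpy as np

init_std = 1.0
# 'exp': store log(std); std is recovered with np.exp
exp_param = np.log(init_std)
# 'softplus': store x such that std = log(1 + exp(x))
softplus_param = np.log(np.exp(init_std) - 1)

assert np.isclose(np.exp(exp_param), init_std)
assert np.isclose(np.log(1 + np.exp(softplus_param)), init_std)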
Example No. 45
    def __init__(
        self,
        env_spec,
        env,  # the inner (wrapped) environment
        pkl_path=None,  # for the entire hierarchical policy
        snn_pkl_path=None,
        snn_json_path=None,
        manager_pkl_path=None,  # default is to initialize a new manager from scratch
        min_period=1,
        max_period=10,  # possible periods
        latent_dim=6,
        bilinear_integration=True,
        trainable_snn=True,
        trainable_manager=True,
        continuous_latent=False,
        hidden_sizes_snn=(64, 64),
        hidden_sizes_selector=(32, 32)):
        StochasticPolicy.__init__(self, env_spec)
        self.env = env
        self.periods = np.arange(min_period, max_period + 1)
        assert len(self.periods) > 0
        self.curr_period = self.periods[0]
        self.max_period = max(self.periods)
        self.latent_dim = latent_dim  # dimensionality of the latent skill space
        self.bilinear_integration = bilinear_integration  # whether latents are integrated bilinearly with observations
        self.count = 0  # steps since the last latent skill was sampled
        self.curr_latent = None  # currently active latent skill
        self.outer_action_space = spaces.Discrete(latent_dim)
        self.trainable_manager = trainable_manager
        self.random_period = True
        self.fake_env = PeriodVaryingEnv(env)

        self.continuous_latent = continuous_latent
        self.trainable_snn = trainable_snn

        if pkl_path and '.npz' not in pkl_path:
            data = joblib.load(os.path.join(config.PROJECT_PATH, pkl_path))
            policy = data['policy']
            self.manager = policy.manager
            self.low_policy = policy.low_policy

            # following two lines used for random manager
            # outer_env_spec = EnvSpec(observation_space=self.env.observation_space, action_space=self.outer_action_space)
            # self.manager = CategoricalMLPPolicy(env_spec=outer_env_spec, latent_dim=latent_dim, )
        else:
            # env spec that includes the extra parameter for time
            self.low_policy = GaussianMLPPolicy_snn_hier(
                env_spec=self.fake_env.spec,
                env=self.fake_env,
                pkl_path=snn_pkl_path,
                json_path=snn_json_path,
                trainable_snn=trainable_snn,
                latent_dim=latent_dim,
                bilinear_integration=bilinear_integration,
                external_latent=True,
                hidden_sizes_snn=hidden_sizes_snn,
                hidden_sizes_selector=hidden_sizes_selector)

            # loading manager from pkl file
            if manager_pkl_path:
                manager_data = joblib.load(
                    os.path.join(config.PROJECT_PATH, manager_pkl_path))
                self.manager = manager_data['policy']
                print("loaded manager")
            else:
                # self.outer_env = hierarchize_snn(self.env, time_steps_agg=10, pkl_path=snn_pkl_path)
                if self.continuous_latent:
                    outer_env_spec = EnvSpec(
                        observation_space=self.fake_env.observation_space,
                        action_space=spaces.Box(-1.0,
                                                1.0,
                                                shape=(latent_dim, )))
                    self.manager = GaussianMLPPolicy(env_spec=outer_env_spec)
                else:
                    outer_env_spec = EnvSpec(
                        observation_space=self.fake_env.observation_space,
                        action_space=self.outer_action_space)
                    self.manager = CategoricalMLPPolicy(
                        env_spec=outer_env_spec,
                        latent_dim=latent_dim,
                    )
                if pkl_path is not None and '.npz' in pkl_path:
                    param_dict = dict(
                        np.load(os.path.join(config.PROJECT_PATH, pkl_path)))
                    param_values = param_dict['params']
                    self.set_param_values(param_values)

        if isinstance(env, MazeEnv) or isinstance(env, GatherEnv):
            self.obs_robot_dim = env.robot_observation_space.flat_dim
            self.obs_maze_dim = env.maze_observation_space.flat_dim
        elif isinstance(env, NormalizedEnv):
            if isinstance(env.wrapped_env, MazeEnv) or isinstance(
                    env.wrapped_env, GatherEnv):
                self.obs_robot_dim = env.wrapped_env.robot_observation_space.flat_dim
                self.obs_maze_dim = env.wrapped_env.maze_observation_space.flat_dim
            else:
                self.obs_robot_dim = env.wrapped_env.observation_space.flat_dim
                self.obs_maze_dim = 0
        else:
            self.obs_robot_dim = env.observation_space.flat_dim
            self.obs_maze_dim = 0
        # quick_init picks out only the constructor arguments from locals(),
        # so calling it at the end of __init__ is safe.
        Serializable.quick_init(self, locals())
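A hedged sketch of how such a manager/low-level pair is typically stepped. The resampling cadence is inferred from the `count`/`curr_period` fields above, and the `get_action` signatures are assumptions, since the rollout methods are not part of this listing:

# Inferred sketch only: resampling cadence and call signatures are assumptions.
def hierarchical_step(policy, obs):
    if policy.count % policy.curr_period == 0:
        # the manager picks a new latent skill at the start of each period
        policy.curr_latent, _ = policy.manager.get_action(obs)
    policy.count += 1
    # the low-level policy acts conditioned on the currently active latent
    return policy.low_policy.get_action(obs)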
Example No. 46
    def __init__(self, env_spec, endpoints, outside_value):
        Serializable.quick_init(self, locals())
        self._env_spec = env_spec
        self.schedule = schedules.PiecewiseSchedule(
            endpoints=endpoints, outside_value=outside_value)
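A hedged usage sketch, assuming `schedules.PiecewiseSchedule` follows the common interface: a list of (timestep, value) endpoints, linearly interpolated, with `outside_value` returned beyond the last endpoint:

# Assumed interface: endpoints are (t, value) pairs, linearly interpolated.
schedule = schedules.PiecewiseSchedule(
    endpoints=[(0, 1.0), (100000, 0.1)],  # anneal from 1.0 to 0.1 over 100k steps
    outside_value=0.1,                    # returned for t beyond 100k
)
epsilon = schedule.value(50000)  # -> 0.55 under linear interpolation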
Example No. 47
    def __init__(self,
                 env_spec,
                 hidden_dim=32,
                 feature_network=None,
                 state_include_action=True,
                 hidden_nonlinearity=NL.tanh):
        """
        :param env_spec: A spec for the env.
        :param hidden_dim: dimension of hidden layer
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :return:
        """
        assert isinstance(env_spec.action_space, Discrete)
        Serializable.quick_init(self, locals())
        super(CategoricalGRUPolicy, self).__init__(env_spec)

        obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim

        if state_include_action:
            input_dim = obs_dim + action_dim
        else:
            input_dim = obs_dim

        l_input = L.InputLayer(shape=(None, None, input_dim), name="input")

        if feature_network is None:
            feature_dim = input_dim
            l_flat_feature = None
            l_feature = l_input
        else:
            feature_dim = feature_network.output_layer.output_shape[-1]
            l_flat_feature = feature_network.output_layer
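            # Reshape flat (batch * time, feature_dim) features back into
            # (batch, time, feature_dim) so the recurrent network can consume them.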
            l_feature = OpLayer(
                l_flat_feature,
                extras=[l_input],
                name="reshape_feature",
                op=lambda flat_feature, input: TT.reshape(
                    flat_feature,
                    [input.shape[0], input.shape[1], feature_dim]),
                shape_op=lambda _, input_shape:
                (input_shape[0], input_shape[1], feature_dim))

        prob_network = GRUNetwork(input_shape=(feature_dim, ),
                                  input_layer=l_feature,
                                  output_dim=env_spec.action_space.n,
                                  hidden_dim=hidden_dim,
                                  hidden_nonlinearity=hidden_nonlinearity,
                                  output_nonlinearity=TT.nnet.softmax,
                                  name="prob_network")

        self.prob_network = prob_network
        self.feature_network = feature_network
        self.l_input = l_input
        self.state_include_action = state_include_action

        flat_input_var = TT.matrix("flat_input")
        if feature_network is None:
            feature_var = flat_input_var
        else:
            feature_var = L.get_output(
                l_flat_feature, {feature_network.input_layer: flat_input_var})

        self.f_step_prob = ext.compile_function(
            [flat_input_var, prob_network.step_prev_hidden_layer.input_var],
            L.get_output([
                prob_network.step_output_layer, prob_network.step_hidden_layer
            ], {prob_network.step_input_layer: feature_var}))

        self.input_dim = input_dim
        self.action_dim = action_dim
        self.hidden_dim = hidden_dim

        self.prev_action = None
        self.prev_hidden = None
        self.dist = RecurrentCategorical(env_spec.action_space.n)

        out_layers = [prob_network.output_layer]
        if feature_network is not None:
            out_layers.append(feature_network.output_layer)

        LasagnePowered.__init__(self, out_layers)
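A hedged sketch of how the compiled step function might be used during a rollout. Only `f_step_prob`'s inputs and outputs come from the constructor above; the surrounding bookkeeping is an assumption:

import numpy as np

# Sketch only: f_step_prob maps (flat_input, prev_hidden) to
# (action_probs, next_hidden), per the compile_function call above.
def step(policy, flat_obs, prev_hidden):
    probs, next_hidden = policy.f_step_prob(flat_obs, prev_hidden)
    actions = np.array([np.random.choice(len(p), p=p) for p in probs])
    return actions, next_hidden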