Ejemplo n.º 1
0
 def __init__(self, optimizer=None, optimizer_args=None, **kwargs):
     """Initialize PPO.

     When no optimizer is supplied, a ``PenaltyLbfgsOptimizer`` is built
     from ``optimizer_args`` (an empty dict when that is None too).
     """
     # Capture constructor locals first so serialization sees the
     # arguments exactly as passed in.
     Serializable.quick_init(self, locals())
     if optimizer is None:
         args = dict() if optimizer_args is None else optimizer_args
         optimizer = PenaltyLbfgsOptimizer(**args)
     super(PPO, self).__init__(optimizer=optimizer, **kwargs)
Ejemplo n.º 2
0
 def __init__(self,
              optimizer=None,
              optimizer_args=None,
              step_size=0.01,
              **kwargs):
     """Initialize NPO.

     :param optimizer: Optimizer instance; defaults to a
         ``PenaltyLbfgsOptimizer`` built from ``optimizer_args``.
     :param optimizer_args: Keyword args for the default optimizer.
     :param step_size: KL step size for the policy update.
     """
     if optimizer is None:
         args = dict() if optimizer_args is None else optimizer_args
         optimizer = PenaltyLbfgsOptimizer(**args)
     self.optimizer = optimizer
     self.step_size = step_size
     super(NPO, self).__init__(**kwargs)
Ejemplo n.º 3
0
 def __init__(self,
              optimizer=None,
              optimizer_args=None,
              step_size=0.01,
              **kwargs):
     """Initialize NPO (exploration variant).

     :param optimizer: Optimizer instance; defaults to a
         ``PenaltyLbfgsOptimizer`` built from ``optimizer_args``.
     :param optimizer_args: Keyword args for the default optimizer.
     :param step_size: KL step size for the policy update.
     """
     if optimizer is None:
         args = dict() if optimizer_args is None else optimizer_args
         optimizer = PenaltyLbfgsOptimizer(**args)
     self.optimizer = optimizer
     self.step_size = step_size
     super(NPO, self).__init__(**kwargs)
     print('NPO with exploration initialized')
Ejemplo n.º 4
0
 def __init__(self,
              optimizer=None,
              optimizer_args=None,
              step_size=0.01,
              truncate_local_is_ratio=None,
              **kwargs):
     """Initialize NPO.

     :param optimizer: Optimizer instance; defaults to a
         ``PenaltyLbfgsOptimizer`` built from ``optimizer_args``.
     :param optimizer_args: Keyword args for the default optimizer.
     :param step_size: KL step size for the policy update.
     :param truncate_local_is_ratio: Optional cap applied to the
         importance-sampling likelihood ratio; None disables truncation.
     """
     if optimizer is None:
         args = dict() if optimizer_args is None else optimizer_args
         optimizer = PenaltyLbfgsOptimizer(**args)
     self.optimizer = optimizer
     self.step_size = step_size
     self.truncate_local_is_ratio = truncate_local_is_ratio
     super(NPO, self).__init__(**kwargs)
Ejemplo n.º 5
0
 def __init__(self,
              optimizer=None,
              optimizer_args=None,
              step_size=0.01,
              agentNum=1,
              **kwargs):
     """Initialize CGM with one independent optimizer per agent.

     :param optimizer: Template optimizer; defaults to a
         ``PenaltyLbfgsOptimizer`` built from ``optimizer_args``.
     :param optimizer_args: Keyword args for the default optimizer.
     :param step_size: KL step size for the policy update.
     :param agentNum: Number of agents; each gets its own deep copy of
         the optimizer so their internal states never alias.
     """
     if optimizer is None:
         if optimizer_args is None:
             optimizer_args = dict()
         optimizer = PenaltyLbfgsOptimizer(**optimizer_args)
     self.agentNum = agentNum
     # BUGFIX: `xrange` is Python-2-only and raises NameError on
     # Python 3; `range` behaves identically here on both versions.
     self.optimizer = [copy.deepcopy(optimizer) for _ in range(agentNum)]
     self.step_size = step_size
     super(CGM, self).__init__(**kwargs)
Ejemplo n.º 6
0
    def _buildRLAlg(useCG, algDict, optimizerArgs=None):
        """Construct an NPO-based RL algorithm.

        :param useCG: When True use a conjugate-gradient optimizer
            (TRPO-style); otherwise use a penalized L-BFGS optimizer.
        :param algDict: Keyword arguments forwarded to the NPO constructor.
        :param optimizerArgs: Keyword args for the chosen optimizer;
            defaults to an empty dict.
        :return: The constructed NPO algorithm instance.
        """
        if optimizerArgs is None:
            optimizerArgs = dict()

        # NOTE(review): NPO's ctor also accepts step_size and
        # truncate_local_is_ratio; here we rely on its defaults and only
        # override the optimizer.
        if useCG:
            optimizer = ConjugateGradientOptimizer(**optimizerArgs)
        else:
            optimizer = PenaltyLbfgsOptimizer(**optimizerArgs)

        return NPO(optimizer=optimizer, **algDict)
Ejemplo n.º 7
0
    def __init__(
        self,
        input_shape,
        output_dim,
        mean_network=None,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=NL.rectify,
        optimizer=None,
        use_trust_region=True,
        step_size=0.01,
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        std_nonlinearity=None,
        normalize_inputs=True,
        normalize_outputs=True,
        name=None,
    ):
        """
        Gaussian MLP regressor: fits a diagonal-Gaussian conditional
        distribution over outputs by minimizing the negative log-likelihood,
        optionally subject to a KL trust-region constraint between fits.

        :param input_shape: Shape of the input data.
        :param output_dim: Dimension of output.
        :param mean_network: Custom network for the mean; a fresh MLP is built when None.
        :param hidden_sizes: Number of hidden units of each layer of the mean network.
        :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
        :param optimizer: Optimizer for minimizing the negative log-likelihood.
        :param use_trust_region: Whether to use trust region constraint.
        :param step_size: KL divergence constraint for each iteration
        :param learn_std: Whether to learn the standard deviations. Only effective if adaptive_std is False. If
        adaptive_std is True, this parameter is ignored, and the weights for the std network are always learned.
        :param init_std: Initial standard deviation (stored as a constant log-std layer).
        :param adaptive_std: Whether to make the std a function of the states.
        :param std_share_network: Whether to use the same network as the mean.
        :param std_hidden_sizes: Number of hidden units of each layer of the std network. Only used if
        `std_share_network` is False. It defaults to the same architecture as the mean.
        :param std_nonlinearity: Non-linearity used for each layer of the std network. Only used if `std_share_network`
        is False. It defaults to the same non-linearity as the mean.
        :param normalize_inputs: Whether to whiten inputs with shared mean/std variables.
        :param normalize_outputs: Whether to whiten targets with shared mean/std variables.
        :param name: Optional name for this regressor.
        """
        Serializable.quick_init(self, locals())

        # Default optimizer: penalized L-BFGS when a trust region is used,
        # plain L-BFGS otherwise.
        if optimizer is None:
            if use_trust_region:
                optimizer = PenaltyLbfgsOptimizer()
            else:
                optimizer = LbfgsOptimizer()

        self._optimizer = optimizer

        if mean_network is None:
            mean_network = MLP(
                input_shape=input_shape,
                output_dim=output_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=None,
            )

        l_mean = mean_network.output_layer

        # Log-std head: either a full MLP over the same inputs
        # (adaptive_std) or a single learned/constant parameter vector.
        if adaptive_std:
            l_log_std = MLP(
                input_shape=input_shape,
                input_var=mean_network.input_layer.input_var,
                output_dim=output_dim,
                hidden_sizes=std_hidden_sizes,
                hidden_nonlinearity=std_nonlinearity,
                output_nonlinearity=None,
            ).output_layer
        else:
            l_log_std = ParamLayer(
                mean_network.input_layer,
                num_units=output_dim,
                param=lasagne.init.Constant(np.log(init_std)),
                name="output_log_std",
                trainable=learn_std,
            )

        LasagnePowered.__init__(self, [l_mean, l_log_std])

        xs_var = mean_network.input_layer.input_var
        ys_var = TT.matrix("ys")
        old_means_var = TT.matrix("old_means")
        old_log_stds_var = TT.matrix("old_log_stds")

        # Running normalization statistics, stored as shared variables so
        # they can be updated outside the compiled graph. The leading axis
        # is broadcastable to apply one statistic across the whole batch.
        x_mean_var = theano.shared(np.zeros((1, ) + input_shape),
                                   name="x_mean",
                                   broadcastable=(True, ) +
                                   (False, ) * len(input_shape))
        x_std_var = theano.shared(np.ones((1, ) + input_shape),
                                  name="x_std",
                                  broadcastable=(True, ) +
                                  (False, ) * len(input_shape))
        y_mean_var = theano.shared(np.zeros((1, output_dim)),
                                   name="y_mean",
                                   broadcastable=(True, False))
        y_std_var = theano.shared(np.ones((1, output_dim)),
                                  name="y_std",
                                  broadcastable=(True, False))

        normalized_xs_var = (xs_var - x_mean_var) / x_std_var
        normalized_ys_var = (ys_var - y_mean_var) / y_std_var

        # The networks operate in normalized space; predictions are mapped
        # back to the original output scale below.
        normalized_means_var = L.get_output(
            l_mean, {mean_network.input_layer: normalized_xs_var})
        normalized_log_stds_var = L.get_output(
            l_log_std, {mean_network.input_layer: normalized_xs_var})

        means_var = normalized_means_var * y_std_var + y_mean_var
        log_stds_var = normalized_log_stds_var + TT.log(y_std_var)

        normalized_old_means_var = (old_means_var - y_mean_var) / y_std_var
        normalized_old_log_stds_var = old_log_stds_var - TT.log(y_std_var)

        dist = self._dist = DiagonalGaussian()

        normalized_dist_info_vars = dict(mean=normalized_means_var,
                                         log_std=normalized_log_stds_var)

        # KL between the previous fit and the current one, used as the
        # trust-region constraint.
        mean_kl = TT.mean(
            dist.kl_sym(
                dict(mean=normalized_old_means_var,
                     log_std=normalized_old_log_stds_var),
                normalized_dist_info_vars,
            ))

        # Objective: negative mean log-likelihood of the (normalized) data.
        loss = -TT.mean(
            dist.log_likelihood_sym(normalized_ys_var,
                                    normalized_dist_info_vars))

        self._f_predict = compile_function([xs_var], means_var)
        self._f_pdists = compile_function([xs_var], [means_var, log_stds_var])
        self._l_mean = l_mean
        self._l_log_std = l_log_std

        optimizer_args = dict(
            loss=loss,
            target=self,
            network_outputs=[normalized_means_var, normalized_log_stds_var],
        )

        # With a trust region, the optimizer also needs the old
        # distribution parameters to evaluate the KL constraint.
        if use_trust_region:
            optimizer_args["leq_constraint"] = (mean_kl, step_size)
            optimizer_args["inputs"] = [
                xs_var, ys_var, old_means_var, old_log_stds_var
            ]
        else:
            optimizer_args["inputs"] = [xs_var, ys_var]

        self._optimizer.update_opt(**optimizer_args)

        self._use_trust_region = use_trust_region
        self._name = name

        self._normalize_inputs = normalize_inputs
        self._normalize_outputs = normalize_outputs
        self._x_mean_var = x_mean_var
        self._x_std_var = x_std_var
        self._y_mean_var = y_mean_var
        self._y_std_var = y_std_var
    def __init__(
        self,
        input_shape,
        output_dim,
        predict_all=False,  # CF
        prob_network=None,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=NL.rectify,
        optimizer=None,
        use_trust_region=True,
        step_size=0.01,
        normalize_inputs=True,
        name=None,
    ):
        """
        Categorical recurrent (GRU) regressor over trajectories: fits a
        categorical distribution per time step, optionally constrained by a
        KL trust region between fits.

        :param input_shape: Shape of the input data.
        :param output_dim: Dimension of output.
        :param predict_all: When True use the prediction at every time step;
        otherwise the last step's prediction is tiled across all steps.
        :param prob_network: Custom probability network; a GRU network is built when None.
        :param hidden_sizes: Number of hidden units of each layer of the mean network.
        :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
        :param optimizer: Optimizer for minimizing the negative log-likelihood.
        :param use_trust_region: Whether to use trust region constraint.
        :param step_size: KL divergence constraint for each iteration
        :param normalize_inputs: Whether to whiten inputs with shared mean/std variables.
        :param name: Optional name for this regressor.
        """
        Serializable.quick_init(self, locals())

        # Default optimizer: penalized L-BFGS when a trust region is used,
        # plain L-BFGS otherwise.
        if optimizer is None:
            if use_trust_region:
                optimizer = PenaltyLbfgsOptimizer()
            else:
                optimizer = LbfgsOptimizer()

        self.output_dim = output_dim
        self._optimizer = optimizer

        if prob_network is None:
            prob_network = GRUNetwork(
                input_shape=input_shape,
                output_dim=output_dim,
                hidden_dim=hidden_sizes[0],  # this gives 32 by default
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.softmax,
            )

        l_prob = prob_network.output_layer

        LasagnePowered.__init__(self, [l_prob])

        # Inputs are 3D: (trajectory, time, feature); targets are one-hot
        # per step (itensor3) and old probabilities a tensor3.
        xs_var = prob_network.input_layer.input_var
        ys_var = TT.itensor3("ys")
        old_prob_var = TT.tensor3("old_prob")

        # Normalization statistics broadcast over both the trajectory and
        # time axes.
        x_mean_var = theano.shared(
            np.zeros(
                (
                    1,
                    1,
                ) + input_shape
            ),  # this syntax makes the shape (1,1,*input_shape,). The first is traj
            name="x_mean",
            broadcastable=(
                True,
                True,
            ) + (False, ) * len(input_shape))
        x_std_var = theano.shared(np.ones((
            1,
            1,
        ) + input_shape),
                                  name="x_std",
                                  broadcastable=(
                                      True,
                                      True,
                                  ) + (False, ) * len(input_shape))

        normalized_xs_var = (xs_var - x_mean_var) / x_std_var

        prob_var_all = L.get_output(
            l_prob, {prob_network.input_layer: normalized_xs_var})

        if predict_all:
            prob_var = prob_var_all
        else:
            # take only last dim but keep the shape
            prob_var_last = TT.reshape(
                prob_var_all[:, -1, :],
                (TT.shape(prob_var_all)[0], 1, TT.shape(prob_var_all)[2]))
            # padd along the time dimension to obtain the same shape as before
            padded_prob_var = TT.tile(prob_var_last,
                                      (1, TT.shape(prob_var_all)[1], 1))
            # give it the standard name
            prob_var = padded_prob_var

        old_info_vars = dict(prob=old_prob_var)
        info_vars = dict(prob=prob_var)

        dist = self._dist = Categorical(output_dim)

        # KL between previous and current fit (trust-region constraint).
        mean_kl = TT.mean(dist.kl_sym(old_info_vars, info_vars))

        # Objective: negative mean log-likelihood of the targets.
        loss = -TT.mean(dist.log_likelihood_sym(ys_var, info_vars))

        # Hard prediction: argmax per step, re-expanded to one-hot with the
        # original 3D shape.
        predicted_flat = special.to_onehot_sym(
            TT.flatten(TT.argmax(prob_var, axis=-1)), output_dim)
        predicted = TT.reshape(predicted_flat, TT.shape(prob_var))

        self._f_predict = ext.compile_function([xs_var], predicted)
        self._f_prob = ext.compile_function([xs_var], prob_var)
        self._prob_network = prob_network
        self._l_prob = l_prob

        optimizer_args = dict(
            loss=loss,
            target=self,
            network_outputs=[prob_var],
        )

        if use_trust_region:
            optimizer_args["leq_constraint"] = (mean_kl, step_size)
            optimizer_args["inputs"] = [xs_var, ys_var, old_prob_var]
        else:
            optimizer_args["inputs"] = [xs_var, ys_var]

        self._optimizer.update_opt(**optimizer_args)

        self._use_trust_region = use_trust_region
        self._name = name

        self._normalize_inputs = normalize_inputs
        self._x_mean_var = x_mean_var
        self._x_std_var = x_std_var
Ejemplo n.º 9
0
# Accumulators for per-experiment test-reward curves under each evaluation
# condition (constant, random, step, random-step, adversarial).
const_test_rew_summary = []
rand_test_rew_summary = []
step_test_rew_summary = []
rand_step_test_rew_summary = []
adv_test_rew_summary = []
# NOTE(review): env_name, adv_name, n_exps, n_itr, batch_size,
# adv_fraction, step_size and gae_lambda are defined earlier in this
# script (outside this excerpt) — confirm before reuse.
save_prefix = 'REINFORCE-BASELINE-env-{}_{}_Exp{}_Itr{}_BS{}_Adv{}_stp{}_lam{}'.format(
    env_name, adv_name, n_exps, n_itr, batch_size, adv_fraction, step_size,
    gae_lambda)
save_dir = './binu'
fig_dir = 'figs'
save_name = save_dir + '/' + save_prefix
fig_name = fig_dir + '/' + save_prefix + '.png'

# Shared penalized L-BFGS optimizer with default arguments.
optimizer_args = dict()
optimizer = PenaltyLbfgsOptimizer(**optimizer_args)

for ne in range(n_exps):
    ## Environment definition ##
    env = normalize(GymEnv(env_name, adv_fraction))
    ## Protagonist policy definition ##
    pro_policy = GaussianMLPPolicy(env_spec=env.spec,
                                   hidden_sizes=layer_size,
                                   is_protagonist=True)
    pro_baseline = LinearFeatureBaseline(env_spec=env.spec)

    ## Zero Adversary for the protagonist training ##
    # Adversary that always outputs 0.0, i.e. no adversarial perturbation.
    zero_adv_policy = ConstantControlPolicy(env_spec=env.spec,
                                            is_protagonist=False,
                                            constant_val=0.0)
Ejemplo n.º 10
0
    def __init__(
            self,
            input_shape,
            output_dim,
            prob_network=None,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=NL.rectify,
            optimizer=None,
            use_trust_region=True,
            step_size=0.01,
            normalize_inputs=True,
            name=None,
    ):
        """
        Categorical MLP regressor: fits a categorical distribution over
        outputs by maximum likelihood, optionally subject to a KL
        trust-region constraint between fits.

        :param input_shape: Shape of the input data.
        :param output_dim: Dimension of output.
        :param prob_network: Custom probability network; an MLP with softmax output is built when None.
        :param hidden_sizes: Number of hidden units of each layer of the mean network.
        :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
        :param optimizer: Optimizer for minimizing the negative log-likelihood.
        :param use_trust_region: Whether to use trust region constraint.
        :param step_size: KL divergence constraint for each iteration
        :param normalize_inputs: Whether to whiten inputs with shared mean/std variables.
        :param name: Optional name for this regressor.
        """
        Serializable.quick_init(self, locals())

        # Default optimizer: penalized L-BFGS when a trust region is used,
        # plain L-BFGS otherwise.
        if optimizer is None:
            if use_trust_region:
                optimizer = PenaltyLbfgsOptimizer()
            else:
                optimizer = LbfgsOptimizer()

        self.output_dim = output_dim
        self._optimizer = optimizer

        if prob_network is None:
            prob_network = MLP(
                input_shape=input_shape,
                output_dim=output_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.softmax,
            )

        l_prob = prob_network.output_layer

        LasagnePowered.__init__(self, [l_prob])

        # Targets are one-hot integer matrices; old probabilities are the
        # previous fit's outputs, used for the KL constraint.
        xs_var = prob_network.input_layer.input_var
        ys_var = TT.imatrix("ys")
        old_prob_var = TT.matrix("old_prob")

        # Normalization statistics, broadcast across the batch axis.
        x_mean_var = theano.shared(
            np.zeros((1,) + input_shape),
            name="x_mean",
            broadcastable=(True,) + (False,) * len(input_shape)
        )
        x_std_var = theano.shared(
            np.ones((1,) + input_shape),
            name="x_std",
            broadcastable=(True,) + (False,) * len(input_shape)
        )

        normalized_xs_var = (xs_var - x_mean_var) / x_std_var

        prob_var = L.get_output(l_prob, {prob_network.input_layer: normalized_xs_var})

        old_info_vars = dict(prob=old_prob_var)
        info_vars = dict(prob=prob_var)

        dist = self._dist = Categorical(output_dim)

        # KL between previous and current fit (trust-region constraint).
        mean_kl = TT.mean(dist.kl_sym(old_info_vars, info_vars))

        # Objective: negative mean log-likelihood of the targets.
        loss = - TT.mean(dist.log_likelihood_sym(ys_var, info_vars))

        # Hard prediction: one-hot of the argmax class.
        predicted = special.to_onehot_sym(TT.argmax(prob_var, axis=1), output_dim)

        self._f_predict = ext.compile_function([xs_var], predicted)
        self._f_prob = ext.compile_function([xs_var], prob_var)
        self._prob_network = prob_network
        self._l_prob = l_prob

        optimizer_args = dict(
            loss=loss,
            target=self,
            network_outputs=[prob_var],
        )

        if use_trust_region:
            optimizer_args["leq_constraint"] = (mean_kl, step_size)
            optimizer_args["inputs"] = [xs_var, ys_var, old_prob_var]
        else:
            optimizer_args["inputs"] = [xs_var, ys_var]

        self._optimizer.update_opt(**optimizer_args)

        self._use_trust_region = use_trust_region
        self._name = name

        self._normalize_inputs = normalize_inputs
        self._x_mean_var = x_mean_var
        self._x_std_var = x_std_var
Ejemplo n.º 11
0
def run_task(v):
    """Build environment, baseline, policy and optimizer from the
    experiment dict *v*, then train an NPO agent.

    Expected keys include: isNormalized, isGymEnv, envName, blType,
    blMlpArch, polNetArch, optimizerArgs, useCG, numBatches, gae_lambda,
    maxPathLength, numIters.
    """
    expDict = v

    ###############################
    # Environment
    if expDict['isNormalized']:
        if expDict['isGymEnv']:
            rawEnv = GymEnv(expDict['envName'],
                            record_video=False,
                            record_log=False)
            env = normalize(rawEnv)
        else:
            env = normalize(expDict['envName'])
        # a normalized env wraps the underlying one
        # dartEnv = env.wrapped_env.env.unwrapped
    else:
        # un-normalized runs must be gym environments
        env = GymEnv(expDict['envName'], record_video=False, record_log=False)

    ###############################
    # Baseline
    if expDict['blType'] == 'linear':
        bl = LinearFeatureBaseline(env_spec=env.spec)
    elif expDict['blType'] == 'MLP':
        # regressor_args configures the MLP regressor (layers, normalization)
        regArgs = {
            'hidden_sizes': expDict['blMlpArch'],
            # only consulted when adaptive_std == True
            'std_hidden_sizes': expDict['blMlpArch'],
            # the regressor normalizes by default; disable both here
            'normalize_inputs': False,
            'normalize_outputs': False,
        }
        # regArgs['adaptive_std'] = True
        # regArgs['learn_std'] = False  # ignored if adaptive_std == true - sets global value which is required for all thread instances
        bl = GaussianMLPBaseline(env_spec=env.spec, regressor_args=regArgs)
    else:
        print('unknown baseline type : ' + expDict['blType'])
        bl = None

    ###############################
    # Policy
    # polNetArch must be a tuple - a single value needs a trailing comma, e.g. (8,)
    pol = GaussianMLPPolicy(env_spec=env.spec,
                            hidden_sizes=expDict['polNetArch'])

    ###############################
    # RL algorithm: TRPO-style CG optimizer or penalized L-BFGS
    optimizerArgs = expDict['optimizerArgs']
    if optimizerArgs is None:
        optimizerArgs = dict()

    if expDict['useCG']:
        optimizer = ConjugateGradientOptimizer(**optimizerArgs)
        print('Using CG optimizer (TRPO)')
    else:
        optimizer = PenaltyLbfgsOptimizer(**optimizerArgs)
        print('Using LBFGS optimizer (PPO-like ?)')

    # NPO ctor also takes step_size (default 0.01) and
    # truncate_local_is_ratio (caps the likelihood ratio when set).
    algo = NPO(optimizer=optimizer,
               env=env,
               policy=pol,
               baseline=bl,
               batch_size=int(expDict['numBatches']),
               whole_paths=True,
               gae_lambda=float(expDict['gae_lambda']),
               max_path_length=int(expDict['maxPathLength']),
               n_itr=int(expDict['numIters']),
               discount=0.99,
               step_size=0.01,
               start_itr=1)

    algo.train()
    def __init__(
        self,
        input_shape,
        output_dim,
        predict_all=False,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=NL.rectify,
        optimizer=None,
        use_trust_region=True,
        step_size=0.01,
        normalize_inputs=True,
        name=None,
    ):
        """
        Bernoulli recurrent (GRU) regressor over trajectories: fits an
        independent-Bernoulli distribution per latent dimension by maximum
        likelihood, optionally constrained by a KL trust region.

        :param input_shape: Shape of the input data.
        :param output_dim: Dimension of output.
        :param predict_all: use the prediction made at every step about the latent variables (not only the last step)
        :param hidden_sizes: Number of hidden units of each layer of the mean network.
        :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
        :param optimizer: Optimizer for minimizing the negative log-likelihood.
        :param use_trust_region: Whether to use trust region constraint.
        :param step_size: KL divergence constraint for each iteration
        :param normalize_inputs: Whether to whiten inputs with shared mean/std variables.
        :param name: Optional name for this regressor.
        """
        Serializable.quick_init(self, locals())

        # Default optimizer: penalized L-BFGS when a trust region is used,
        # plain L-BFGS otherwise.
        if optimizer is None:
            if use_trust_region:
                optimizer = PenaltyLbfgsOptimizer()
            else:
                optimizer = LbfgsOptimizer()

        self.output_dim = output_dim
        self._optimizer = optimizer

        # GRU with sigmoid output: per-step Bernoulli probabilities.
        p_network = GRUNetwork(
            input_shape=input_shape,
            output_dim=output_dim,
            hidden_dim=hidden_sizes[0],
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=NL.sigmoid,
        )

        l_p = p_network.output_layer  # this is every intermediate latent state! but I only care about last

        LasagnePowered.__init__(self, [l_p])

        xs_var = p_network.input_layer.input_var

        ys_var = TT.itensor3("ys")  # this is 3D: (traj, time, lat_dim)
        old_p_var = TT.tensor3("old_p")
        # Normalization statistics broadcast over both the trajectory and
        # time axes.
        x_mean_var = theano.shared(np.zeros((
            1,
            1,
        ) + input_shape),
                                   name="x_mean",
                                   broadcastable=(
                                       True,
                                       True,
                                   ) + (False, ) * len(input_shape))

        x_std_var = theano.shared(np.ones((
            1,
            1,
        ) + input_shape),
                                  name="x_std",
                                  broadcastable=(
                                      True,
                                      True,
                                  ) + (False, ) * len(input_shape))

        normalized_xs_var = (xs_var - x_mean_var) / x_std_var
        # this is the previous p_var, from which I only want the last time-step padded along all time-steps
        p_var_all = L.get_output(l_p,
                                 {p_network.input_layer: normalized_xs_var})
        # take only last dim but keep the shape
        p_var_last = TT.reshape(
            p_var_all[:, -1, :],
            (TT.shape(p_var_all)[0], 1, TT.shape(p_var_all)[2]))
        # padd along the time dimension to obtain the same shape as before
        padded_p = TT.tile(p_var_last, (1, TT.shape(p_var_all)[1], 1))
        # give it the standard name
        if predict_all:
            p_var = p_var_all
        else:
            p_var = padded_p

        old_info_vars = dict(p=old_p_var)
        info_vars = dict(
            p=p_var
        )  # posterior of the latent at every step, wrt obs-act. Same along batch if recurrent

        dist = self._dist = Bernoulli(output_dim)

        # KL between previous and current fit (trust-region constraint).
        mean_kl = TT.mean(dist.kl_sym(old_info_vars, info_vars))

        loss = -TT.mean(dist.log_likelihood_sym(
            ys_var,
            info_vars))  # regressor just wants to min -loglik of data ys

        # Hard prediction: threshold probabilities at 0.5.
        predicted = p_var >= 0.5

        self._f_predict = ext.compile_function([xs_var], predicted)
        self._f_p = ext.compile_function(
            [xs_var], p_var
        )  # for consistency with gauss_mlp_reg this should be ._f_pdists

        self._l_p = l_p

        optimizer_args = dict(
            loss=loss,
            target=self,
            network_outputs=[p_var],
        )

        if use_trust_region:
            optimizer_args["leq_constraint"] = (mean_kl, step_size)
            optimizer_args["inputs"] = [xs_var, ys_var, old_p_var]
        else:
            optimizer_args["inputs"] = [xs_var, ys_var]

        self._optimizer.update_opt(**optimizer_args)

        self._use_trust_region = use_trust_region
        self._name = name

        self._normalize_inputs = normalize_inputs
        self._x_mean_var = x_mean_var
        self._x_std_var = x_std_var
Ejemplo n.º 13
0
    def __init__(
        self,
        input_shape,
        output_dim,
        predict_all=True,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=NL.rectify,
        optimizer=None,
        use_trust_region=True,
        step_size=0.01,
        normalize_inputs=True,
        name=None,
    ):
        """
        Bernoulli MLP regressor: fits an independent-Bernoulli distribution
        per output dimension by maximum likelihood, optionally constrained
        by a KL trust region between fits.

        :param input_shape: Shape of the input data.
        :param output_dim: Dimension of output.
        :param predict_all: accepted for interface parity with the recurrent variant; not read in this body.
        :param hidden_sizes: Number of hidden units of each layer of the mean network.
        :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
        :param optimizer: Optimizer for minimizing the negative log-likelihood.
        :param use_trust_region: Whether to use trust region constraint.
        :param step_size: KL divergence constraint for each iteration
        :param normalize_inputs: Whether to whiten inputs with shared mean/std variables.
        :param name: Optional name for this regressor.
        """
        Serializable.quick_init(self, locals())

        # Default optimizer: penalized L-BFGS when a trust region is used,
        # plain L-BFGS otherwise.
        if optimizer is None:
            if use_trust_region:
                optimizer = PenaltyLbfgsOptimizer()
            else:
                optimizer = LbfgsOptimizer()

        self.output_dim = output_dim
        self._optimizer = optimizer

        # MLP with sigmoid output: per-dimension Bernoulli probabilities.
        p_network = MLP(
            input_shape=input_shape,
            output_dim=output_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=NL.sigmoid,
        )

        l_p = p_network.output_layer

        LasagnePowered.__init__(self, [l_p])

        xs_var = p_network.input_layer.input_var

        ys_var = TT.imatrix("ys")
        old_p_var = TT.matrix("old_p")
        # Normalization statistics, broadcast across the batch axis.
        x_mean_var = theano.shared(np.zeros((1, ) + input_shape),
                                   name="x_mean",
                                   broadcastable=(True, ) +
                                   (False, ) * len(input_shape))
        x_std_var = theano.shared(np.ones((1, ) + input_shape),
                                  name="x_std",
                                  broadcastable=(True, ) +
                                  (False, ) * len(input_shape))

        normalized_xs_var = (xs_var - x_mean_var) / x_std_var

        p_var = L.get_output(l_p, {p_network.input_layer: normalized_xs_var})

        old_info_vars = dict(p=old_p_var)
        info_vars = dict(
            p=p_var
        )  # posterior of the latent at every step, wrt obs-act. Same along batch if recurrent

        dist = self._dist = Bernoulli(output_dim)

        mean_kl = TT.mean(dist.kl_sym(old_info_vars, info_vars))
        self._mean_kl = ext.compile_function(
            [xs_var, old_p_var], mean_kl)  # if not using TR, still log KL

        loss = -TT.mean(dist.log_likelihood_sym(
            ys_var,
            info_vars))  # regressor just wants to min -loglik of data ys

        predicted = p_var >= 0.5  # this gives 0 or 1, depending what is closer to the p_var

        self._f_predict = ext.compile_function([xs_var], predicted)
        self._f_p = ext.compile_function(
            [xs_var], p_var
        )  # for consistency with gauss_mlp_reg this should be ._f_pdists
        self._l_p = l_p

        optimizer_args = dict(
            loss=loss,
            target=self,
            network_outputs=[p_var],
        )

        if use_trust_region:
            optimizer_args["leq_constraint"] = (mean_kl, step_size)
            optimizer_args["inputs"] = [xs_var, ys_var, old_p_var]
        else:
            optimizer_args["inputs"] = [xs_var, ys_var]

        self._optimizer.update_opt(**optimizer_args)

        self._use_trust_region = use_trust_region
        self._name = name

        self._normalize_inputs = normalize_inputs
        self._x_mean_var = x_mean_var
        self._x_std_var = x_std_var
Ejemplo n.º 14
0
def main():
    """Train a TRPO agent with a Gaussian conv policy on a servoing quad
    environment, parsing network/training hyperparameters from the CLI."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--resume_from', type=str)
    parser.add_argument('--encoding_levels', type=int, nargs='+')
    parser.add_argument('--num_encoding_levels', type=int, default=5)
    parser.add_argument('--conv_filters',
                        nargs='*',
                        type=int,
                        default=[16, 16])
    parser.add_argument('--conv_filter_sizes',
                        nargs='*',
                        type=int,
                        default=[4, 4])
    parser.add_argument('--conv_strides', nargs='*', type=int, default=[2, 2])
    parser.add_argument('--hidden_sizes',
                        nargs='*',
                        type=int,
                        default=[32, 32])
    parser.add_argument('--init_std', type=float, default=1.0)
    parser.add_argument('--n_itr', type=int, default=500)
    parser.add_argument('--step_size', type=float, default=0.01)
    parser.add_argument('--batch_size', type=int, default=4000)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--custom_local_flags', type=str, default=None)
    args = parser.parse_args()

    np.random.seed(args.seed)

    # Quadrotor environment tracking a car; translation + yaw action space,
    # single 256x256 camera sensor. The car's own action space is frozen
    # to [0, 0] so only the quad acts.
    env = SimpleQuadPanda3dEnv(action_space=TranslationAxisAngleSpace(
        low=[-10., -10., -10., -1.5707963267948966],
        high=[10., 10., 10., 1.5707963267948966],
        axis=[0., 0., 1.]),
                               sensor_names=['image'],
                               camera_size=[256, 256],
                               camera_hfov=26.007823885645635,
                               car_env_class=GeometricCarPanda3dEnv,
                               car_action_space=BoxSpace(low=[0., 0.],
                                                         high=[0., 0.]),
                               car_model_names=[
                                   'mazda6', 'chevrolet_camaro',
                                   'nissan_gt_r_nismo',
                                   'lamborghini_aventador', 'golf5'
                               ],
                               dt=0.1)
    env = ServoingEnv(env)
    # Observation pipeline: downscale image by half and move channels
    # first; actions are normalized to the env's action space.
    transformers = {
        'image':
        CompositionTransformer([
            ImageTransformer(scale_size=0.5),
            OpsTransformer(transpose=(2, 0, 1))
        ]),
        'action':
        NormalizerTransformer(space=env.action_space)
    }
    env = RllabEnv(env, transformers=transformers)
    env = normalize(env)

    # Conv spec lists must be parallel (one size/stride per filter bank).
    assert len(args.conv_filters) == len(args.conv_filter_sizes)
    assert len(args.conv_filters) == len(args.conv_strides)
    # Shared VGG-style conv network spec, reused for policy mean and baseline.
    network_kwargs = dict(encoding_levels=args.encoding_levels,
                          num_encoding_levels=args.num_encoding_levels,
                          conv_filters=args.conv_filters,
                          conv_filter_sizes=args.conv_filter_sizes,
                          conv_strides=args.conv_strides,
                          conv_pads=[0] * len(args.conv_filters),
                          hidden_sizes=args.hidden_sizes,
                          hidden_nonlinearity=LN.rectify,
                          output_nonlinearity=None,
                          name="mean_network")
    mean_network = VggConvNetwork(input_shape=env.observation_space.shape,
                                  output_dim=env.action_space.flat_dim,
                                  **network_kwargs)

    policy = GaussianConvPolicy(
        env_spec=env.spec,
        init_std=args.init_std,
        mean_network=mean_network,
    )

    # Baseline regressor mirrors the policy network but outputs a scalar
    # value; conv/hidden args are None because the explicit mean_network
    # already fixes the architecture.
    conv_baseline_kwargs = dict(
        env_spec=env.spec,
        regressor_args=dict(
            mean_network=VggConvNetwork(
                input_shape=env.observation_space.shape,
                output_dim=1,
                **network_kwargs),
            use_trust_region=True,
            step_size=args.step_size,
            normalize_inputs=True,
            normalize_outputs=True,
            hidden_sizes=None,
            conv_filters=None,
            conv_filter_sizes=None,
            conv_strides=None,
            conv_pads=None,
            batchsize=200,
            optimizer=PenaltyLbfgsOptimizer(n_slices=50),
        ))
    baseline = GaussianConvBaseline(**conv_baseline_kwargs)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=args.batch_size,
        max_path_length=100,
        n_itr=args.n_itr,
        discount=0.9,
        step_size=args.step_size,
        optimizer=ConjugateGradientOptimizer(num_slices=50),
    )

    # Every-10-iterations snapshots; optionally resume a previous run.
    if args.resume_from:
        run_experiment_lite(algo.train(),
                            snapshot_mode='gap',
                            snapshot_gap=10,
                            seed=args.seed,
                            custom_local_flags=args.custom_local_flags,
                            resume_from=args.resume_from)
    else:
        run_experiment_lite(algo.train(),
                            snapshot_mode='gap',
                            snapshot_gap=10,
                            seed=args.seed,
                            custom_local_flags=args.custom_local_flags)