from collections import OrderedDict

import numpy as np
import theano
import theano.tensor as T

# floatX follows the usual Theano convention; the MLP class is assumed to
# come from a project-local module (its import is not shown here).
floatX = theano.config.floatX


class PolicyModel(object):
    """
    Use a Multi Layer Perceptron to discover a
    policy for a dynamics by gradient descending
    over a single timestep.
    
    """
    @classmethod
    def from_plant(cls, plant, **kwargs):
        return cls(state_size=plant.num_states,
                   policy_size=plant.num_controls,
                   policy_bounds=plant.control_bounds,
                   dynamics=plant.theano_dynamics,
                   **kwargs)

    def __init__(self,
                 state_size=2,
                 policy_size=1,
                 policy_bounds=(-40, 40),
                 dynamics=None,
                 policy_laziness=0.0000001,
                 internal_layers=[],
                 learning_rate=0.01,
                 dropout=0.0,
                 allow_input_downcast=True):
        # constants
        if dynamics is None:
            raise ValueError("Dynamics must be specified")
        self.dynamics = dynamics
        self.state_size = state_size
        self.policy_size = policy_size
        self.policy_bounds = policy_bounds
        self.internal_layers = internal_layers
        self.allow_input_downcast = allow_input_downcast
        self.learning_rate = theano.shared(
            np.float64(learning_rate).astype(floatX), name='learning_rate')
        self.dropout = dropout

        self.policy_laziness = theano.shared(
            np.float64(policy_laziness).astype(floatX), name='policy_laziness')

        self.predict = {}
        self.teleportation = {}
        self.cost = {}
        self.target_value = {}
        self.cost_fun = {}
        # initialization functions
        self.create_variables()
        self.create_misc_functions('test', self._prediction)
        self.create_misc_functions('train', self._prediction_dropout)
        self.create_update_fun()

    def boundsify(self, net_output):
        """
        Map the MLP's tanh output from [-1, 1] onto [u_min, u_max].

        """
        zero_one = (net_output[0] + 1.0) / 2.0
        u_min, u_max = self.policy_bounds
        return u_min + zero_one * (u_max - u_min)

    def acquire_MLP(self):
        """
        Our MLP uses a tanh to decide on the control input of the system.

        """
        self._mlp = MLP(layers=[self.state_size] + self.internal_layers +
                        [self.policy_size],
                        dropout=self.dropout)
        mlp_params, mlp_x, mlp_prediction, mlp_prediction_dropout = \
                self._mlp.get()
        for param_set in mlp_params:
            self.params.extend(param_set)

        self._prediction = self.boundsify(mlp_prediction)
        self._prediction_dropout = self.boundsify(mlp_prediction_dropout)
        self._x = mlp_x

    def create_variables(self):
        self.params = []
        self.acquire_MLP()

    def create_misc_functions(self, name, prediction):
        self.predict[name] = theano.function(
            [self._x],
            prediction,
            allow_input_downcast=self.allow_input_downcast)

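        # Target state for the one-step cost: the cost below is the squared
        # distance between the next state under the current policy and this
        # target.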
        self.target_value[name] = T.vector()
        new_x = self.dynamics(self._x, prediction)
        self.teleportation[name] = theano.function(
            [self._x], new_x, allow_input_downcast=self.allow_input_downcast)

        self.cost[name] = T.sum((new_x - self.target_value[name])**2)
        self.cost_fun[name] = theano.function(
            [self._x, self.target_value[name]],
            self.cost[name],
            allow_input_downcast=self.allow_input_downcast)

    def create_update_fun(self):
        gparams = T.grad(self.cost['train'], self.params)

        self.grad_fun = theano.function(
            [self._x, self.target_value['train']],
            gparams,
            allow_input_downcast=self.allow_input_downcast)

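        # Plain gradient descent: each parameter steps against its gradient,
        # scaled by the shared learning rate.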
        updates = OrderedDict()
        for gparam, param in zip(gparams, self.params):
            updates[param] = param - gparam * self.learning_rate

        self.update_fun = theano.function(
            [self._x, self.target_value['train']],
            self.cost['train'],
            allow_input_downcast=self.allow_input_downcast,
            updates=updates)

    def set_learning_rate(self, rate):
        # Cast to floatX so the value's dtype matches the shared variable.
        self.learning_rate.set_value(np.asarray(rate, dtype=floatX))

    def reset_weights(self):
        for param in self.params:
            param.set_value((
                np.random.standard_normal(param.get_value(borrow=True).shape) *
                (1. / param.get_value(borrow=True).shape[0])).astype(floatX))

    def controller(self):
        def c(x, t):
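            # Guard against diverged simulations: if the state contains NaNs
            # or has blown up numerically, fall back to zero control.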
            if any(np.isnan(x)) or not all(np.abs(x) < 1e100):
                return 0.0
            prediction = self.predict['test'](np.array(x))
            return prediction

        return c
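

# Usage sketch (illustrative only, not part of the original module): train the
# policy to push a toy double integrator toward the origin over a single
# timestep. `toy_dynamics` is a stand-in for `plant.theano_dynamics`; real
# plants would be wrapped via `PolicyModel.from_plant`.
if __name__ == '__main__':
    dt = 0.05

    def toy_dynamics(x, u):
        # x = [position, velocity]; u is the bounded scalar control force.
        return T.stack([x[0] + x[1] * dt,
                        x[1] + u * dt])

    model = PolicyModel(state_size=2,
                        policy_size=1,
                        policy_bounds=(-40.0, 40.0),
                        dynamics=toy_dynamics,
                        internal_layers=[16],
                        learning_rate=0.01)

    target = np.zeros(2, dtype=floatX)
    state = np.array([1.0, 0.0], dtype=floatX)
    for _ in range(100):
        cost = model.update_fun(state, target)
    print("final one-step cost: %f" % cost)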