Code Example #1
def __init__(self, state_dim, control_dim, num_basis_functions, max_action=None):
    # RBF policy as a deterministic GP: random inputs serve as basis-function
    # centres, small random targets as the initial weights
    MGPR.__init__(self,
        np.random.randn(num_basis_functions, state_dim),
        0.1 * np.random.randn(num_basis_functions, control_dim)
    )
    for model in self.models:
        # fix the kernel variance so it is not optimised
        model.kern.variance = 1.0
        model.kern.variance.trainable = False
    self.max_action = max_action
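A minimal usage sketch for this constructor (hypothetical: the dimensions and max_action value are made up, and the class is assumed to be the MGPR-based RbfController policy that this __init__ belongs to):

# Hypothetical example; RbfController is assumed to be the class whose
# __init__ is shown above, here with a 3D state and 1D control.
controller = RbfController(state_dim=3, control_dim=1,
                           num_basis_functions=10, max_action=2.0)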
Code Example #2
File: pilco.py Project: jastfkjg/RL-test
    def __init__(self, X, Y, num_induced_points=None, controller=None,
                reward=None, m_init=None, S_init=None, name=None, debug=False):
        # super(PILCO, self).__init__(name)
        if not num_induced_points:      # num_induced_points ?
            self.mgpr = MGPR(X, Y)
        else:
            self.mgpr = SMGPR(X, Y, num_induced_points)
        self.state_dim = Y.shape[1]
        self.control_dim = X.shape[1] - Y.shape[1]

        self.sess = gpflow.get_default_session()
        if debug:
            self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
        # self.sess.run(tf.global_variables_initializer())

        if controller is None:   # the policy  - to change
            raise ValueError("controller cannot be None")
        else:
            self.controller = controller

        if reward is None:     # reward function
            self.reward = Reward()
        else:
            self.reward = reward
        
        if m_init is None or S_init is None:
            # If the user has not provided an initial state for the rollouts,
            # then define it as the first state in the dataset.
            self.m_init = X[0:1, 0:self.state_dim]
            self.S_init = np.diag(np.ones(self.state_dim) * 0.1)  # variance
        else:
            self.m_init = m_init
            self.S_init = S_init
Code Example #3
File: pilco.py Project: AZdet/pilco_lynxmotion
class PILCO:
    def __init__(self, X, Y, horizon=30, m_init=None, S_init=None):
        self.mgpr = MGPR(X, Y)

        self.state_dim = Y.shape[1]
        self.control_dim = X.shape[1] - Y.shape[1]
        self.horizon = horizon

        self.controller = LinearController(self.state_dim, self.control_dim)
        self.reward = ExponentialReward(self.state_dim)

        if m_init is None or S_init is None:
            # default initial state for the rollouts is the first state in the dataset.
            self.m_init = X[0:1, 0:self.state_dim]
            self.S_init = np.diag(np.ones(self.state_dim) * 0.1)
        else:
            self.m_init = m_init
            self.S_init = S_init
        self.m_init = torch.tensor(self.m_init, dtype=torch.float32)
        self.S_init = torch.tensor(self.S_init, dtype=torch.float32)
        self.optimizer = torch.optim.Adam(self.controller.parameters())

    def optimize_models(self, maxiter=200):
        '''
        Optimize GP models
        '''
        self.mgpr.optimize(max_iter=maxiter)
        self.mgpr.eval()
        # print learned dynamics model parameters
        lengthscales = {}
        variances = {}
        noises = {}
        i = 0
        for model in self.mgpr.models:
            lengthscales[
                'GP' +
                str(i)] = model.covar_module.base_kernel.lengthscale.detach(
                ).numpy().ravel()
            variances['GP' + str(i)] = np.array(
                [model.covar_module.outputscale.item()])
            noises['GP' + str(i)] = np.array([model.likelihood.noise.item()])
            i += 1
        print('-----Learned models------')
        pd.set_option('display.precision', 3)
        print('---Lengthscales---')
        print(pd.DataFrame(data=lengthscales))
        print('---Variances---')
        print(pd.DataFrame(data=variances))
        print('---Noises---')
        print(pd.DataFrame(data=noises))

    def optimize_policy(self, maxiter=50):
        '''
        Optimize the controller's parameters
        '''
        self.mgpr.eval()
        start = time.time()
        for i in range(maxiter):
            self.optimizer.zero_grad()
            reward = self.compute_reward()  # policy evaluation
            loss = -reward
            loss.backward()  # policy improvement by policy gradient
            self.optimizer.step()
        end = time.time()
        print(
            "Controller's optimization: done in %.1f seconds with reward=%.3f."
            % (end - start, self.compute_reward()))

    def compute_action(self, x_m):
        x_m = torch.tensor(x_m, dtype=torch.float32)
        x_s = torch.zeros((self.state_dim, self.state_dim),
                          dtype=torch.float32)
        return self.controller((x_m, x_s))[0].detach().numpy()

    def predict(self, m_x, s_x, n):
        '''
        predict n steps with learned model
        '''
        reward = 0
        for _ in range(n):
            m_x, s_x = self.propagate(m_x, s_x)
            reward += self.reward.compute_reward(m_x, s_x)
        return m_x, s_x, reward

    def propagate(self, m_x, s_x):
        ''' 
        propagate from one state distribution to the next one with controller and GP models
        '''
        # from state x to control u
        m_u, s_u, c_xu = self.controller((m_x, s_x))
        # joint distribution of x and u
        m = torch.cat([m_x, m_u], axis=1)
        s1 = torch.cat([s_x, s_x @ c_xu], axis=1)
        s2 = torch.cat([(s_x @ c_xu).T, s_u], axis=1)
        s = torch.cat([s1, s2], axis=0)
        # go to next state by moment matching
        M_dx, S_dx, C_dx = self.mgpr.predict_on_noisy_inputs(m, s)
        M_x = M_dx + m_x

        S_x = S_dx + s_x + s1 @ C_dx + C_dx.T @ s1.T

        # reshape returns a new tensor, so reassign to keep the expected shapes
        M_x = M_x.reshape(1, self.state_dim)
        S_x = S_x.reshape(self.state_dim, self.state_dim)
        return M_x, S_x

    def compute_reward(self):
        reward = self.predict(self.m_init, self.S_init, self.horizon)[2]
        return reward
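A rough driver loop for the class above (a sketch only: the data shapes and iteration counts are illustrative, and in practice X would hold [state, action] pairs and Y the observed state differences collected from an environment; MGPR, LinearController and ExponentialReward from the same project are assumed to be importable):

# Hypothetical usage of the PyTorch PILCO class above.
import numpy as np

state_dim, control_dim = 3, 1
X = np.random.randn(50, state_dim + control_dim).astype(np.float32)  # placeholder rollouts
Y = np.random.randn(50, state_dim).astype(np.float32)                # placeholder state deltas

pilco = PILCO(X, Y, horizon=30)
for _ in range(5):                      # alternate model learning and policy improvement
    pilco.optimize_models(maxiter=200)
    pilco.optimize_policy(maxiter=50)
u = pilco.compute_action(X[0:1, :state_dim])   # action for a single state mean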
Code Example #4
def __init__(self, X, Y, num_induced_points, name=None):
    gpflow.Parameterized.__init__(self, name)
    # store the number of inducing points, then initialise the underlying MGPR
    self.num_induced_points = num_induced_points
    MGPR.__init__(self, X, Y, name)
Code Example #5
class PILCO(gpflow.models.Model):
    def __init__(self,
                 X,
                 Y,
                 num_induced_points=None,
                 horizon=100,
                 controller=None,
                 reward=None,
                 m_init=None,
                 S_init=None,
                 name=None):
        super(PILCO, self).__init__(name)
        self.mgpr = MGPR(X, Y)
        self.state_dim = Y.shape[1]
        self.control_dim = X.shape[1] - Y.shape[1]
        self.horizon = horizon
        self.controller = controller
        self.reward = reward
        self.m_init = X[0:1, 0:self.state_dim]
        self.S_init = np.diag(np.ones(self.state_dim) * 0.1)
        self.optimizer = None

    @gpflow.name_scope('likelihood')
    def _build_likelihood(self):
        # This is for tuning controller's parameters
        reward = self.predict(self.m_init, self.S_init, self.horizon)[2]
        return reward

    def optimize_models(self, maxiter=200, restarts=1):
        '''
        Optimize GP models
        '''
        self.mgpr.optimize(restarts=restarts)
        # Print the resulting model parameters
        lengthscales = {}
        variances = {}
        noises = {}
        i = 0
        for model in self.mgpr.models:
            lengthscales['GP' + str(i)] = model.kern.lengthscales.value
            variances['GP' + str(i)] = np.array([model.kern.variance.value])
            noises['GP' + str(i)] = np.array([model.likelihood.variance.value])
            i += 1
        print('-----Learned models------')
        pd.set_option('display.precision', 3)
        print('---Lengthscales---')
        print(pd.DataFrame(data=lengthscales))
        print('---Variances---')
        print(pd.DataFrame(data=variances))
        print('---Noises---')
        print(pd.DataFrame(data=noises))

    def optimize_policy(self, maxiter=30, restarts=0):
        '''
        Optimize the controller's parameters
        '''
        start = time.time()
        if not self.optimizer:
            # First call: build a SciPy L-BFGS-B optimizer over the policy parameters
            self.optimizer = gpflow.train.ScipyOptimizer(method="L-BFGS-B")
            self.optimizer.minimize(self, maxiter=maxiter)
            session = self.optimizer._model.enquire_session(None)
        else:
            # Later calls: reuse the existing optimizer in its session
            session = self.optimizer._model.enquire_session(None)
            self.optimizer._optimizer.minimize(
                session=session,
                feed_dict=self.optimizer._gen_feed_dict(self.optimizer._model,
                                                        None),
                step_callback=None)
        end = time.time()
        print(
            "Controller's optimization: done in %.1f seconds with reward=%.3f."
            % (end - start, self.compute_reward()))
        best_parameters = self.read_values(session=session)
        self.assign(best_parameters)

    @gpflow.autoflow((float_type, [None, None]))
    def compute_action(self, x_m):
        return self.controller.compute_action(
            x_m, tf.zeros([self.state_dim, self.state_dim], float_type))[0]

    def predict(self, m_x, s_x, n):
        loop_vars = [
            tf.constant(0, tf.int32), m_x, s_x,
            tf.constant([[0]], float_type)
        ]

        _, m_x, s_x, reward = tf.while_loop(
            # Termination condition
            lambda j, m_x, s_x, reward: j < n,
            # Body function
            lambda j, m_x, s_x, reward:
            (j + 1, *self.propagate(m_x, s_x),
             tf.add(reward,
                    self.reward.compute_reward(m_x, s_x)[0])),
            loop_vars)

        return m_x, s_x, reward

    def propagate(self, m_x, s_x):
        m_u, s_u, c_xu = self.controller.compute_action(m_x, s_x)

        m = tf.concat([m_x, m_u], axis=1)
        s1 = tf.concat([s_x, s_x @ c_xu], axis=1)
        s2 = tf.concat([tf.transpose(s_x @ c_xu), s_u], axis=1)
        s = tf.concat([s1, s2], axis=0)

        M_dx, S_dx, C_dx = self.mgpr.predict_on_noisy_inputs(m, s)
        M_x = M_dx + m_x
        S_x = S_dx + s_x + s1 @ C_dx + tf.matmul(
            C_dx, s1, transpose_a=True, transpose_b=True)

        # While-loop requires the shapes of the outputs to be fixed
        M_x.set_shape([1, self.state_dim])
        S_x.set_shape([self.state_dim, self.state_dim])
        return M_x, S_x

    @gpflow.autoflow()
    def compute_reward(self):
        return self._build_likelihood()
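Unlike the PyTorch variant, this class takes the controller and reward as constructor arguments (they default to None with no fallback), so a wiring sketch might look as follows; RbfController and ExponentialReward are assumed to be the classes from the earlier examples, and all data are placeholders:

# Hypothetical setup for the gpflow-based PILCO class above.
import numpy as np

state_dim, control_dim = 3, 1
X = np.random.rand(50, state_dim + control_dim)   # placeholder [state, action] data
Y = np.random.rand(50, state_dim)                 # placeholder state differences

controller = RbfController(state_dim, control_dim, num_basis_functions=10)
reward = ExponentialReward(state_dim)
pilco = PILCO(X, Y, horizon=40, controller=controller, reward=reward)

pilco.optimize_models()    # fit the GP dynamics models
pilco.optimize_policy()    # maximise predicted cumulative reward over the horizon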