Code Example #1
File: pilco.py  Project: AZdet/pilco_lynxmotion
import time

import numpy as np
import pandas as pd
import torch

# MGPR (the multi-output GP dynamics model), LinearController and
# ExponentialReward are defined elsewhere in this project.


class PILCO:
    def __init__(self, X, Y, horizon=30, m_init=None, S_init=None):
        self.mgpr = MGPR(X, Y)

        self.state_dim = Y.shape[1]
        self.control_dim = X.shape[1] - Y.shape[1]
        self.horizon = horizon

        self.controller = LinearController(self.state_dim, self.control_dim)
        self.reward = ExponentialReward(self.state_dim)

        if m_init is None or S_init is None:
            # default initial state for the rollouts is the first state in the dataset.
            self.m_init = X[0:1, 0:self.state_dim]
            self.S_init = np.diag(np.ones(self.state_dim) * 0.1)
        else:
            self.m_init = m_init
            self.S_init = S_init
        self.m_init = torch.tensor(self.m_init, dtype=torch.float32)
        self.S_init = torch.tensor(self.S_init, dtype=torch.float32)
        self.optimizer = torch.optim.Adam(self.controller.parameters())

    def optimize_models(self, maxiter=200):
        '''
        Optimize GP models
        '''
        self.mgpr.optimize(max_iter=maxiter)
        self.mgpr.eval()
        # print learned dynamics model parameters
        lengthscales = {}
        variances = {}
        noises = {}
        for i, model in enumerate(self.mgpr.models):
            lengthscales['GP' + str(i)] = (
                model.covar_module.base_kernel.lengthscale.detach().numpy().ravel())
            variances['GP' + str(i)] = np.array(
                [model.covar_module.outputscale.item()])
            noises['GP' + str(i)] = np.array([model.likelihood.noise.item()])
        print('-----Learned models------')
        pd.set_option('display.precision', 3)
        print('---Lengthscales---')
        print(pd.DataFrame(data=lengthscales))
        print('---Variances---')
        print(pd.DataFrame(data=variances))
        print('---Noises---')
        print(pd.DataFrame(data=noises))

    def optimize_policy(self, maxiter=50):
        '''
        Optimize the controller's parameters
        '''
        self.mgpr.eval()
        start = time.time()
        for i in range(maxiter):
            self.optimizer.zero_grad()
            reward = self.compute_reward()  # policy evaluation
            loss = -reward
            loss.backward()  # policy improvement by policy gradient
            self.optimizer.step()
        end = time.time()
        print(
            "Controller's optimization: done in %.1f seconds with reward=%.3f."
            % (end - start, self.compute_reward().item()))

    def compute_action(self, x_m):
        x_m = torch.tensor(x_m, dtype=torch.float32)
        x_s = torch.zeros((self.state_dim, self.state_dim),
                          dtype=torch.float32)
        return self.controller((x_m, x_s))[0].detach().numpy()

    def predict(self, m_x, s_x, n):
        '''
        predict n steps with learned model
        '''
        reward = 0
        for _ in range(n):
            m_x, s_x = self.propagate(m_x, s_x)
            reward += self.reward.compute_reward(m_x, s_x)
        return m_x, s_x, reward

    def propagate(self, m_x, s_x):
        ''' 
        propagate from one state distribution to the next one with controller and GP models
        '''
        # from state x to control u
        m_u, s_u, c_xu = self.controller((m_x, s_x))
        # joint distribution of x and u
        m = torch.cat([m_x, m_u], axis=1)
        s1 = torch.cat([s_x, s_x @ c_xu], axis=1)
        s2 = torch.cat([(s_x @ c_xu).T, s_u], axis=1)
        s = torch.cat([s1, s2], axis=0)
        # go to next state by moment matching
        M_dx, S_dx, C_dx = self.mgpr.predict_on_noisy_inputs(m, s)
        M_x = M_dx + m_x

        S_x = S_dx + s_x + s1 @ C_dx + C_dx.T @ s1.T

        # Enforce the expected shapes; torch's reshape is not in-place, so reassign.
        M_x = M_x.reshape(1, self.state_dim)
        S_x = S_x.reshape(self.state_dim, self.state_dim)
        return M_x, S_x

    def compute_reward(self):
        reward = self.predict(self.m_init, self.S_init, self.horizon)[2]
        return reward
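For context, a minimal usage sketch of this PyTorch variant might look as follows. The dimensions, the random stand-in data and the number of training rounds are purely illustrative assumptions, not taken from the project.

# Illustrative only: random data stands in for real rollouts of the system.
import numpy as np

state_dim, control_dim = 4, 1
X = np.random.randn(50, state_dim + control_dim)   # rows of [state, control]
Y = np.random.randn(50, state_dim)                 # observed state differences

pilco = PILCO(X, Y, horizon=30)
for _ in range(5):                      # alternate model fitting and policy search
    pilco.optimize_models(maxiter=200)  # fit the GP dynamics models
    pilco.optimize_policy(maxiter=50)   # gradient ascent on the predicted reward
u = pilco.compute_action(X[0:1, 0:state_dim])      # action for a single state row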
Code Example #2
import time

import gpflow
import numpy as np
import pandas as pd
import tensorflow as tf

# MGPR (the multi-output GP dynamics model) is defined elsewhere in this
# project; float_type is conventionally taken from the GPflow settings:
float_type = gpflow.settings.float_type


class PILCO(gpflow.models.Model):
    def __init__(self,
                 X,
                 Y,
                 num_induced_points=None,
                 horizon=100,
                 controller=None,
                 reward=None,
                 m_init=None,
                 S_init=None,
                 name=None):
        super(PILCO, self).__init__(name)
        self.mgpr = MGPR(X, Y)
        self.state_dim = Y.shape[1]
        self.control_dim = X.shape[1] - Y.shape[1]
        self.horizon = horizon
        self.controller = controller
        self.reward = reward
        if m_init is None or S_init is None:
            # Default initial state for the rollouts: the first state in the dataset.
            self.m_init = X[0:1, 0:self.state_dim]
            self.S_init = np.diag(np.ones(self.state_dim) * 0.1)
        else:
            self.m_init = m_init
            self.S_init = S_init
        self.optimizer = None

    @gpflow.name_scope('likelihood')
    def _build_likelihood(self):
        # This is for tuning controller's parameters
        reward = self.predict(self.m_init, self.S_init, self.horizon)[2]
        return reward

    def optimize_models(self, maxiter=200, restarts=1):
        '''
        Optimize GP models
        '''
        self.mgpr.optimize(restarts=restarts)
        # Print the resulting model parameters
        lengthscales = {}
        variances = {}
        noises = {}
        for i, model in enumerate(self.mgpr.models):
            lengthscales['GP' + str(i)] = model.kern.lengthscales.value
            variances['GP' + str(i)] = np.array([model.kern.variance.value])
            noises['GP' + str(i)] = np.array([model.likelihood.variance.value])
        print('-----Learned models------')
        pd.set_option('display.precision', 3)
        print('---Lengthscales---')
        print(pd.DataFrame(data=lengthscales))
        print('---Variances---')
        print(pd.DataFrame(data=variances))
        print('---Noises---')
        print(pd.DataFrame(data=noises))

    def optimize_policy(self, maxiter=30, restarts=0):
        '''
        Optimize the controller's parameters
        '''
        start = time.time()
        if not self.optimizer:
            # First call: build the ScipyOptimizer and run a full optimization.
            self.optimizer = gpflow.train.ScipyOptimizer(method="L-BFGS-B")
            self.optimizer.minimize(self, maxiter=maxiter)
        else:
            # Subsequent calls: reuse the optimizer's low-level SciPy interface
            # and its existing session.
            session = self.optimizer._model.enquire_session(None)
            self.optimizer._optimizer.minimize(
                session=session,
                feed_dict=self.optimizer._gen_feed_dict(self.optimizer._model,
                                                        None),
                step_callback=None)
        end = time.time()
        print(
            "Controller's optimization: done in %.1f seconds with reward=%.3f."
            % (end - start, self.compute_reward()))
        session = self.optimizer._model.enquire_session(None)
        best_parameters = self.read_values(session=session)
        self.assign(best_parameters)

    @gpflow.autoflow((float_type, [None, None]))
    def compute_action(self, x_m):
        return self.controller.compute_action(
            x_m, tf.zeros([self.state_dim, self.state_dim], float_type))[0]

    def predict(self, m_x, s_x, n):
        loop_vars = [
            tf.constant(0, tf.int32), m_x, s_x,
            tf.constant([[0]], float_type)
        ]

        _, m_x, s_x, reward = tf.while_loop(
            # Termination condition
            lambda j, m_x, s_x, reward: j < n,
            # Body function
            lambda j, m_x, s_x, reward:
            (j + 1, *self.propagate(m_x, s_x),
             tf.add(reward,
                    self.reward.compute_reward(m_x, s_x)[0])),
            loop_vars)

        return m_x, s_x, reward

    def propagate(self, m_x, s_x):
        m_u, s_u, c_xu = self.controller.compute_action(m_x, s_x)

        m = tf.concat([m_x, m_u], axis=1)
        s1 = tf.concat([s_x, s_x @ c_xu], axis=1)
        s2 = tf.concat([tf.transpose(s_x @ c_xu), s_u], axis=1)
        s = tf.concat([s1, s2], axis=0)

        M_dx, S_dx, C_dx = self.mgpr.predict_on_noisy_inputs(m, s)
        M_x = M_dx + m_x
        S_x = S_dx + s_x + s1 @ C_dx + tf.matmul(
            C_dx, s1, transpose_a=True, transpose_b=True)

        # While-loop requires the shapes of the outputs to be fixed
        M_x.set_shape([1, self.state_dim])
        S_x.set_shape([self.state_dim, self.state_dim])
        return M_x, S_x

    @gpflow.autoflow()
    def compute_reward(self):
        return self._build_likelihood()
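A comparable sketch for this GPflow 1.x variant follows. Because this version does not build default controller and reward objects, they must be supplied; the LinearController and ExponentialReward constructor signatures are assumed to match the companion classes used in Example #1, and the data is again an illustrative stand-in.

# Illustrative only: random data stands in for real rollouts of the system.
import numpy as np

state_dim, control_dim = 4, 1
X = np.random.randn(50, state_dim + control_dim)   # rows of [state, control]
Y = np.random.randn(50, state_dim)                 # observed state differences

controller = LinearController(state_dim, control_dim)  # assumed signature
reward = ExponentialReward(state_dim)                  # assumed signature

pilco = PILCO(X, Y, horizon=100, controller=controller, reward=reward)
pilco.optimize_models(restarts=1)      # fit the GP dynamics models
pilco.optimize_policy(maxiter=30)      # L-BFGS-B on the predicted return
u = pilco.compute_action(X[0:1, 0:state_dim])      # wrapped by @gpflow.autoflow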