# PILCO in PyTorch (GPyTorch dynamics models). MGPR, LinearController and
# ExponentialReward are defined elsewhere in this codebase.
import time

import numpy as np
import pandas as pd
import torch


class PILCO:
    def __init__(self, X, Y, horizon=30, m_init=None, S_init=None):
        self.mgpr = MGPR(X, Y)
        self.state_dim = Y.shape[1]
        self.control_dim = X.shape[1] - Y.shape[1]
        self.horizon = horizon
        self.controller = LinearController(self.state_dim, self.control_dim)
        self.reward = ExponentialReward(self.state_dim)
        if m_init is None or S_init is None:
            # The default initial state for the rollouts is the first state
            # in the dataset, with a small diagonal covariance.
            self.m_init = X[0:1, 0:self.state_dim]
            self.S_init = np.diag(np.ones(self.state_dim) * 0.1)
        else:
            self.m_init = m_init
            self.S_init = S_init
        self.m_init = torch.tensor(self.m_init, dtype=torch.float32)
        self.S_init = torch.tensor(self.S_init, dtype=torch.float32)
        self.optimizer = torch.optim.Adam(self.controller.parameters())

    def optimize_models(self, maxiter=200):
        '''Optimize the GP dynamics models.'''
        self.mgpr.optimize(max_iter=maxiter)
        self.mgpr.eval()
        # Print the learned dynamics-model hyperparameters.
        lengthscales = {}
        variances = {}
        noises = {}
        for i, model in enumerate(self.mgpr.models):
            lengthscales['GP' + str(i)] = (
                model.covar_module.base_kernel.lengthscale.detach().numpy().ravel())
            variances['GP' + str(i)] = np.array(
                [model.covar_module.outputscale.item()])
            noises['GP' + str(i)] = np.array([model.likelihood.noise.item()])
        print('-----Learned models------')
        pd.set_option('display.precision', 3)
        print('---Lengthscales---')
        print(pd.DataFrame(data=lengthscales))
        print('---Variances---')
        print(pd.DataFrame(data=variances))
        print('---Noises---')
        print(pd.DataFrame(data=noises))

    def optimize_policy(self, maxiter=50):
        '''Optimize the controller's parameters.'''
        self.mgpr.eval()
        start = time.time()
        for _ in range(maxiter):
            self.optimizer.zero_grad()
            reward = self.compute_reward()  # policy evaluation
            loss = -reward
            loss.backward()  # policy improvement by gradient ascent on the reward
            self.optimizer.step()
        end = time.time()
        print("Controller's optimization: done in %.1f seconds with reward=%.3f."
              % (end - start, self.compute_reward()))

    def compute_action(self, x_m):
        '''Compute a single action for a deterministic state x_m.'''
        x_m = torch.tensor(x_m, dtype=torch.float32)
        x_s = torch.zeros((self.state_dim, self.state_dim), dtype=torch.float32)
        return self.controller((x_m, x_s))[0].detach().numpy()

    def predict(self, m_x, s_x, n):
        '''Predict n steps ahead with the learned model, accumulating reward.'''
        reward = 0
        for _ in range(n):
            m_x, s_x = self.propagate(m_x, s_x)
            reward += self.reward.compute_reward(m_x, s_x)
        return m_x, s_x, reward

    def propagate(self, m_x, s_x):
        '''Propagate one state distribution to the next with the controller
        and the GP models.'''
        # From state x to control u.
        m_u, s_u, c_xu = self.controller((m_x, s_x))
        # Joint distribution of x and u.
        m = torch.cat([m_x, m_u], dim=1)
        s1 = torch.cat([s_x, s_x @ c_xu], dim=1)
        s2 = torch.cat([(s_x @ c_xu).T, s_u], dim=1)
        s = torch.cat([s1, s2], dim=0)
        # Move to the next state by moment matching.
        M_dx, S_dx, C_dx = self.mgpr.predict_on_noisy_inputs(m, s)
        M_x = M_dx + m_x
        S_x = S_dx + s_x + s1 @ C_dx + C_dx.T @ s1.T
        # torch reshape is not in-place, so assign the results back.
        M_x = M_x.reshape(1, self.state_dim)
        S_x = S_x.reshape(self.state_dim, self.state_dim)
        return M_x, S_x

    def compute_reward(self):
        return self.predict(self.m_init, self.S_init, self.horizon)[2]
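# ---------------------------------------------------------------------------
# A minimal usage sketch for the PyTorch class above, not part of the
# implementation. It assumes a gym-style environment with the classic
# (obs, reward, done, info) step API; `rollout` is a hypothetical helper in
# the spirit of the example scripts that usually accompany PILCO code.

def rollout(env, policy, timesteps):
    # Collect (state-action, state-difference) pairs: the (X, Y) format the
    # GP dynamics models above are trained on.
    X, Y = [], []
    x = env.reset()
    for _ in range(timesteps):
        u = env.action_space.sample() if policy is None else policy(x[None, :])[0]
        x_new, _, done, _ = env.step(u)
        X.append(np.hstack((x, u)))
        Y.append(x_new - x)
        x = x_new
        if done:
            break
    return np.stack(X), np.stack(Y)

# X, Y = rollout(env, policy=None, timesteps=40)        # exploratory data
# for _ in range(5):                                    # PILCO iterations
#     pilco = PILCO(X, Y, horizon=30)                   # refit on all data
#     pilco.optimize_models()                           # fit GP dynamics
#     pilco.optimize_policy()                           # improve controller
#     X_, Y_ = rollout(env, pilco.compute_action, 40)   # on-policy data
#     X, Y = np.vstack((X, X_)), np.vstack((Y, Y_))
# ---------------------------------------------------------------------------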
# The same PILCO class in GPflow 1.x / TensorFlow 1.x graph mode. Here the
# controller and reward are injected through the constructor, and policy
# optimization runs through gpflow's ScipyOptimizer instead of Adam. MGPR is
# the GPflow multi-output GP wrapper defined elsewhere in this codebase.
import time

import numpy as np
import pandas as pd
import tensorflow as tf
import gpflow

float_type = gpflow.settings.float_type  # tf.float64 by default in GPflow 1.x


class PILCO(gpflow.models.Model):
    def __init__(self, X, Y, num_induced_points=None, horizon=100,
                 controller=None, reward=None, m_init=None, S_init=None,
                 name=None):
        super(PILCO, self).__init__(name)
        # num_induced_points is accepted for interface parity with sparse-GP
        # variants; this listing always uses the dense MGPR.
        self.mgpr = MGPR(X, Y)
        self.state_dim = Y.shape[1]
        self.control_dim = X.shape[1] - Y.shape[1]
        self.horizon = horizon
        self.controller = controller
        self.reward = reward
        if m_init is None or S_init is None:
            # The default initial state for the rollouts is the first state
            # in the dataset, with a small diagonal covariance.
            self.m_init = X[0:1, 0:self.state_dim]
            self.S_init = np.diag(np.ones(self.state_dim) * 0.1)
        else:
            self.m_init = m_init
            self.S_init = S_init
        self.optimizer = None

    @gpflow.name_scope('likelihood')
    def _build_likelihood(self):
        # The "likelihood" that gpflow maximizes when tuning the controller's
        # parameters is the predicted cumulative reward over the horizon.
        reward = self.predict(self.m_init, self.S_init, self.horizon)[2]
        return reward

    def optimize_models(self, maxiter=200, restarts=1):
        '''Optimize the GP dynamics models.'''
        self.mgpr.optimize(restarts=restarts)
        # Print the resulting model hyperparameters.
        lengthscales = {}
        variances = {}
        noises = {}
        for i, model in enumerate(self.mgpr.models):
            lengthscales['GP' + str(i)] = model.kern.lengthscales.value
            variances['GP' + str(i)] = np.array([model.kern.variance.value])
            noises['GP' + str(i)] = np.array([model.likelihood.variance.value])
        print('-----Learned models------')
        pd.set_option('display.precision', 3)
        print('---Lengthscales---')
        print(pd.DataFrame(data=lengthscales))
        print('---Variances---')
        print(pd.DataFrame(data=variances))
        print('---Noises---')
        print(pd.DataFrame(data=noises))

    def optimize_policy(self, maxiter=30):
        '''Optimize the controller's parameters.'''
        start = time.time()
        if not self.optimizer:
            # First call: build the optimizer together with its graph.
            self.optimizer = gpflow.train.ScipyOptimizer(method="L-BFGS-B")
            self.optimizer.minimize(self, maxiter=maxiter)
        else:
            # Later calls: reuse the existing session and graph.
            session = self.optimizer._model.enquire_session(None)
            self.optimizer._optimizer.minimize(
                session=session,
                feed_dict=self.optimizer._gen_feed_dict(
                    self.optimizer._model, None),
                step_callback=None)
            # Pull the optimized values back into the gpflow model object.
            best_parameters = self.read_values(session=session)
            self.assign(best_parameters)
        end = time.time()
        print("Controller's optimization: done in %.1f seconds with reward=%.3f."
              % (end - start, self.compute_reward()))

    @gpflow.autoflow((float_type, [None, None]))
    def compute_action(self, x_m):
        # Act on the mean of the state with zero state uncertainty.
        return self.controller.compute_action(
            x_m, tf.zeros([self.state_dim, self.state_dim], float_type))[0]

    def predict(self, m_x, s_x, n):
        '''Predict n steps ahead inside a tf.while_loop, accumulating the
        expected reward.'''
        loop_vars = [
            tf.constant(0, tf.int32), m_x, s_x,
            tf.constant([[0]], float_type)
        ]
        _, m_x, s_x, reward = tf.while_loop(
            # Termination condition.
            lambda j, m_x, s_x, reward: j < n,
            # Body: propagate the state and add the reward of the current state.
            lambda j, m_x, s_x, reward: (
                j + 1,
                *self.propagate(m_x, s_x),
                tf.add(reward, self.reward.compute_reward(m_x, s_x)[0])),
            loop_vars)
        return m_x, s_x, reward

    def propagate(self, m_x, s_x):
        '''Propagate one state distribution to the next with the controller
        and the GP models.'''
        # From state x to control u.
        m_u, s_u, c_xu = self.controller.compute_action(m_x, s_x)
        # Joint distribution of x and u.
        m = tf.concat([m_x, m_u], axis=1)
        s1 = tf.concat([s_x, s_x @ c_xu], axis=1)
        s2 = tf.concat([tf.transpose(s_x @ c_xu), s_u], axis=1)
        s = tf.concat([s1, s2], axis=0)
        # Move to the next state by moment matching.
        M_dx, S_dx, C_dx = self.mgpr.predict_on_noisy_inputs(m, s)
        M_x = M_dx + m_x
        S_x = S_dx + s_x + s1 @ C_dx + tf.matmul(
            C_dx, s1, transpose_a=True, transpose_b=True)
        # tf.while_loop requires the output shapes to be fixed.
        M_x.set_shape([1, self.state_dim])
        S_x.set_shape([self.state_dim, self.state_dim])
        return M_x, S_x

    @gpflow.autoflow()
    def compute_reward(self):
        return self._build_likelihood()
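# ---------------------------------------------------------------------------
# A minimal usage sketch for the GPflow-1.x class above, again illustrative
# rather than definitive. Unlike the PyTorch version, the controller and
# reward are injected through the constructor; RbfController and
# ExponentialReward (and the num_basis_functions argument) are assumed to
# come from the accompanying codebase, and `rollout` is the same hypothetical
# helper sketched after the PyTorch listing.

# X, Y = rollout(env, policy=None, timesteps=40)
# state_dim, control_dim = Y.shape[1], X.shape[1] - Y.shape[1]
# controller = RbfController(state_dim, control_dim, num_basis_functions=10)
# reward = ExponentialReward(state_dim)
# pilco = PILCO(X, Y, controller=controller, reward=reward, horizon=40)
# pilco.optimize_models(restarts=2)     # fit the GP dynamics models
# pilco.optimize_policy(maxiter=30)     # L-BFGS-B over the rollout reward
# u = pilco.compute_action(x[None, :])  # autoflow wraps the tf graph call
# ---------------------------------------------------------------------------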