Example #1
    def update(self, X, U, cs, **kwargs):
        """Perform DDPG update step."""
        M, N, T, _ = X.shape

        # Store each (s_t, a_t, r_t, s_{t+1}) pair as an off-policy transition
        self.pol.store_transition(
            X[:, :, :-1].reshape(M * N * (T - 1), self.dX),  # states s_t
            U[:, :, :-1].reshape(M * N * (T - 1), self.dU),  # actions a_t
            -cs[:, :, :-1].reshape(M * N * (T - 1)),         # rewards (negated costs)
            X[:, :, 1:].reshape(M * N * (T - 1), self.dX),   # next states s_{t+1}
            np.zeros(M * N * (T - 1)),                       # terminal flags (none terminal)
        )

        # Train DDPG
        losses = np.zeros((self.epochs, 2))
        pbar = tqdm(range(self.epochs))
        for epoch in pbar:
            if (self.pol.memory.nb_entries >= self.pol.batch_size
                    and epoch % self.param_noise_adaption_interval == 0):
                self.pol.adapt_param_noise()

            losses[epoch] = self.pol.train()
            self.pol.update_target_net()

            pbar.set_description("Loss: %.6f/%.6f" %
                                 (losses[epoch, 0], losses[epoch, 1]))

        # Visualize training loss
        from gps.visualization import visualize_loss
        visualize_loss(
            self._data_files_dir + 'plot_gps_training-%02d' %
            (self.iteration_count),
            losses,
            labels=['critic', 'actor'],
        )
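
The reshaping above turns the (M, N, T)-shaped trajectory arrays into
M * N * (T - 1) flat transitions: states and actions come from steps 0..T-2,
next states from steps 1..T-1, and rewards are the negated per-step costs.
A minimal standalone sketch of the same shape arithmetic (all sizes below
are made up for illustration):

import numpy as np

M, N, T, dX, dU = 2, 3, 5, 4, 2      # conditions, samples, horizon, state/action dims
X = np.random.randn(M, N, T, dX)     # state trajectories
U = np.random.randn(M, N, T, dU)     # action trajectories
cs = np.random.randn(M, N, T)        # per-step costs

n = M * N * (T - 1)                  # one transition per consecutive step pair
s = X[:, :, :-1].reshape(n, dX)      # s_t
a = U[:, :, :-1].reshape(n, dU)      # a_t
r = -cs[:, :, :-1].reshape(n)        # reward = negated cost
s_next = X[:, :, 1:].reshape(n, dX)  # s_{t+1}

assert s.shape == s_next.shape == (n, dX) and a.shape == (n, dU)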
Example #2
    def update(self, X, mu, prc, initial_policy=False, **kwargs):
        """Trains the GPS model on the dataset."""
        M, N, T = X.shape[:3]
        N_train = M * N * (T - 1)

        # Reshape inputs.
        X = X[:, :, :-1].reshape((N_train, self.dX))
        mu = mu[:, :, :-1].reshape((N_train, self.dU))
        prc = np.reshape(np.repeat(prc[:, None, :-1], N, axis=1),
                         (N_train, self.dU, self.dU))

        # Normalize precisions so their mean trace equals dU
        prc = prc * (self.dU / np.mean(np.trace(prc, axis1=-2, axis2=-1)))

        # Reset optimizer
        self.sess.run(self.optimizer_reset_op)

        # Initialize dataset iterator
        self.sess.run(self.iterator.initializer,
                      feed_dict={
                          self.state_data: X,
                          self.action_data: mu,
                          self.precision_data: prc
                      })

        batches_per_epoch = int(N_train / self.batch_size)
        assert batches_per_epoch * self.batch_size == N_train, \
            '%d * %d != %d' % (batches_per_epoch, self.batch_size, N_train)
        epochs = 10 if initial_policy else self.epochs
        losses = np.zeros((epochs, 2))
        pbar = tqdm(range(epochs))
        for epoch in pbar:
            for i in range(batches_per_epoch):
                losses[epoch] += self.sess.run(
                    [self.solver_op, self.loss_kl, self.loss_reg],
                    feed_dict={self.is_training: True},
                )[1:]
            losses[epoch] /= batches_per_epoch
            pbar.set_description("GPS Loss: %.6f" % (np.sum(losses[epoch])))

        # Visualize training loss
        from gps.visualization import visualize_loss
        visualize_loss(
            self._data_files_dir + 'plot_gps_training-%02d' %
            (self.iteration_count),
            losses,
            labels=['KL divergence', 'L2 reg'],
        )

        # Optimize variance.
        A = (np.sum(prc, 0) +
             2 * N * T * self._hyperparams['ent_reg'] * np.ones(
                 (self.dU, self.dU))) / N_train

        self.var = 1 / np.diag(A)
        self.policy.chol_pol_covar = np.diag(np.sqrt(self.var))
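
The closing block computes the new policy covariance in closed form: A is
the average of the supplied precision matrices plus an entropy-regularization
term, and the per-dimension variance is the inverse of its diagonal. A
standalone sketch of that update with dummy values (dU, N, T, N_train, and
ent_reg are placeholders, not values from the class):

import numpy as np

dU, N, T = 2, 4, 31                  # made-up action dim, samples, horizon
N_train = 120                        # made-up number of training points
ent_reg = 1e-3                       # placeholder entropy regularizer
prc = np.tile(np.eye(dU) * 5.0, (N_train, 1, 1))  # dummy precision matrices

A = (np.sum(prc, 0) +
     2 * N * T * ent_reg * np.ones((dU, dU))) / N_train
var = 1 / np.diag(A)                    # per-dimension action variance
chol_pol_covar = np.diag(np.sqrt(var))  # diagonal Cholesky factor of the covariance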
Example #3
    def update(self, X, mu, prc, _):
        """
        Trains a GPS model on the dataset
        """
        N, T, _ = X.shape

        # Reshape inputs.
        X = X.reshape((N * T, self.dX))
        mu = mu.reshape((N * T, self.dU))
        prc = prc.reshape((N * T, self.dU, self.dU))

        # Normalize X, but only compute normalization at the beginning.
        if self.scaler is None:
            self.scaler = StandardScaler().fit(X)
        X = self.scaler.transform(X)

        # Create dataset
        with self.graph.as_default():
            # Shuffle with a buffer covering all N * T samples.
            dataset = tf.data.Dataset.from_tensor_slices(
                (X, mu, prc)).shuffle(N * T).batch(self.batch_size)
            iterator = dataset.make_initializable_iterator()
            next_element = iterator.get_next()

        # Reset optimizer
        self.sess.run(self.optimizer_reset_op)

        batches_per_epoch = int(N * T / self.batch_size)
        assert batches_per_epoch * self.batch_size == N * T, \
            'N*T=%d, batch_size=%d, batches_per_epoch=%d' % (
                N * T, self.batch_size, batches_per_epoch)

        losses = np.zeros((self.epochs, 2))
        pbar = tqdm(range(self.epochs))
        for epoch in pbar:
            # Initialize dataset iterator
            self.sess.run(iterator.initializer)

            for i in range(batches_per_epoch):
                batch_X, batch_mu, batch_prc = self.sess.run(next_element)

                losses[epoch] += self.sess.run(
                    [self.solver_op, self.loss_kl, self.loss_reg],
                    feed_dict={
                        self.state_in: batch_X,
                        self.action_in: batch_mu,
                        self.precision_in: batch_prc
                    })[1:]
            losses[epoch] /= batches_per_epoch
            pbar.set_description("GPS Loss: {:.6f}".format(
                np.sum(losses[epoch])))

        # Visualize training loss
        from gps.visualization import visualize_loss
        visualize_loss(self._data_files_dir + 'plot_gps_training-%02d' %
                       (self.iteration_count),
                       losses,
                       labels=['KL divergence', 'L2 reg'])

        # Optimize variance.
        A = (np.mean(prc, axis=0) +
             2 * N * T * self._hyperparams['ent_reg'] * np.ones((self.dU, self.dU)))

        self.var = 1 / np.diag(A)
        self.policy.chol_pol_covar = np.diag(np.sqrt(self.var))
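
This variant builds the tf.data pipeline inside update() and re-initializes
the iterator at the start of every epoch, fetching batches explicitly and
feeding them back through feed_dict. A minimal TF1-style sketch of that
initializable-iterator loop (the data and sizes are made up; uses the
tf.compat.v1 API):

import numpy as np
import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

data = np.arange(12, dtype=np.float32).reshape(6, 2)  # dummy samples
dataset = tf.data.Dataset.from_tensor_slices(data).shuffle(6).batch(3)
iterator = dataset.make_initializable_iterator()
next_element = iterator.get_next()

with tf.Session() as sess:
    for epoch in range(2):
        sess.run(iterator.initializer)          # rewind and reshuffle each epoch
        try:
            while True:
                batch = sess.run(next_element)  # fetch one batch as numpy
        except tf.errors.OutOfRangeError:       # dataset exhausted
            pass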
Example #4
    def update(self, X, mu, prc, K, k, initial_policy=False, **kwargs):
        """Trains the MU model on the dataset."""
        M, N, T = X.shape[:3]
        N_ctr = M * (T - 1)

        X = X[:, :, :-1].transpose((0, 2, 1, 3)).reshape(N_ctr, N, self.dX)
        K = K[:, :-1].reshape(N_ctr, self.dU, self.dX)
        k = k[:, :-1].reshape(N_ctr, self.dU)
        prc = prc[:, :-1].reshape(N_ctr, self.dU, self.dU)

        # Fit a scaler for the feedback gains; its mean and scale are fed
        # into the graph below via K_center and K_scale.
        self.K_scaler = StandardScaler().fit(K.reshape(N_ctr, self.dU * self.dX))

        # Normalize precisions so their mean trace equals 10 * dU
        prc = prc * (10 * self.dU / np.mean(np.trace(prc, axis1=-2, axis2=-1)))

        # Reset optimizer
        self.sess.run(self.optimizer_reset_op)

        # Initialize dataset iterator
        self.sess.run(
            self.iterator.initializer,
            feed_dict={
                self.state_data: X,
                self.K_data: K,
                self.k_data: k,
                self.precision_data: prc,
            }
        )

        batches_per_epoch = int(N_ctr / self.batch_size)
        assert batches_per_epoch * self.batch_size == N_ctr, (
            '%d * %d != %d' % (batches_per_epoch, self.batch_size, N_ctr)
        )
        epochs = 10 if initial_policy else self.epochs
        losses = np.zeros((epochs, 3))
        pbar = tqdm(range(epochs))
        for epoch in pbar:
            for i in range(batches_per_epoch):
                losses[epoch] += self.sess.run(
                    [
                        self.solver_op,
                        self.loss_action,
                        self.loss_stabilizer,
                        self.loss_latent,
                    ],
                    feed_dict={
                        self.is_training: True,
                        self.K_scale: self.K_scaler.scale_.reshape(self.dU, self.dX),
                        self.K_center: self.K_scaler.mean_.reshape(self.dU, self.dX),
                    }
                )[1:]
            losses[epoch] /= batches_per_epoch
            pbar.set_description("Loss: %.6f/%.6f/%.6f" %
                                 (losses[epoch, 0], losses[epoch, 1],
                                  losses[epoch, 2]))

        # Visualize training loss
        from gps.visualization import visualize_loss
        visualize_loss(
            self._data_files_dir + 'plot_gps_training-%02d' % (self.iteration_count),
            losses,
            labels=['Action Estimator', 'Stabilizer', 'Latent']
        )
        self.sample_latent_space(X, N_test=50)

        # Optimize variance.
        A = (np.mean(prc, axis=0) +
             2 * N * T * self._hyperparams['ent_reg'] * np.ones((self.dU, self.dU)))

        self.var = 1 / np.diag(A)
        self.policy.chol_pol_covar = np.diag(np.sqrt(self.var))
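
The gain standardization in Example #4 works by fitting a StandardScaler on
the flattened (dU * dX) gain vectors and reshaping its per-component mean and
standard deviation back to (dU, dX) before feeding them into the graph. A
standalone sketch of that step (shapes are illustrative, not the class's real
dimensions):

import numpy as np
from sklearn.preprocessing import StandardScaler

N_ctr, dU, dX = 60, 2, 4              # made-up controller count and dims
K = np.random.randn(N_ctr, dU, dX)    # time-varying feedback gains

scaler = StandardScaler().fit(K.reshape(N_ctr, dU * dX))
K_center = scaler.mean_.reshape(dU, dX)  # per-entry mean
K_scale = scaler.scale_.reshape(dU, dX)  # per-entry standard deviation

K_std = (K - K_center) / K_scale      # standardized gains (broadcast over N_ctr)
assert K_std.shape == (N_ctr, dU, dX)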