Code example #1
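Both snippets on this page reference module-level names (np, plt, os, logger, timeit, utils, ModelBasedPolicy) without showing the imports. A plausible header, with the project-local module names inferred from the call sites (treat them as assumptions), is:

import os

import numpy as np
import matplotlib.pyplot as plt

from model_based_policy import ModelBasedPolicy
import utils
from logger import logger
from timer import timeit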
class ModelBasedRL(object):
    def __init__(self,
                 env,
                 num_init_random_rollouts=10,
                 max_rollout_length=500,
                 num_onpolicy_iters=10,
                 num_onpolicy_rollouts=10,
                 training_epochs=60,
                 training_batch_size=512,
                 render=False,
                 mpc_horizon=15,
                 num_random_action_selection=4096,
                 nn_layers=1):
        self._env = env
        self._max_rollout_length = max_rollout_length
        self._num_onpolicy_iters = num_onpolicy_iters
        self._num_onpolicy_rollouts = num_onpolicy_rollouts
        self._training_epochs = training_epochs
        self._training_batch_size = training_batch_size
        self._render = render

        logger.info('Gathering random dataset')
        self._random_dataset = self._gather_rollouts(utils.RandomPolicy(env),
                                                     num_init_random_rollouts)

        logger.info('Creating policy')
        self._policy = ModelBasedPolicy(
            env,
            self._random_dataset,
            horizon=mpc_horizon,
            num_random_action_selection=num_random_action_selection)

        timeit.reset()
        timeit.start('total')

    def _gather_rollouts(self, policy, num_rollouts):
        dataset = utils.Dataset()

        for _ in range(num_rollouts):
            state = self._env.reset()
            done = False
            t = 0
            while not done:
                if self._render:
                    timeit.start('render')
                    self._env.render()
                    timeit.stop('render')
                timeit.start('get action')
                action = policy.get_action(state)
                timeit.stop('get action')
                timeit.start('env step')
                next_state, reward, done, _ = self._env.step(action)
                timeit.stop('env step')
                done = done or (t >= self._max_rollout_length)
                dataset.add(state, action, next_state, reward, done)

                state = next_state
                t += 1

        return dataset

    def _train_policy(self, dataset):
        """
        Train the model-based policy

        implementation details:
            (a) Train for self._training_epochs number of epochs
            (b) The dataset.random_iterator(...) method will iterate through the dataset once in a random order
            (c) Use self._training_batch_size for iterating through the dataset
            (d) Keep track of the loss values by appending them to the losses array
        """
        timeit.start('train policy')

        losses = []
        ### PROBLEM 1
        ### YOUR CODE HERE
        # (a) Train for self._training_epochs number of epochs
        for _ in range(self._training_epochs):
            # (b) The dataset.random_iterator(...) method will iterate through the dataset once in a random order
            # (c) Use self._training_batch_size for iterating through the dataset
            epoch_losses = []

            for states, actions, next_states, _, _ in \
                    dataset.random_iterator(self._training_batch_size):
                loss = self._policy.train_step(states, actions, next_states)

                epoch_losses.append(loss)
            # (d) Keep track of the loss values by appending them to the losses array
            losses.append(np.mean(epoch_losses))

        logger.record_tabular('TrainingLossStart', losses[0])
        logger.record_tabular('TrainingLossFinal', losses[-1])

        timeit.stop('train policy')

    def _log(self, dataset):
        timeit.stop('total')
        dataset.log()
        logger.dump_tabular(print_func=logger.info)
        logger.debug('')
        for line in str(timeit).split('\n'):
            logger.debug(line)
        timeit.reset()
        timeit.start('total')

    def run_q1(self):
        """
        Train on a dataset, and see how good the learned dynamics model's predictions are.

        implementation details:
            (i) Train using the self._random_dataset
            (ii) For each rollout, use the initial state and all actions to predict the future states.
                 Store these predicted states in the pred_states list.
                 NOTE: you should *not* be using any of the states in states[1:]. Only use states[0]
            (iii) After predicting the future states, we have provided plotting code that plots the actual vs
                  predicted states and saves these to the experiment's folder. You do not need to modify this code.
        """
        logger.info('Training policy....')
        ### PROBLEM 1
        ### YOUR CODE HERE
        # (i) Train using the self._random_dataset
        self._train_policy(self._random_dataset)

        logger.info('Evaluating predictions...')
        for r_num, (states, actions, _, _, _) in enumerate(
                self._random_dataset.rollout_iterator()):
            pred_states = []

            ### PROBLEM 1
            ### YOUR CODE HERE
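            # Open-loop prediction: start from the true initial state and
            # roll the learned dynamics model forward through every action,
            # never reading states[1:].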
            pred_state = states[0]

            for action in actions:
                pred_states.append(pred_state)
                pred_state = self._policy.predict(pred_state, action)

            states = np.asarray(states)
            pred_states = np.asarray(pred_states)

            state_dim = states.shape[1]
            rows = int(np.sqrt(state_dim))
            cols = state_dim // rows
            f, axes = plt.subplots(rows, cols, figsize=(3 * cols, 3 * rows))
            f.suptitle(
                'Model predictions (red) versus ground truth (black) for open-loop predictions'
            )
            for i, (ax, state_i, pred_state_i) in enumerate(
                    zip(axes.ravel(), states.T, pred_states.T)):
                ax.set_title('state {0}'.format(i))
                ax.plot(state_i, color='k')
                ax.plot(pred_state_i, color='r')
            plt.tight_layout()
            plt.subplots_adjust(top=0.90)
            f.savefig(os.path.join(logger.dir,
                                   'prediction_{0:03d}.jpg'.format(r_num)),
                      bbox_inches='tight')

        logger.info('All plots saved to folder')

    def run_q2(self):
        """
        Train the model-based policy on a random dataset, and evaluate the performance of the resulting policy
        """
        logger.info('Random policy')
        self._log(self._random_dataset)

        logger.info('Training policy....')
        ### PROBLEM 2
        ### YOUR CODE HERE
        # Train the model-based policy on a random dataset...
        self._train_policy(self._random_dataset)

        logger.info('Evaluating policy...')
        ### PROBLEM 2
        ### YOUR CODE HERE
        # ...and evaluate the performance of the resulting policy
        eval_dataset = self._gather_rollouts(self._policy, 10)

        logger.info('Trained policy')
        self._log(eval_dataset)

    def run_q3(self):
        """
        Starting with the random dataset, train the policy on the dataset, gather rollouts with the policy,
        append the new rollouts to the existing dataset, and repeat
        """
        dataset = self._random_dataset

        itr = -1
        logger.info('Iteration {0}'.format(itr))
        logger.record_tabular('Itr', itr)
        self._log(dataset)

        for itr in range(self._num_onpolicy_iters + 1):
            logger.info('Iteration {0}'.format(itr))
            logger.record_tabular('Itr', itr)

            ### PROBLEM 3
            ### YOUR CODE HERE
            logger.info('Training policy...')
            self._train_policy(dataset)

            ### PROBLEM 3
            ### YOUR CODE HERE
            logger.info('Gathering rollouts...')
            new_dataset = self._gather_rollouts(self._policy,
                                                self._num_onpolicy_rollouts)

            ### PROBLEM 3
            ### YOUR CODE HERE
            logger.info('Appending dataset...')
            dataset.append(new_dataset)

            self._log(new_dataset)
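A minimal driver for this class might look as follows; the environment id and the old four-tuple gym step API are assumptions based on the env.step call above, not part of the snippet:

import gym

env = gym.make('HalfCheetah-v1')
mbrl = ModelBasedRL(env, num_init_random_rollouts=10)
mbrl.run_q1()  # check open-loop dynamics-model predictions
mbrl.run_q2()  # evaluate the MPC policy trained on random data
mbrl.run_q3()  # on-policy data-aggregation loop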
Code example #2
File: model_based_rl.py  Project: buntyke/homework
class ModelBasedRL(object):
    def __init__(self,
                 env,
                 num_init_random_rollouts=10,
                 max_rollout_length=500,
                 num_onpolicy_iters=10,
                 num_onpolicy_rollouts=10,
                 training_epochs=60,
                 training_batch_size=512,
                 render=False,
                 mpc_horizon=15,
                 num_random_action_selection=4096,
                 nn_layers=1):

        # initialize class parameters
        self._env = env
        self._render = render
        self._training_epochs = training_epochs
        self._num_onpolicy_iters = num_onpolicy_iters
        self._max_rollout_length = max_rollout_length
        self._training_batch_size = training_batch_size
        self._num_onpolicy_rollouts = num_onpolicy_rollouts

        # create initial random dataset
        logger.info('Gathering random dataset')
        self._random_dataset = self._gather_rollouts(utils.RandomPolicy(env),
                                                     num_init_random_rollouts)

        # create policy instance
        logger.info('Creating policy')
        self._policy = ModelBasedPolicy(
            env,
            self._random_dataset,
            horizon=mpc_horizon,
            num_random_action_selection=num_random_action_selection)

        # start timing
        timeit.reset()
        timeit.start('total')

    def _gather_rollouts(self, policy, num_rollouts):

        # initialize dataset class
        dataset = utils.Dataset()

        # loop for num_rollouts
        for _ in range(num_rollouts):

            # reset gym env
            t = 0
            done = False
            state = self._env.reset()

            # generate gym rollout
            while not done:
                # perform rendering
                if self._render:
                    timeit.start('render')
                    self._env.render()
                    timeit.stop('render')

                # get action using MPC
                timeit.start('get action')
                action = policy.get_action(state)
                timeit.stop('get action')

                # step through environment
                timeit.start('env step')
                next_state, reward, done, _ = self._env.step(action)
                timeit.stop('env step')

                # add experience to dataset
                done = done or (t >= self._max_rollout_length)
                dataset.add(state, action, next_state, reward, done)

                # update state variable
                t += 1
                state = next_state

        return dataset

    def _train_policy(self, dataset):

        # timing for policy training
        timeit.start('train policy')

        losses = []

        # loop for self._training_epochs
        for _ in range(self._training_epochs):

            # iterate over dataset
            for states, actions, next_states, _, _ in \
                    dataset.random_iterator(self._training_batch_size):

                # take a gradient step on the dynamics model and record the batch loss
                loss = self._policy.train_step(states, actions, next_states)
                losses.append(loss)

        # perform logging
        logger.record_tabular('TrainingLossStart', losses[0])
        logger.record_tabular('TrainingLossFinal', losses[-1])
        timeit.stop('train policy')

    def _log(self, dataset):
        # stop timing
        timeit.stop('total')

        # print logging information
        dataset.log()
        logger.dump_tabular(print_func=logger.info)
        logger.debug('')
        for line in str(timeit).split('\n'):
            logger.debug(line)

        # reset timing
        timeit.reset()
        timeit.start('total')

    def model_eval(self):

        logger.info('Training policy....')
        self._train_policy(self._random_dataset)

        logger.info('Evaluating predictions...')
        for r_num, (states, actions, _, _, _) in enumerate(
                self._random_dataset.rollout_iterator()):

            pred_states = []
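            # One-step predictions: each prediction starts from the
            # ground-truth state, unlike the open-loop rollout in the
            # previous example's run_q1.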
            for state, action in zip(states, actions):
                pred_states.append(self._policy.predict(state, action))

            states = np.asarray(states)
            pred_states = np.asarray(pred_states)

            state_dim = min(states.shape[1], 20)
            rows = int(np.sqrt(state_dim))
            cols = state_dim // rows

            f, axes = plt.subplots(rows, cols, figsize=(3 * cols, 3 * rows))
            f.suptitle('Model predictions (red) versus ground truth (black)')

            for i, (ax, state_i, pred_state_i) in enumerate(
                    zip(axes.ravel(), states[:, :state_dim].T,
                        pred_states[:, :state_dim].T)):

                ax.set_title('state {0}'.format(i))
                ax.plot(state_i, color='k')
                ax.plot(pred_state_i, color='r')
            plt.tight_layout()
            plt.subplots_adjust(top=0.90)
            f.savefig(os.path.join(logger.dir,
                                   'prediction_{0:03d}.jpg'.format(r_num)),
                      bbox_inches='tight')

        logger.info('All plots saved to folder')

    def train(self):
        dataset = self._random_dataset

        itr = -1
        logger.info('Iteration {0}'.format(itr))
        logger.record_tabular('Itr', itr)
        self._log(dataset)

        for itr in range(self._num_onpolicy_iters + 1):
            logger.info('Iteration {0}'.format(itr))
            logger.record_tabular('Itr', itr)

            logger.info('Training policy...')
            self._train_policy(dataset)

            logger.info('Gathering rollouts...')
            new_dataset = self._gather_rollouts(self._policy,
                                                self._num_onpolicy_rollouts)

            logger.info('Appending dataset...')
            dataset.append(new_dataset)

            self._log(new_dataset)
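Usage mirrors the first example, with renamed entry points; the environment id is again an assumption:

import gym

env = gym.make('HalfCheetah-v1')
mbrl = ModelBasedRL(env)
mbrl.model_eval()  # plot one-step dynamics-model predictions
mbrl.train()       # train with on-policy data aggregation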