Exemple #1
0
    def train_policy(self, dataset):
        """
        Train the model-based policy and log the first and last epoch losses.

        Runs ``self.training_epochs`` passes.  In each pass every batch drawn
        from ``dataset.random_iterator(self.training_batch_size)`` is fed to
        ``self.policy.train_step`` and the returned losses are averaged into
        a single value for that epoch.

        :param dataset: object exposing ``random_iterator(batch_size)`` that
            yields 5-tuples; the first three items are passed to
            ``train_step`` as states, actions and next states, the last two
            are ignored here.
        :raises ValueError: if an epoch yields no batches, so there is no
            loss to average.
        :raises IndexError: if ``self.training_epochs`` is 0, because no
            epoch loss exists to log.
        """
        timeit.start('train policy')

        losses = []
        for _ in range(self.training_epochs):
            loss_total = 0.0
            num_data = 0

            batches = dataset.random_iterator(self.training_batch_size)
            for states, actions, next_states, _, _ in batches:
                loss_total += self.policy.train_step(
                    states, actions, next_states)
                num_data += 1

            if num_data == 0:
                raise ValueError(
                    'dataset.random_iterator yielded no batches; '
                    'cannot compute an epoch loss')

            # Average the accumulated total, not just the last batch's loss.
            losses.append(loss_total / num_data)

        logger.record_tabular('TrainingLossStart', losses[0])
        logger.record_tabular('TrainingLossFinal', losses[-1])

        timeit.stop('train policy')
    def _train_policy(self, dataset):
        """
        Fit the model-based policy on ``dataset``.

        For each of ``self._training_epochs`` epochs, every batch produced by
        ``dataset.random_iterator(self._training_batch_size)`` is handed to
        ``self._policy.train_step`` and the returned loss is stored.  The
        first and last per-batch losses are then written to the tabular log.
        """
        timeit.start('train policy')

        losses = []
        for _epoch in range(self._training_epochs):
            batches = dataset.random_iterator(self._training_batch_size)
            # Each batch is a 5-tuple; only its first three fields are used.
            losses.extend(
                self._policy.train_step(s, a, s_next)
                for s, a, s_next, _, _ in batches
            )

        logger.record_tabular('TrainingLossStart', losses[0])
        logger.record_tabular('TrainingLossFinal', losses[-1])

        timeit.stop('train policy')
Exemple #3
0
    def _train_policy(self, dataset):
        """
        Run the policy's training step over the dataset for several epochs.

        Each epoch draws batches from
        ``dataset.random_iterator(self._training_batch_size)``; the first
        three entries of every batch (states, actions, next states) go to
        ``self._policy.train_step`` and the resulting loss is recorded.
        The first and last recorded losses are logged both as an info
        message and as tabular entries.
        """
        timeit.start('train policy')

        losses = []
        for _ in range(self._training_epochs):
            for batch in dataset.random_iterator(self._training_batch_size):
                # Slice instead of unpacking: only the leading three items
                # of the batch are needed for a training step.
                states, actions, next_states = batch[:3]
                losses.append(
                    self._policy.train_step(states, actions, next_states))

        first_loss, last_loss = losses[0], losses[-1]
        logger.info('loss start to end: %s, %s' % (first_loss, last_loss))
        logger.record_tabular('TrainingLossStart', first_loss)
        logger.record_tabular('TrainingLossFinal', last_loss)

        timeit.stop('train policy')
    def _train_policy(self, dataset):
        """
        Train the model-based policy, reporting progress once per epoch.

        For every epoch (``self._training_epochs`` in total) the 1-based
        epoch number is logged, each batch from
        ``dataset.random_iterator(self._training_batch_size)`` is passed to
        ``self._policy.train_step``, and the most recent batch loss is
        logged when the epoch ends.  The first and last per-batch losses
        are finally written to the tabular log.
        """
        timeit.start('train policy')

        losses = []
        for epoch_idx in range(self._training_epochs):
            logger.info('Epoch %i' % (epoch_idx + 1))

            batches = dataset.random_iterator(self._training_batch_size)
            for states, actions, next_states, _, _ in batches:
                losses.append(
                    self._policy.train_step(states, actions, next_states))

            # Report the latest batch loss once the epoch is done.
            latest_loss = losses[-1]
            logger.info('\tLoss: {:.3f}'.format(latest_loss))

        logger.record_tabular('TrainingLossStart', losses[0])
        logger.record_tabular('TrainingLossFinal', losses[-1])

        timeit.stop('train policy')
Exemple #5
0
    def _train_policy(self, dataset):
        """
        Train the model-based policy for ``self._training_epochs`` epochs.

        Batches come from
        ``dataset.random_iterator(self._training_batch_size)``.  The first
        three fields of each batch are passed by keyword to
        ``self._policy.train_step`` and every returned loss is kept.  The
        first and last losses are written to the tabular log.
        """
        timeit.start('train policy')

        losses = []
        for _ in range(self._training_epochs):
            batch_iter = dataset.random_iterator(self._training_batch_size)
            for batch_states, batch_actions, batch_next_states, _, _ in batch_iter:
                step_loss = self._policy.train_step(
                    states=batch_states,
                    actions=batch_actions,
                    next_states=batch_next_states)
                losses.append(step_loss)

        logger.record_tabular('TrainingLossStart', losses[0])
        logger.record_tabular('TrainingLossFinal', losses[-1])

        timeit.stop('train policy')
    def _train_policy(self, dataset):
        """
        Train the model-based policy and track the mean loss of each epoch.

        In each of the ``self._training_epochs`` epochs, all batches from
        ``dataset.random_iterator(self._training_batch_size)`` are passed to
        ``self._policy.train_step``; the ``np.mean`` of that epoch's batch
        losses is stored.  The first and last epoch means are written to the
        tabular log.
        """
        timeit.start('train policy')

        losses = []
        for _ in range(self._training_epochs):
            batches = dataset.random_iterator(self._training_batch_size)
            batch_losses = [
                self._policy.train_step(states, actions, next_states)
                for states, actions, next_states, _, _ in batches
            ]
            # One entry per epoch: the average over that epoch's batches.
            losses.append(np.mean(batch_losses))

        logger.record_tabular('TrainingLossStart', losses[0])
        logger.record_tabular('TrainingLossFinal', losses[-1])

        timeit.stop('train policy')
    def __init__(self,
                 env,
                 num_init_random_rollouts=10,
                 max_rollout_length=500,
                 num_onplicy_iters=10,
                 num_onpolicy_rollouts=10,
                 training_epochs=60,
                 training_batch_size=512,
                 render=False,
                 mpc_horizon=15,
                 num_random_action_selection=4096,
                 nn_layers=1):
        """
        Store the run settings, gather an initial random dataset, build the
        model-based policy and start the 'total' timer.

        ``num_onplicy_iters`` (spelling kept for caller compatibility) is
        stored as ``self._num_onpolicy_iters``.  ``mpc_horizon`` and
        ``num_random_action_selection`` are only forwarded to
        ``ModelBasedPolicy``.
        """
        # NOTE(review): nn_layers is accepted but not used in this constructor.

        # Rollout settings; assigned before _gather_rollouts is invoked below.
        self._env = env
        self._render = render
        self._max_rollout_length = max_rollout_length
        self._num_onpolicy_rollouts = num_onpolicy_rollouts
        self._num_onpolicy_iters = num_onplicy_iters

        # Training settings.
        self._training_batch_size = training_batch_size
        self._training_epochs = training_epochs

        logger.info('Gathering random dataset')
        random_policy = utils.RandomPolicy(env)
        self._random_dataset = self._gather_rollouts(
            random_policy, num_init_random_rollouts)

        logger.info('Creating policy')
        self._policy = ModelBasedPolicy(
            env,
            self._random_dataset,
            horizon=mpc_horizon,
            num_random_action_selection=num_random_action_selection)

        # Restart timing so 'total' measures from the end of construction.
        timeit.reset()
        timeit.start('total')
    def _train_policy(self, dataset):
        """
        Train the model-based policy on ``dataset``.

        Repeats for ``self._training_epochs`` epochs: each batch from
        ``dataset.random_iterator(self._training_batch_size)`` is used for
        one ``self._policy.train_step`` call, and all per-batch losses are
        collected in order.  The first and last of them are written to the
        tabular log.
        """
        timeit.start('train policy')

        losses = []
        for _ in range(self._training_epochs):
            epoch_batches = dataset.random_iterator(self._training_batch_size)
            losses += [
                self._policy.train_step(states, actions, next_states)
                for states, actions, next_states, _, _ in epoch_batches
            ]

        logger.record_tabular('TrainingLossStart', losses[0])
        logger.record_tabular('TrainingLossFinal', losses[-1])

        timeit.stop('train policy')
    def _train_policy(self, dataset):
        """
        Train the model-based policy and save a plot of the training losses.

        For ``self._training_epochs`` epochs, every batch from
        ``dataset.random_iterator(self._training_batch_size)`` is passed to
        ``self._policy.train_step`` and the returned loss is recorded.  The
        first and last losses are written to the tabular log, and the whole
        loss curve is saved as ``training.png`` inside ``logger.dir``.

        :param dataset: object exposing ``random_iterator(batch_size)`` that
            yields 5-tuples whose first three items are states, actions and
            next states.
        :raises IndexError: if no training step was run, because there is
            no loss to log.
        """
        timeit.start('train policy')

        losses = []
        for _ in range(self._training_epochs):
            # A plain for-loop replaces the manual next()/sentinel loop and
            # also accepts any iterable, not only true iterators.
            for state, action, next_state, _, _ in dataset.random_iterator(
                    self._training_batch_size):
                losses.append(
                    self._policy.train_step(state, action, next_state))

        logger.record_tabular('TrainingLossStart', losses[0])
        logger.record_tabular('TrainingLossFinal', losses[-1])

        timeit.stop('train policy')

        # Plotting is deliberately outside the timed section.  Close the
        # figure afterwards so repeated calls do not accumulate open figures.
        fig = plt.figure()
        try:
            plt.plot(losses)
            plt.savefig(os.path.join(logger.dir, 'training.png'))
        finally:
            plt.close(fig)
    def _train_policy(self, dataset):
        """
        Train the model-based policy.

        Runs ``self._training_epochs`` epochs.  Within an epoch, each batch
        yielded by ``dataset.random_iterator(self._training_batch_size)`` is
        unpacked and its states, actions and next states are given to
        ``self._policy.train_step``; the loss of every step is stored.  The
        first and last stored losses are written to the tabular log.
        """
        timeit.start('train policy')

        losses = []
        for _ in range(self._training_epochs):
            for batch in dataset.random_iterator(self._training_batch_size):
                # A batch has five fields; the trailing two are unused here.
                states, actions, next_states, _, _ = batch
                losses.append(
                    self._policy.train_step(states, actions, next_states))

        logger.record_tabular('TrainingLossStart', losses[0])
        logger.record_tabular('TrainingLossFinal', losses[-1])

        timeit.stop('train policy')
 def _log(self, dataset):
     """
     Emit the logs for the finished iteration and restart the timer.

     Stops the 'total' timer, lets ``dataset`` log itself, dumps the
     tabular log through ``logger.info``, writes the string form of
     ``timeit`` line by line at debug level, then resets timing and starts
     'total' again.
     """
     timeit.stop('total')

     dataset.log()
     logger.dump_tabular(print_func=logger.info)

     # Blank debug line first, then the timing report one line at a time.
     logger.debug('')
     timing_report = str(timeit)
     for report_line in timing_report.split('\n'):
         logger.debug(report_line)

     timeit.reset()
     timeit.start('total')
Exemple #12
0
    def _log(self, dataset):
        """
        Flush logging output for the current iteration and restart timing.

        The 'total' timer is stopped before anything is printed.  The
        dataset logs itself, the tabular log is dumped via ``logger.info``,
        and ``str(timeit)`` is written at debug level one line per call.
        Timing is then reset and 'total' is started again.
        """
        timeit.stop('total')

        dataset.log()
        logger.dump_tabular(print_func=logger.info)
        logger.debug('')
        timing_lines = str(timeit).split('\n')
        for entry in timing_lines:
            logger.debug(entry)

        # Start a fresh timing window.
        timeit.reset()
        timeit.start('total')
    def _debug_rollout_and_record(self, policy, num_rollouts):
        """
        Roll out ``policy`` in the environment, plot each trajectory and
        return the collected transitions.

        For each rollout the environment is reset and stepped until it
        reports done or the step counter reaches
        ``self._max_rollout_length``.  The counter is checked before it is
        incremented, so a rollout can hold up to
        ``self._max_rollout_length + 1`` transitions.
        ``policy.get_action(state, True)`` must return a pair; the second
        item is collected next to the observed next state (presumably the
        policy's predicted next state).  After each rollout both sequences
        are converted to arrays and passed to ``self._debug_plot_states``
        with the rollout index.

        Returns the ``utils.Dataset`` holding every transition added.
        """
        dataset = utils.Dataset()

        for rollout_idx in range(num_rollouts):
            state = self._env.reset()
            # Both trajectories start from the same initial state.
            observed = [state]
            predicted = [state]
            step = 0
            done = False

            while not done:
                if self._render:
                    timeit.start('render')
                    self._env.render()
                    timeit.stop('render')

                timeit.start('get action')
                action, predicted_next = policy.get_action(state, True)
                timeit.stop('get action')

                timeit.start('env step')
                next_state, reward, env_done, _ = self._env.step(action)
                timeit.stop('env step')

                # Stop on the environment's signal or at the length cap.
                done = env_done or (step >= self._max_rollout_length)
                dataset.add(state, action, next_state, reward, done)

                predicted.append(predicted_next)
                observed.append(next_state)
                state = next_state
                step += 1

            self._debug_plot_states(
                np.array(observed), np.array(predicted), rollout_idx)

        return dataset
    def _gather_rollouts(self, policy, num_rollouts):
        """
        Collect ``num_rollouts`` rollouts of ``policy`` into a new dataset.

        Each rollout resets the environment and steps it with
        ``policy.get_action(state)`` until the environment reports done or
        the step counter reaches ``self._max_rollout_length``.  Every
        transition is added to the dataset.  Rendering, action selection and
        environment steps are timed separately.

        Returns the filled ``utils.Dataset``.
        """
        dataset = utils.Dataset()

        for _ in range(num_rollouts):
            state, done, step = self._env.reset(), False, 0

            while not done:
                if self._render:
                    timeit.start('render')
                    self._env.render()
                    timeit.stop('render')

                timeit.start('get action')
                action = policy.get_action(state)
                timeit.stop('get action')

                timeit.start('env step')
                next_state, reward, env_done, _ = self._env.step(action)
                timeit.stop('env step')

                # The length cap is tested before the counter is bumped.
                done = env_done or (step >= self._max_rollout_length)
                dataset.add(state, action, next_state, reward, done)

                step += 1
                state = next_state

        return dataset
    def _gather_rollouts(self, policy, num_rollouts):
        """
        Run ``policy`` for ``num_rollouts`` rollouts and return the data.

        Each rollout starts from ``self._env.reset()`` and continues until
        the environment reports done or the step counter reaches
        ``self._max_rollout_length``.  The counter is checked before it is
        incremented, so a rollout can contain up to
        ``self._max_rollout_length + 1`` transitions.  Progress is printed
        every 100 steps and after each rollout.

        Returns a ``utils.Dataset`` holding all transitions.
        """
        dataset = utils.Dataset()

        for rollout_idx in range(num_rollouts):
            state = self._env.reset()
            steps_taken = 0
            finished = False

            while not finished:
                if self._render:
                    timeit.start('render')
                    self._env.render()
                    timeit.stop('render')

                timeit.start('get action')
                action = policy.get_action(state)
                timeit.stop('get action')

                # The environment supplies the next state and reward.
                timeit.start('env step')
                next_state, reward, env_done, _ = self._env.step(action)
                timeit.stop('env step')

                finished = env_done or (
                    steps_taken >= self._max_rollout_length)
                dataset.add(state, action, next_state, reward, finished)

                state = next_state
                steps_taken += 1
                if steps_taken % 100 == 0:
                    print('time step', steps_taken)

            print('rollout', rollout_idx)

        return dataset
Exemple #16
0
    def _train_policy(self, dataset):
        """
        Train the model-based policy and log the first and last batch loss.

        For each of ``self._training_epochs`` epochs, every batch from
        ``dataset.random_iterator(self._training_batch_size)`` is passed to
        ``self._policy.train_step``; the per-batch losses are collected in
        the order they were produced.
        """
        timeit.start('train policy')

        # One loss per training step, across all epochs.
        losses = [
            self._policy.train_step(states, actions, next_states)
            for _ in range(self._training_epochs)
            for states, actions, next_states, _, _ in
            dataset.random_iterator(self._training_batch_size)
        ]

        logger.record_tabular('TrainingLossStart', losses[0])
        logger.record_tabular('TrainingLossFinal', losses[-1])
        timeit.stop('train policy')
    def _train_policy(self, dataset):
        """
        Train the model-based policy and track the mean loss of each epoch.

        Prints the batch size and epoch count, then runs
        ``self._training_epochs`` epochs.  In each epoch every batch from
        ``dataset.random_iterator(self._training_batch_size)`` is passed to
        ``self._policy.train_step`` and the batch losses are averaged into
        one value.  The first and last epoch averages are written to the
        tabular log.

        :param dataset: object exposing ``random_iterator(batch_size)`` that
            yields 5-tuples whose first three items are states, actions and
            next states.
        :raises ValueError: if an epoch yields no batches, so there is no
            loss to average.
        :raises IndexError: if ``self._training_epochs`` is 0, because no
            epoch loss exists to log.
        """
        timeit.start('train policy')

        print('batch size is', self._training_batch_size)
        print('epoch size is', self._training_epochs)

        losses = []
        for _ in range(self._training_epochs):
            epoch_total = 0
            # Count batches explicitly: a loop variable read after the loop
            # is unbound when the iterator yields nothing.
            num_batches = 0
            for states, actions, next_states, _, _ in dataset.random_iterator(
                    self._training_batch_size):
                epoch_total += self._policy.train_step(
                    states, actions, next_states)
                num_batches += 1

            if num_batches == 0:
                raise ValueError(
                    'dataset.random_iterator yielded no batches; '
                    'cannot compute an epoch loss')

            losses.append(epoch_total / num_batches)

        # TO-DO: why not print out in q1?
        logger.record_tabular('TrainingLossStart', losses[0])
        logger.record_tabular('TrainingLossFinal', losses[-1])

        timeit.stop('train policy')
Exemple #18
0
    def _gather_rollouts(self, policy, num_rollouts):
        """
        Gather ``num_rollouts`` environment rollouts driven by ``policy``.

        Each rollout resets the environment, then repeatedly optionally
        renders, asks ``policy.get_action(state)`` for an action, steps the
        environment and stores the transition.  A rollout ends when the
        environment reports done or the step counter reaches
        ``self._max_rollout_length``.

        Returns the ``utils.Dataset`` containing every stored transition.
        """
        dataset = utils.Dataset()

        for _ in range(num_rollouts):
            state = self._env.reset()
            step = 0

            # The original loop always runs at least once, so an explicit
            # break on `done` is equivalent to `while not done`.
            while True:
                if self._render:
                    timeit.start('render')
                    self._env.render()
                    timeit.stop('render')

                timeit.start('get action')
                action = policy.get_action(state)
                timeit.stop('get action')

                timeit.start('env step')
                next_state, reward, env_done, _ = self._env.step(action)
                timeit.stop('env step')

                # Record the transition with the capped done flag.
                done = env_done or (step >= self._max_rollout_length)
                dataset.add(state, action, next_state, reward, done)

                state = next_state
                step += 1

                if done:
                    break

        return dataset