def train_policy(self, dataset):
    """ Trains the model-based policy. """
    timeit.start('train policy')

    losses = []
    for _ in range(self.training_epochs):
        loss_total = 0.0
        num_data = 0
        d = dataset.random_iterator(self.training_batch_size)
        for states, actions, next_states, _, _ in d:
            loss = self.policy.train_step(states, actions, next_states)
            loss_total += loss
            num_data += 1
        # record the average loss over this epoch (not just the last batch)
        losses.append(loss_total / num_data)

    logger.record_tabular('TrainingLossStart', losses[0])
    logger.record_tabular('TrainingLossFinal', losses[-1])

    timeit.stop('train policy')
def _train_policy(self, dataset):
    """
    Train the model-based policy

    implementation details:
        (a) Train for self._training_epochs number of epochs
        (b) The dataset.random_iterator(...) method will iterate through the
            dataset once in a random order
        (c) Use self._training_batch_size for iterating through the dataset
        (d) Keep track of the loss values by appending them to the losses array
    """
    timeit.start('train policy')

    losses = []
    ### PROBLEM 1
    ### YOUR CODE HERE
    for _ in range(self._training_epochs):
        for states, actions, next_states, _, _ in dataset.random_iterator(self._training_batch_size):
            losses.append(self._policy.train_step(states, actions, next_states))

    logger.record_tabular('TrainingLossStart', losses[0])
    logger.record_tabular('TrainingLossFinal', losses[-1])

    timeit.stop('train policy')
def _train_policy(self, dataset):
    """
    Train the model-based policy

    implementation details:
        (a) Train for self._training_epochs number of epochs
        (b) The dataset.random_iterator(...) method will iterate through the
            dataset once in a random order
        (c) Use self._training_batch_size for iterating through the dataset
        (d) Keep track of the loss values by appending them to the losses array
    """
    timeit.start('train policy')

    losses = []
    ### PROBLEM 1
    ### YOUR CODE HERE
    for ep in range(self._training_epochs):
        data_generator = dataset.random_iterator(self._training_batch_size)
        for i, batch_x in enumerate(data_generator):
            states, actions, next_states = batch_x[:3]
            loss = self._policy.train_step(states, actions, next_states)
            losses.append(loss)
            # logger.debug('%s/Epoch, it %s: Loss: %s' % (ep, i, loss))
    logger.info('loss start to end: %s, %s' % (losses[0], losses[-1]))

    logger.record_tabular('TrainingLossStart', losses[0])
    logger.record_tabular('TrainingLossFinal', losses[-1])

    timeit.stop('train policy')
def _train_policy(self, dataset):
    """
    Train the model-based policy

    implementation details:
        (a) Train for self._training_epochs number of epochs
        (b) The dataset.random_iterator(...) method will iterate through the
            dataset once in a random order
        (c) Use self._training_batch_size for iterating through the dataset
        (d) Keep track of the loss values by appending them to the losses array
    """
    timeit.start('train policy')

    losses = []
    # Added: Training policy iteration
    for epoch_num in range(self._training_epochs):
        logger.info('Epoch %i' % (epoch_num + 1))
        for batch_num, (states, actions, next_states, _, _) in enumerate(
                dataset.random_iterator(self._training_batch_size)):
            loss = self._policy.train_step(states, actions, next_states)
            losses.append(loss)
        logger.info('\tLoss: {:.3f}'.format(losses[-1]))

    logger.record_tabular('TrainingLossStart', losses[0])
    logger.record_tabular('TrainingLossFinal', losses[-1])

    timeit.stop('train policy')
def _train_policy(self, dataset):
    """
    Train the model-based policy

    implementation details:
        (a) Train for self._training_epochs number of epochs
        (b) The dataset.random_iterator(...) method will iterate through the
            dataset once in a random order
        (c) Use self._training_batch_size for iterating through the dataset
        (d) Keep track of the loss values by appending them to the losses array
    """
    timeit.start('train policy')

    losses = []
    ### PROBLEM 1
    ### YOUR CODE HERE
    for epoch in range(self._training_epochs):
        for state, action, next_state, _, _ in dataset.random_iterator(
                self._training_batch_size):
            loss = self._policy.train_step(states=state,
                                           actions=action,
                                           next_states=next_state)
            losses.append(loss)

    logger.record_tabular('TrainingLossStart', losses[0])
    logger.record_tabular('TrainingLossFinal', losses[-1])

    timeit.stop('train policy')
def _train_policy(self, dataset):
    """
    Train the model-based policy

    implementation details:
        (a) Train for self._training_epochs number of epochs
        (b) The dataset.random_iterator(...) method will iterate through the
            dataset once in a random order
        (c) Use self._training_batch_size for iterating through the dataset
        (d) Keep track of the loss values by appending them to the losses array
    """
    timeit.start('train policy')

    losses = []
    ### PROBLEM 1
    ### YOUR CODE HERE
    # (a) Train for self._training_epochs number of epochs
    for _ in range(self._training_epochs):
        # (b) dataset.random_iterator(...) iterates through the dataset once in a random order
        # (c) Use self._training_batch_size for iterating through the dataset
        epoch_losses = []
        for _, (states, actions, next_states, _, _) in enumerate(
                dataset.random_iterator(self._training_batch_size)):
            loss = self._policy.train_step(states, actions, next_states)
            epoch_losses.append(loss)
        # (d) Keep track of the loss values by appending them to the losses array
        losses.append(np.mean(epoch_losses))

    logger.record_tabular('TrainingLossStart', losses[0])
    logger.record_tabular('TrainingLossFinal', losses[-1])

    timeit.stop('train policy')
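All of the _train_policy variants above assume that dataset.random_iterator(batch_size) yields (states, actions, next_states, rewards, dones) batches and walks the dataset exactly once per call in a random order. The sketch below illustrates that assumed interface; the class name DatasetSketch and its attribute names are hypothetical stand-ins for the actual utils.Dataset implementation, which is not shown here.

import numpy as np

class DatasetSketch(object):
    """Hypothetical stand-in for utils.Dataset, showing only the assumed interface."""

    def __init__(self):
        self._states, self._actions, self._next_states = [], [], []
        self._rewards, self._dones = [], []

    def add(self, state, action, next_state, reward, done):
        # store one transition; mirrors the dataset.add(...) calls in _gather_rollouts
        self._states.append(state)
        self._actions.append(action)
        self._next_states.append(next_state)
        self._rewards.append(reward)
        self._dones.append(done)

    def random_iterator(self, batch_size):
        # shuffle once, then yield consecutive batches so a single call covers
        # the whole dataset exactly once in a random order
        arrays = [np.asarray(a) for a in (self._states, self._actions,
                                          self._next_states, self._rewards, self._dones)]
        order = np.random.permutation(len(self._states))
        for start in range(0, len(order), batch_size):
            idx = order[start:start + batch_size]
            yield tuple(a[idx] for a in arrays)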
def __init__(self,
             env,
             num_init_random_rollouts=10,
             max_rollout_length=500,
             num_onpolicy_iters=10,
             num_onpolicy_rollouts=10,
             training_epochs=60,
             training_batch_size=512,
             render=False,
             mpc_horizon=15,
             num_random_action_selection=4096,
             nn_layers=1):
    self._env = env
    self._max_rollout_length = max_rollout_length
    self._num_onpolicy_iters = num_onpolicy_iters
    self._num_onpolicy_rollouts = num_onpolicy_rollouts
    self._training_epochs = training_epochs
    self._training_batch_size = training_batch_size
    self._render = render

    logger.info('Gathering random dataset')
    self._random_dataset = self._gather_rollouts(utils.RandomPolicy(env),
                                                 num_init_random_rollouts)

    logger.info('Creating policy')
    self._policy = ModelBasedPolicy(env,
                                    self._random_dataset,
                                    horizon=mpc_horizon,
                                    num_random_action_selection=num_random_action_selection)

    timeit.reset()
    timeit.start('total')
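A minimal usage sketch for this constructor, assuming it belongs to a trainer class called ModelBasedRL here with a run() entry point; both names and the gym environment chosen are illustrative assumptions, not parts of the snippets above.

import gym

env = gym.make('HalfCheetah-v2')                 # any continuous-control gym env works; illustrative choice
trainer = ModelBasedRL(env,                      # hypothetical class name for the trainer above
                       num_init_random_rollouts=10,
                       training_epochs=60,
                       training_batch_size=512,
                       mpc_horizon=15,
                       num_random_action_selection=4096)
trainer.run()                                    # assumed entry point; see the run() sketch further below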
def _train_policy(self, dataset):
    """
    Train the model-based policy

    implementation details:
        (a) Train for self._training_epochs number of epochs
        (b) The dataset.random_iterator(...) method will iterate through the
            dataset once in a random order
        (c) Use self._training_batch_size for iterating through the dataset
        (d) Keep track of the loss values by appending them to the losses array
    """
    timeit.start('train policy')

    losses = []
    ### PROBLEM 1
    ### YOUR CODE HERE
    for epoch in range(self._training_epochs):
        for states, actions, next_states, _, _ in dataset.random_iterator(
                self._training_batch_size):
            loss = self._policy.train_step(states, actions, next_states)
            losses.append(loss)

    # Optional check of generalization on freshly gathered rollouts:
    # self._random_dataset_test = self._gather_rollouts(self._policy, 2)
    # for states, actions, next_states, _, _ in self._random_dataset_test.random_iterator(
    #         len(self._random_dataset_test)):
    #     eval_loss = self._policy.eval_loss(states, actions, next_states)
    #     print("Test loss: " + str(eval_loss))

    logger.record_tabular('TrainingLossStart', losses[0])
    logger.record_tabular('TrainingLossFinal', losses[-1])

    timeit.stop('train policy')
def _train_policy(self, dataset):
    """
    Train the model-based policy

    implementation details:
        (a) Train for self._training_epochs number of epochs
        (b) The dataset.random_iterator(...) method will iterate through the
            dataset once in a random order
        (c) Use self._training_batch_size for iterating through the dataset
        (d) Keep track of the loss values by appending them to the losses array
    """
    timeit.start('train policy')

    losses = []
    ### PROBLEM 1
    ### YOUR CODE HERE
    for _ in range(self._training_epochs):
        current_batches = dataset.random_iterator(self._training_batch_size)
        while True:
            state, action, next_state, _, _ = next(current_batches, [None] * 5)
            if state is None:
                break
            loss = self._policy.train_step(state, action, next_state)
            losses.append(loss)

    logger.record_tabular('TrainingLossStart', losses[0])
    logger.record_tabular('TrainingLossFinal', losses[-1])

    timeit.stop('train policy')

    # plot the per-batch training losses (requires matplotlib.pyplot as plt and os at module level)
    plt.figure()
    plt.plot(losses)
    plt.savefig(os.path.join(logger.dir, 'training.png'))
def _train_policy(self, dataset):
    """ Train the model-based policy """
    timeit.start('train policy')

    losses = []
    ### PROBLEM 1
    ### YOUR CODE HERE
    # (a) Train for self._training_epochs number of epochs
    for ep in range(self._training_epochs):
        # (b) dataset.random_iterator(...) iterates through the dataset once in a random order
        # (c) Use self._training_batch_size for iterating through the dataset
        _iter = dataset.random_iterator(self._training_batch_size)
        for states, actions, next_states, _, _ in _iter:
            loss = self._policy.train_step(states, actions, next_states)
            # (d) Keep track of the loss values by appending them to the losses array
            losses.append(loss)

    logger.record_tabular('TrainingLossStart', losses[0])
    logger.record_tabular('TrainingLossFinal', losses[-1])

    timeit.stop('train policy')
def _log(self, dataset):
    timeit.stop('total')
    dataset.log()
    logger.dump_tabular(print_func=logger.info)
    logger.debug('')
    for line in str(timeit).split('\n'):
        logger.debug(line)
    timeit.reset()
    timeit.start('total')
def _log(self, dataset):
    # stop timing
    timeit.stop('total')

    # print logging information
    dataset.log()
    logger.dump_tabular(print_func=logger.info)
    logger.debug('')
    for line in str(timeit).split('\n'):
        logger.debug(line)

    # reset timing
    timeit.reset()
    timeit.start('total')
def _debug_rollout_and_record(self, policy, num_rollouts):
    dataset = utils.Dataset()

    for r_num in range(num_rollouts):
        state = self._env.reset()
        done = False
        t = 0
        states = [state]
        pred_states = [state]
        while not done:
            if self._render:
                timeit.start('render')
                self._env.render()
                timeit.stop('render')

            timeit.start('get action')
            action, next_state_pred = policy.get_action(state, True)
            timeit.stop('get action')

            timeit.start('env step')
            next_state, reward, done, _ = self._env.step(action)
            timeit.stop('env step')

            done = done or (t >= self._max_rollout_length)
            dataset.add(state, action, next_state, reward, done)

            state = next_state
            t += 1

            pred_states.append(next_state_pred)
            states.append(next_state)

        states = np.array(states)
        pred_states = np.array(pred_states)
        self._debug_plot_states(states, pred_states, r_num)

    return dataset
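_debug_rollout_and_record calls self._debug_plot_states, which is not shown in these snippets. Below is a minimal sketch of what such a plotting helper might look like, assuming it overlays the actual and model-predicted state trajectories per dimension and saves the figure under logger.dir; the function body and filename pattern are illustrative assumptions.

import os
import matplotlib.pyplot as plt

def _debug_plot_states(self, states, pred_states, rollout_num):
    # one subplot per state dimension: ground-truth trajectory vs. one-step model predictions
    num_dims = states.shape[1]
    fig, axes = plt.subplots(num_dims, 1, figsize=(8, 2 * num_dims),
                             sharex=True, squeeze=False)
    for dim in range(num_dims):
        axes[dim, 0].plot(states[:, dim], label='actual')
        axes[dim, 0].plot(pred_states[:, dim], label='predicted')
        axes[dim, 0].set_ylabel('state[%d]' % dim)
    axes[0, 0].legend()
    fig.savefig(os.path.join(logger.dir, 'rollout_%d_states.png' % rollout_num))
    plt.close(fig)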
def _gather_rollouts(self, policy, num_rollouts):
    dataset = utils.Dataset()

    for _ in range(num_rollouts):
        state = self._env.reset()
        done = False
        t = 0
        while not done:
            if self._render:
                timeit.start('render')
                self._env.render()
                timeit.stop('render')

            timeit.start('get action')
            action = policy.get_action(state)
            timeit.stop('get action')

            timeit.start('env step')
            next_state, reward, done, _ = self._env.step(action)
            timeit.stop('env step')

            done = done or (t >= self._max_rollout_length)
            dataset.add(state, action, next_state, reward, done)

            state = next_state
            t += 1

    return dataset
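The constructor and helpers above suggest an outer loop that first trains on the initial random dataset and then alternates gathering on-policy rollouts, aggregating them, retraining, and logging. A sketch of that loop follows, assuming the trainer exposes a run() method and the dataset supports an append(...) aggregation call; both are assumptions made for illustration.

def run(self):
    # train the dynamics model on the initial random data, then refine it on-policy
    self._train_policy(self._random_dataset)
    self._log(self._random_dataset)

    dataset = self._random_dataset
    for itr in range(self._num_onpolicy_iters):
        logger.info('On-policy iteration %d' % itr)
        new_data = self._gather_rollouts(self._policy, self._num_onpolicy_rollouts)
        dataset.append(new_data)  # assumed aggregation method on utils.Dataset
        self._train_policy(dataset)
        self._log(new_data)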
def _gather_rollouts(self, policy, num_rollouts):
    # Initialize an empty dataset
    dataset = utils.Dataset()

    for i in range(num_rollouts):
        state = self._env.reset()
        done = False
        t = 0
        while not done:
            if self._render:
                timeit.start('render')
                self._env.render()
                timeit.stop('render')

            timeit.start('get action')
            action = policy.get_action(state)
            timeit.stop('get action')

            timeit.start('env step')
            # The env provides the real dynamics here
            next_state, reward, done, _ = self._env.step(action)
            timeit.stop('env step')

            # With max_rollout_length = 500 there are typically 501 data points per rollout
            done = done or (t >= self._max_rollout_length)
            dataset.add(state, action, next_state, reward, done)

            state = next_state
            t += 1
            if t % 100 == 0:
                print('time step', t)
        print('rollout', i)

    return dataset
def _train_policy(self, dataset):
    # timing for policy training
    timeit.start('train policy')

    losses = []
    # loop for self._training_epochs
    for _ in range(self._training_epochs):
        # iterate over dataset
        for states, actions, next_states, _, _ in \
                dataset.random_iterator(self._training_batch_size):
            # compute loss
            loss = self._policy.train_step(states, actions, next_states)
            losses.append(loss)

    # perform logging
    logger.record_tabular('TrainingLossStart', losses[0])
    logger.record_tabular('TrainingLossFinal', losses[-1])

    timeit.stop('train policy')
def _train_policy(self, dataset):
    """
    Train the model-based policy

    implementation details:
        (a) Train for self._training_epochs number of epochs
        (b) The dataset.random_iterator(...) method will iterate through the
            dataset once in a random order
        (c) Use self._training_batch_size for iterating through the dataset
        (d) Keep track of the loss values by appending them to the losses array
    """
    timeit.start('train policy')

    ### PROBLEM 1
    ### YOUR CODE HERE
    print('batch size is', self._training_batch_size)
    print('epoch size is', self._training_epochs)

    # Iterate through the dataset once per epoch and record the mean loss of each epoch
    losses = []
    for epoch in range(self._training_epochs):
        t_loss = 0
        # enumerate provides the batch index, used to average the loss over the epoch
        for r_num, (states, actions, next_states, _, _) in enumerate(
                dataset.random_iterator(self._training_batch_size)):
            loss = self._policy.train_step(states, actions, next_states)
            t_loss += loss
        t_loss = t_loss / (r_num + 1)
        losses.append(t_loss)

    logger.record_tabular('TrainingLossStart', losses[0])
    logger.record_tabular('TrainingLossFinal', losses[-1])

    timeit.stop('train policy')
def _gather_rollouts(self, policy, num_rollouts):
    # initialize dataset class
    dataset = utils.Dataset()

    # loop for num_rollouts
    for _ in range(num_rollouts):
        # reset gym env
        t = 0
        done = False
        state = self._env.reset()

        # generate gym rollout
        while not done:
            # perform rendering
            if self._render:
                timeit.start('render')
                self._env.render()
                timeit.stop('render')

            # get action using MPC
            timeit.start('get action')
            action = policy.get_action(state)
            timeit.stop('get action')

            # step through environment
            timeit.start('env step')
            next_state, reward, done, _ = self._env.step(action)
            timeit.stop('env step')

            # add experience to dataset
            done = done or (t >= self._max_rollout_length)
            dataset.add(state, action, next_state, reward, done)

            # update state variable
            t += 1
            state = next_state

    return dataset