def batch_learn(self, trajectory, vec_env, total_timesteps, log_interval, seed, result_filepath=None, **kwargs):
    np.random.seed(seed)
    replay_buffer = ReplayBuffer(trajectory, max_action=self.max_action)
    self.standardizer = replay_buffer.standardizer

    # Start...
    start_time = time.time()
    timestep = 0
    eval_timesteps = []
    evaluations = []
    with tqdm(total=total_timesteps, desc="BC") as pbar:
        while timestep < total_timesteps:
            evaluation = evaluate_policy(vec_env, self)
            eval_timesteps.append(timestep)
            evaluations.append(evaluation)
            print('t=%d: %f (elapsed_time=%f)' % (timestep, evaluation, time.time() - start_time))

            self.train(replay_buffer, iterations=log_interval, batch_size=64)
            pbar.update(log_interval)
            timestep += log_interval

            if result_filepath:
                result = {'eval_timesteps': eval_timesteps, 'evals': evaluations, 'info_values': []}
                np.save(result_filepath + '.tmp.npy', result)
    return eval_timesteps, evaluations, []
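# Sketch (assumption): a minimal evaluate_policy compatible with the calls in
# batch_learn above. The project's actual helper is defined elsewhere; the
# policy interface (predict_action) and single-env vec_env indexing are
# hypothetical. Assumes numpy is imported as np, as elsewhere in this file.
def evaluate_policy_sketch(vec_env, policy, num_episodes=10):
    episode_returns = []
    for _ in range(num_episodes):
        obs = vec_env.reset()
        done, episode_return = False, 0.0
        while not done:
            action = policy.predict_action(obs)  # hypothetical interface
            obs, rewards, dones, _ = vec_env.step(action)
            episode_return += float(rewards[0])
            done = bool(dones[0])
        episode_returns.append(episode_return)
    return float(np.mean(episode_returns))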
def batch_learn(self, trajectory, vec_env, total_timesteps, log_interval, seed, result_filepath=None, **kwargs):
    np.random.seed(seed)
    replay_buffer = ReplayBuffer(trajectory, max_action=self.max_action, num_critic=self.num_critics)
    self.standardizer = replay_buffer.standardizer

    # Start...
    start_time = time.time()
    eval_timesteps = []
    evaluations = []
    infos_values = []
    for timestep in tqdm(range(total_timesteps), desc="KLAC", ncols=70):
        obs, action, reward, next_obs, done, ensemble_mask = replay_buffer.sample(self.batch_size)
        feed_dict = {
            self.obs_ph: obs,
            self.action_ph: action,
            self.reward_ph: reward,
            self.next_obs_ph: next_obs,
            self.terminal_ph: done,
            self.obs_mean: replay_buffer.obs_mean,
            self.obs_std: replay_buffer.obs_std,
            self.ensemble_mask: ensemble_mask
        }
        step_result = self.sess.run(self.step_ops + self.eval_ops, feed_dict=feed_dict)
        infos_value = step_result[len(self.step_ops):]
        '''
        if timestep % log_interval == log_interval - 1:
            from plot_alpha import plot_value
            plot_value(self.sess, self.standardizer, self.obs_ph,
                       {'train_value': self.critic.v(self.obs_ph),
                        'density_estimation': self.denest([self.obs_ph])},
                       replay_buffer.obs, timestep)
        '''
        if timestep % log_interval == 0:
            evaluation = evaluate_policy(vec_env, self)
            eval_timesteps.append(timestep)
            evaluations.append(evaluation)
            infos_values.append(infos_value)
            print('t=%d: %f (elapsed_time=%f)' % (timestep, evaluation, time.time() - start_time))
            print('\n============================')
            for label, value in zip(self.eval_labels, infos_value):
                print('%12s: %10.3f' % (label, value))
            print('============================\n')

            if result_filepath:
                result = {'eval_timesteps': eval_timesteps, 'evals': evaluations, 'info_values': infos_values}
                np.save(result_filepath + '.tmp.npy', result)
    return eval_timesteps, evaluations, infos_values
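# Sketch (assumption): one way ReplayBuffer.sample could draw the per-critic
# ensemble_mask fed above, with each of num_critic bootstrapped critics seeing
# each sampled transition with probability p. The actual buffer implementation
# may differ. Assumes numpy is imported as np, as elsewhere in this file.
def sample_ensemble_mask_sketch(batch_size, num_critic, p=0.5):
    # One row per transition, one column per critic; 1.0 means the critic
    # includes that transition in its bootstrap update.
    return np.random.binomial(n=1, p=p, size=(batch_size, num_critic)).astype(np.float32)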
def batch_learn(self, train_trajectory, vec_env, total_timesteps, log_interval, seed, result_filepath=None, valid_trajectory=None):
    np.random.seed(seed)
    train_replay_buffer = ReplayBuffer(train_trajectory, max_action=self.max_action, num_critic=self.num_critics)
    valid_replay_buffer = ReplayBuffer(valid_trajectory, max_action=self.max_action, num_critic=self.num_critics)
    valid_replay_buffer.standardizer = train_replay_buffer.standardizer
    self.standardizer = train_replay_buffer.standardizer

    # Zero-padding of valid (obs, actions)
    valid_obs_padded, valid_actions_padded, valid_terminal_mask, valid_traj_maxlen = \
        ReplayBuffer.group_element_trajectory(valid_trajectory)
    valid_obs_padded, valid_actions_padded = self.standardizer(valid_obs_padded), valid_actions_padded / self.max_action
    valid_trajectory_indices = np.arange(len(valid_trajectory))
    num_updates = 10

    # Hyper-gradient ascent operation
    alpha_train_op, v_grads_s0 = self._get_alpha_train_op(self.critic_q, self.valid_critic_q, valid_traj_maxlen)

    # Start...
    saver = tf.train.Saver(max_to_keep=2)
    last_checkpoint = tf.train.latest_checkpoint(result_filepath + '_checkpoint')
    if last_checkpoint is not None:
        start_time = time.time()
        saver.restore(self.sess, last_checkpoint)
        loaded = np.load(result_filepath + '.tmp.npy', allow_pickle=True).item()
        eval_timesteps = loaded['eval_timesteps']
        evaluations = loaded['evals']
        infos_values = loaded['info_values']
        v_grad_list = []
        timestep = eval_timesteps[-1] + 1
        timesteps = range(timestep, total_timesteps)
        print('loaded', timestep)
        print(eval_timesteps)
        print(infos_values)
    else:
        start_time = time.time()
        eval_timesteps = []
        evaluations = []
        infos_values = []
        v_grad_list = []
        timesteps = range(total_timesteps)

    for timestep in tqdm(timesteps, desc="BOPAHSingle", ncols=70):
        obs, action, reward, next_obs, done, ensemble_mask = train_replay_buffer.sample(self.batch_size)
        valid_obs, valid_action, valid_reward, valid_next_obs, valid_done, valid_ensemble_mask = \
            valid_replay_buffer.sample(self.batch_size)
        feed_dict = {
            self.obs_ph: obs,
            self.action_ph: action,
            self.reward_ph: reward,
            self.next_obs_ph: next_obs,
            self.terminal_ph: done,
            self.valid_obs_ph: valid_obs,
            self.valid_action_ph: valid_action,
            self.valid_reward_ph: valid_reward,
            self.valid_next_obs_ph: valid_next_obs,
            self.valid_terminal_ph: valid_done,
            self.obs_mean: train_replay_buffer.obs_mean,
            self.obs_std: train_replay_buffer.obs_std,
            self.ensemble_mask: ensemble_mask,
            self.valid_ensemble_mask: valid_ensemble_mask
        }
        step_result = self.sess.run(self.step_ops + self.eval_ops, feed_dict=feed_dict)
        infos_value = step_result[len(self.step_ops):]

        if (timestep + 1) % 500 == 0 and timestep > 100000:
            grad_values = []
            np.random.shuffle(valid_trajectory_indices)
            reshaped_indices = np.reshape(valid_trajectory_indices[:200], [num_updates, -1])
            for rind in reshaped_indices:
                v_grads_s0_value = self.sess.run(v_grads_s0, feed_dict={
                    self.obs_mean: train_replay_buffer.obs_mean,
                    self.obs_std: train_replay_buffer.obs_std,
                    self.traj_valid_obs_ph: valid_obs_padded[rind],
                    self.traj_valid_actions_ph: valid_actions_padded[rind],
                    self.mask_ph: valid_terminal_mask[rind]
                })
                grad_values += list(v_grads_s0_value.flatten())
            # print(negative_q_cov_value[:100, 0])
            v_grad_list.append(np.mean(grad_values))
            self.sess.run(alpha_train_op, feed_dict={self.gradient_buffer_ph: np.mean(grad_values)})
            print('t=%d: (elapsed_time=%f)' % (timestep, time.time() - start_time))
            print('\n============================')
            for label, value in zip(self.eval_labels, infos_value):
                if label == 'kl_coef':
                    print('%16s: %10.3f' % (label, value))
                    print('%16s: %10.3f' % ('log_kl_coef', np.log(value)))
            print('%16s: %10.3f' % ('total_grad_value', np.mean(grad_values)))
            print('============================\n')

        if timestep % log_interval == 0:
            print('-----------saving----------------------')
            v_grad_mean = np.mean(v_grad_list)
            v_grad_list = []
            evaluation = evaluate_policy(vec_env, self)
            eval_timesteps.append(timestep)
            evaluations.append(evaluation)
            infos_values.append(infos_value + [v_grad_mean])
            print('t=%d: %f (elapsed_time=%f)' % (timestep, evaluation, time.time() - start_time))
            print('\n============================')
            for label, value in zip(self.eval_labels, infos_value):
                print('%12s: %10.3f' % (label, value))
            print('============================\n')

            if result_filepath:
                result = {'eval_timesteps': eval_timesteps, 'evals': evaluations, 'info_values': infos_values}
                np.save(result_filepath + '.tmp.npy', result)
                saver.save(self.sess, result_filepath + '_checkpoint/model')
    return eval_timesteps, evaluations, infos_values
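# Conceptual sketch (assumption): a hyper-gradient ascent op on a scalar KL
# coefficient variable, fed an externally averaged gradient through a
# placeholder, mirroring how alpha_train_op and gradient_buffer_ph are used
# above. The real _get_alpha_train_op builds its gradient from the train and
# valid critics instead. Assumes TensorFlow 1.x imported as tf, as above.
def make_alpha_train_op_sketch(log_kl_coef, hyper_lr=1e-2):
    # Scalar placeholder for the averaged hyper-gradient computed outside the graph.
    gradient_buffer_ph = tf.placeholder(tf.float32, shape=())
    optimizer = tf.train.AdamOptimizer(learning_rate=hyper_lr)
    # Negate the fed-in gradient so apply_gradients performs gradient *ascent*
    # on the validation objective.
    train_op = optimizer.apply_gradients([(-gradient_buffer_ph, log_kl_coef)])
    return train_op, gradient_buffer_ph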
def batch_learn(self, train_trajectory, vec_env, total_timesteps, log_interval, seed, result_filepath=None, valid_trajectory=None):
    # Start...
    start_time = time.time()
    eval_timesteps = []
    evaluations = []
    infos_values = []
    hypergrad_lr = 1e-2
    for timestep in tqdm(range(total_timesteps), desc="BOPAH", ncols=70):
        step_result = self.sess.run(self.step_ops + self.eval_ops)
        infos_value = step_result[len(self.step_ops):]

        if (timestep + 1) % 500 == 0 and timestep > 100000:
            state_dep_grad_values = []
            state_ind_grad_values = []
            for i in range(40):
                state_dep_output, state_ind_output = self.sess.run(
                    [self.state_dep_v_grads_s0, self.state_ind_v_grads_s0])
                state_dep_grad_values += list(state_dep_output)
                state_ind_grad_values += list(state_ind_output)
            self.sess.run(self.coef_train_op, feed_dict={
                self.state_dep_gradient_buffer_ph: np.mean(state_dep_grad_values, axis=0, keepdims=True).T,
                self.state_ind_gradient_buffer_ph: np.mean(state_ind_grad_values)
            })
            print('\n============================')
            for label, value in zip(self.eval_labels, infos_value):
                if label == 'kl_coef':
                    print('%16s: %10.3f' % (label, value))
                    print('%16s: %10.3f' % ('log_kl_coef', np.log(value)))
            print('%16s: %10.3f' % ('total_dep_grad_value', np.mean(state_dep_grad_values)))
            print('%16s: %10.3f' % ('total_ind_grad_value', np.mean(state_ind_grad_values)))
            print(np.transpose(self.sess.run(self.kl_coef_vec)))
            print('============================\n')

        if timestep % log_interval == 0:
            evaluation = evaluate_policy(vec_env, self)
            eval_timesteps.append(timestep)
            evaluations.append(evaluation)
            infos_values.append(infos_value)
            print('t=%d: %f (elapsed_time=%f)' % (timestep, evaluation, time.time() - start_time))
            print('\n============================')
            for label, value in zip(self.eval_labels, infos_value):
                print('%12s: %10.3f' % (label, value))
            print('============================\n')

            if result_filepath:
                result = {'eval_timesteps': eval_timesteps, 'evals': evaluations, 'info_values': infos_values}
                np.save(result_filepath + '.tmp.npy', result)

        if timestep % log_interval == 0 and self.state_dim == 3:
            from plot_alpha import plot_value_pendulum
            plot_value_pendulum(self.sess, self.standardizer, self.obs_ph,
                                {'train_value': self.train_value,
                                 'valid_value': self.valid_value,
                                 'coef_log': self.coef_to_plot},
                                self.train_obs, "{}, {}".format(timestep, evaluations[-1]),
                                self.rbf_means * self.obs_std + self.obs_mean)

        if timestep % log_interval == 0 and self.state_dim == 2:
            from plot_alpha import plot_value_mc
            plot_value_mc(self.sess, self.standardizer, self.obs_ph,
                          {'train_value': self.train_value,
                           'valid_value': self.valid_value,
                           'coef_log': self.coef_to_plot},
                          self.train_obs, "{}, {}".format(timestep, evaluations[-1]),
                          self.rbf_means * self.obs_std + self.obs_mean)
    return eval_timesteps, evaluations, infos_values
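# Usage sketch: reading back the intermediate '.tmp.npy' result file that the
# batch_learn variants above write after each logging interval. The keys
# mirror the dict saved above (the same load appears in the checkpoint-resume
# branch of BOPAHSingle); result_filepath must match the one passed in.
# Assumes numpy is imported as np, as elsewhere in this file.
def load_intermediate_result_sketch(result_filepath):
    result = np.load(result_filepath + '.tmp.npy', allow_pickle=True).item()
    return result['eval_timesteps'], result['evals'], result['info_values']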