Code example #1
    def batch_learn(self, trajectory, vec_env, total_timesteps, log_interval, seed, result_filepath=None, **kwargs):
        np.random.seed(seed)

        replay_buffer = ReplayBuffer(trajectory, max_action=self.max_action)
        self.standardizer = replay_buffer.standardizer

        # Training loop: alternate policy evaluation and log_interval BC updates
        start_time = time.time()
        timestep = 0
        eval_timesteps = []
        evaluations = []
        with tqdm(total=total_timesteps, desc="BC") as pbar:
            while timestep < total_timesteps:
                evaluation = evaluate_policy(vec_env, self)
                eval_timesteps.append(timestep)
                evaluations.append(evaluation)
                print('t=%d: %f (elapsed_time=%f)' % (timestep, evaluation, time.time() - start_time))

                self.train(replay_buffer, iterations=log_interval, batch_size=64)
                pbar.update(log_interval)
                timestep += log_interval

                if result_filepath:
                    result = {'eval_timesteps': eval_timesteps, 'evals': evaluations, 'info_values': []}
                    np.save(result_filepath + '.tmp.npy', result)

        return eval_timesteps, evaluations, []
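
The periodic np.save call above stores the running results as a pickled dictionary, which can be read back with np.load(...).item() (the same pattern example #3 below uses to resume a run). A minimal sketch of inspecting such a file; the path 'results/bc_run.tmp.npy' is a hypothetical example.

    import numpy as np

    # Load the intermediate result file written by batch_learn (hypothetical path).
    result = np.load('results/bc_run.tmp.npy', allow_pickle=True).item()
    print(result['eval_timesteps'])  # timesteps at which the policy was evaluated
    print(result['evals'])           # returns of evaluate_policy at those timesteps
    print(result['info_values'])     # per-evaluation diagnostics (empty list for this BC example)
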
Code example #2
File: klac.py  Project: jypark/BOPAH
    def batch_learn(self, trajectory, vec_env, total_timesteps, log_interval, seed, result_filepath=None, **kwargs):
        np.random.seed(seed)

        replay_buffer = ReplayBuffer(trajectory, max_action=self.max_action, num_critic=self.num_critics)
        self.standardizer = replay_buffer.standardizer

        # Training loop: one gradient step per timestep, with an evaluation every log_interval steps
        start_time = time.time()
        eval_timesteps = []
        evaluations = []
        infos_values = []
        for timestep in tqdm(range(total_timesteps), desc="KLAC", ncols=70):
            obs, action, reward, next_obs, done, ensemble_mask = replay_buffer.sample(self.batch_size)
            feed_dict = {
                self.obs_ph: obs, self.action_ph: action, self.reward_ph: reward,
                self.next_obs_ph: next_obs, self.terminal_ph: done,
                self.obs_mean: replay_buffer.obs_mean, self.obs_std: replay_buffer.obs_std,
                self.ensemble_mask: ensemble_mask
            }
            step_result = self.sess.run(self.step_ops + self.eval_ops, feed_dict=feed_dict)
            infos_value = step_result[len(self.step_ops):]
            # Disabled debugging hook: plot the learned value function and the
            # behavior density estimate every log_interval steps.
            # if timestep % log_interval == log_interval - 1:
            #     from plot_alpha import plot_value
            #     plot_value(self.sess, self.standardizer, self.obs_ph,
            #                {'train_value': self.critic.v(self.obs_ph), 'density_estimation': self.denest([self.obs_ph])},
            #                replay_buffer.obs, timestep)
            if timestep % log_interval == 0:
                evaluation = evaluate_policy(vec_env, self)
                eval_timesteps.append(timestep)
                evaluations.append(evaluation)
                infos_values.append(infos_value)
                print('t=%d: %f (elapsed_time=%f)' % (timestep, evaluation, time.time() - start_time))
                print('\n============================')
                for label, value in zip(self.eval_labels, infos_value):
                    print('%12s: %10.3f' % (label, value))
                print('============================\n')

                if result_filepath:
                    result = {'eval_timesteps': eval_timesteps, 'evals': evaluations, 'info_values': infos_values}
                    np.save(result_filepath + '.tmp.npy', result)

        return eval_timesteps, evaluations, infos_values
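
Example #2 fetches the training ops and the diagnostic tensors in a single sess.run call and then slices the diagnostics off by position (step_result[len(self.step_ops):]). A minimal, self-contained sketch of that idiom, assuming TensorFlow 1.x (tf.compat.v1 under TF 2); the toy variable, loss, and labels are illustrative only.

    import tensorflow.compat.v1 as tf
    tf.disable_v2_behavior()

    x = tf.Variable(0.0)
    loss = tf.square(x - 3.0)
    step_ops = [tf.train.AdamOptimizer(1e-1).minimize(loss)]  # ops run for their side effects
    eval_ops = [loss, x]                                       # tensors fetched for logging

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        step_result = sess.run(step_ops + eval_ops)    # one round trip for both
        infos_value = step_result[len(step_ops):]      # drop op results, keep diagnostics
        print(dict(zip(['loss', 'x'], infos_value)))
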
Code example #3
File: bopah_single.py  Project: jypark/BOPAH
    def batch_learn(self,
                    train_trajectory,
                    vec_env,
                    total_timesteps,
                    log_interval,
                    seed,
                    result_filepath=None,
                    valid_trajectory=None):
        np.random.seed(seed)

        train_replay_buffer = ReplayBuffer(train_trajectory,
                                           max_action=self.max_action,
                                           num_critic=self.num_critics)
        valid_replay_buffer = ReplayBuffer(valid_trajectory,
                                           max_action=self.max_action,
                                           num_critic=self.num_critics)
        valid_replay_buffer.standardizer = train_replay_buffer.standardizer
        self.standardizer = train_replay_buffer.standardizer

        # Zero-padding of valid (obs, actions)
        valid_obs_padded, valid_actions_padded, valid_terminal_mask, valid_traj_maxlen = ReplayBuffer.group_element_trajectory(
            valid_trajectory)
        valid_obs_padded, valid_actions_padded = self.standardizer(
            valid_obs_padded), valid_actions_padded / self.max_action
        valid_trajectory_indices = np.arange(len(valid_trajectory))
        num_updates = 10

        # Hyper-gradient ascent operation
        alpha_train_op, v_grads_s0 = self._get_alpha_train_op(
            self.critic_q, self.valid_critic_q, valid_traj_maxlen)

        # Checkpointing: resume from the latest checkpoint if one exists.
        saver = tf.train.Saver(max_to_keep=2)
        last_checkpoint = tf.train.latest_checkpoint(
            result_filepath + '_checkpoint') if result_filepath else None
        if last_checkpoint is not None:
            start_time = time.time()
            saver.restore(self.sess, last_checkpoint)
            loaded = np.load(result_filepath + '.tmp.npy',
                             allow_pickle=True).item()
            eval_timesteps = loaded['eval_timesteps']
            evaluations = loaded['evals']
            infos_values = loaded['info_values']
            v_grad_list = []
            timestep = eval_timesteps[-1] + 1
            timesteps = range(timestep, total_timesteps)
            print('loaded', timestep)
            print(eval_timesteps)
            print(infos_values)
        else:
            start_time = time.time()
            eval_timesteps = []
            evaluations = []
            infos_values = []
            v_grad_list = []
            timesteps = range(total_timesteps)
        for timestep in tqdm(timesteps, desc="BOPAHSingle", ncols=70):
            obs, action, reward, next_obs, done, ensemble_mask = train_replay_buffer.sample(
                self.batch_size)
            valid_obs, valid_action, valid_reward, valid_next_obs, valid_done, valid_ensemble_mask = valid_replay_buffer.sample(
                self.batch_size)
            feed_dict = {
                self.obs_ph: obs,
                self.action_ph: action,
                self.reward_ph: reward,
                self.next_obs_ph: next_obs,
                self.terminal_ph: done,
                self.valid_obs_ph: valid_obs,
                self.valid_action_ph: valid_action,
                self.valid_reward_ph: valid_reward,
                self.valid_next_obs_ph: valid_next_obs,
                self.valid_terminal_ph: valid_done,
                self.obs_mean: train_replay_buffer.obs_mean,
                self.obs_std: train_replay_buffer.obs_std,
                self.ensemble_mask: ensemble_mask,
                self.valid_ensemble_mask: valid_ensemble_mask
            }
            step_result = self.sess.run(self.step_ops + self.eval_ops,
                                        feed_dict=feed_dict)
            infos_value = step_result[len(self.step_ops):]

            if (timestep + 1) % 500 == 0 and timestep > 100000:
                grad_values = []
                np.random.shuffle(valid_trajectory_indices)
                reshaped_indices = np.reshape(valid_trajectory_indices[:200],
                                              [num_updates, -1])
                for rind in reshaped_indices:
                    v_grads_s0_value = self.sess.run(
                        v_grads_s0,
                        feed_dict={
                            self.obs_mean: train_replay_buffer.obs_mean,
                            self.obs_std: train_replay_buffer.obs_std,
                            self.traj_valid_obs_ph: valid_obs_padded[rind],
                            self.traj_valid_actions_ph:
                            valid_actions_padded[rind],
                            self.mask_ph: valid_terminal_mask[rind]
                        })
                    grad_values += list(v_grads_s0_value.flatten())
#                     print(negative_q_cov_value[:100, 0])
                v_grad_list.append(np.mean(grad_values))
                self.sess.run(
                    alpha_train_op,
                    feed_dict={self.gradient_buffer_ph: np.mean(grad_values)})

                print('t=%d: (elapsed_time=%f)' %
                      (timestep, time.time() - start_time))
                print('\n============================')
                for label, value in zip(self.eval_labels, infos_value):
                    if label == 'kl_coef':
                        print('%16s: %10.3f' % (label, value))
                        print('%16s: %10.3f' % ('log_kl_coef', np.log(value)))
                print('%16s: %10.3f' %
                      ('total_grad_value', np.mean(grad_values)))
                print('============================\n')

            if timestep % log_interval == 0:
                print('-----------saving----------------------')
                # v_grad_list stays empty until the hyper-gradient updates begin
                # (timestep > 100000), so guard against np.mean of an empty list.
                v_grad_mean = np.mean(v_grad_list) if v_grad_list else float('nan')
                v_grad_list = []
                evaluation = evaluate_policy(vec_env, self)
                eval_timesteps.append(timestep)
                evaluations.append(evaluation)
                infos_values.append(infos_value + [v_grad_mean])
                print('t=%d: %f (elapsed_time=%f)' %
                      (timestep, evaluation, time.time() - start_time))
                print('\n============================')
                for label, value in zip(self.eval_labels, infos_value):
                    print('%12s: %10.3f' % (label, value))
                print('============================\n')

                if result_filepath:
                    result = {
                        'eval_timesteps': eval_timesteps,
                        'evals': evaluations,
                        'info_values': infos_values
                    }
                    np.save(result_filepath + '.tmp.npy', result)
                    saver.save(self.sess,
                               result_filepath + '_checkpoint/model')

        return eval_timesteps, evaluations, infos_values
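
Example #3 adds checkpoint-based resumption: if a checkpoint directory exists next to result_filepath, both the TensorFlow session and the logged results are restored, and the loop continues from the last evaluated timestep. A condensed sketch of just that resume logic, assuming TF 1.x; the helper name maybe_resume is made up for illustration.

    import numpy as np
    import tensorflow.compat.v1 as tf
    tf.disable_v2_behavior()

    def maybe_resume(sess, saver, result_filepath, total_timesteps):
        """Restore session + logged results if a checkpoint exists, else start fresh."""
        last_checkpoint = tf.train.latest_checkpoint(result_filepath + '_checkpoint')
        if last_checkpoint is None:
            return [], [], [], range(total_timesteps)
        saver.restore(sess, last_checkpoint)
        loaded = np.load(result_filepath + '.tmp.npy', allow_pickle=True).item()
        start = loaded['eval_timesteps'][-1] + 1
        return (loaded['eval_timesteps'], loaded['evals'],
                loaded['info_values'], range(start, total_timesteps))
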
Code example #4
    def batch_learn(self,
                    train_trajectory,
                    vec_env,
                    total_timesteps,
                    log_interval,
                    seed,
                    result_filepath=None,
                    valid_trajectory=None):
        # Training loop: one gradient step per timestep; the KL coefficients get a
        # hyper-gradient update every 500 steps once timestep > 100000.
        start_time = time.time()
        eval_timesteps = []
        evaluations = []
        infos_values = []
        hypergrad_lr = 1e-2  # hyper-gradient step size (not referenced in this snippet)
        for timestep in tqdm(range(total_timesteps), desc="BOPAH", ncols=70):
            step_result = self.sess.run(self.step_ops + self.eval_ops)
            infos_value = step_result[len(self.step_ops):]

            if (timestep + 1) % 500 == 0 and timestep > 100000:
                state_dep_grad_values = []
                state_ind_grad_values = []
                for i in range(40):
                    state_dep_output, state_ind_output = self.sess.run(
                        [self.state_dep_v_grads_s0, self.state_ind_v_grads_s0])
                    state_dep_grad_values += list(state_dep_output)
                    state_ind_grad_values += list(state_ind_output)
                self.sess.run(self.coef_train_op,
                              feed_dict={
                                  self.state_dep_gradient_buffer_ph:
                                  np.mean(state_dep_grad_values,
                                          axis=0,
                                          keepdims=True).T,
                                  self.state_ind_gradient_buffer_ph:
                                  np.mean(state_ind_grad_values)
                              })
                print('\n============================')
                for label, value in zip(self.eval_labels, infos_value):
                    if label == 'kl_coef':
                        print('%16s: %10.3f' % (label, value))
                        print('%16s: %10.3f' % ('log_kl_coef', np.log(value)))
                print('%16s: %10.3f' %
                      ('total_dep_grad_value', np.mean(state_dep_grad_values)))
                print('%16s: %10.3f' %
                      ('total_ind_grad_value', np.mean(state_ind_grad_values)))
                print(np.transpose(self.sess.run(self.kl_coef_vec)))
                print('============================\n')

            if timestep % log_interval == 0:
                evaluation = evaluate_policy(vec_env, self)
                eval_timesteps.append(timestep)
                evaluations.append(evaluation)
                infos_values.append(infos_value)
                print('t=%d: %f (elapsed_time=%f)' %
                      (timestep, evaluation, time.time() - start_time))
                print('\n============================')
                for label, value in zip(self.eval_labels, infos_value):
                    print('%12s: %10.3f' % (label, value))
                print('============================\n')

                if result_filepath:
                    result = {
                        'eval_timesteps': eval_timesteps,
                        'evals': evaluations,
                        'info_values': infos_values
                    }
                    np.save(result_filepath + '.tmp.npy', result)

            if timestep % log_interval == 0 and self.state_dim == 3:
                from plot_alpha import plot_value_pendulum
                plot_value_pendulum(
                    self.sess, self.standardizer, self.obs_ph, {
                        'train_value': self.train_value,
                        'valid_value': self.valid_value,
                        'coef_log': self.coef_to_plot
                    }, self.train_obs, "{}, {}".format(timestep,
                                                       evaluations[-1]),
                    self.rbf_means * self.obs_std + self.obs_mean)
            if timestep % log_interval == 0 and self.state_dim == 2:
                from plot_alpha import plot_value_mc
                plot_value_mc(
                    self.sess, self.standardizer, self.obs_ph, {
                        'train_value': self.train_value,
                        'valid_value': self.valid_value,
                        'coef_log': self.coef_to_plot
                    }, self.train_obs, "{}, {}".format(timestep,
                                                       evaluations[-1]),
                    self.rbf_means * self.obs_std + self.obs_mean)

        return eval_timesteps, evaluations, infos_values
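
Examples #3 and #4 both drive their KL-coefficient updates through a "gradient buffer" placeholder: hyper-gradients are estimated batch by batch in Python, averaged with np.mean, and the mean is fed back into a dedicated train op. A minimal sketch of that pattern, assuming TF 1.x; coef, gradient_buffer_ph, and the 1e-2 step size are illustrative stand-ins rather than the repository's actual op definitions.

    import numpy as np
    import tensorflow.compat.v1 as tf
    tf.disable_v2_behavior()

    coef = tf.Variable(1.0)                                   # stand-in for the KL coefficient
    gradient_buffer_ph = tf.placeholder(tf.float32, shape=())
    # Gradient ascent on the coefficient using the externally averaged hyper-gradient.
    coef_train_op = tf.assign_add(coef, 1e-2 * gradient_buffer_ph)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        grad_values = [0.3, 0.5, 0.1]                         # stand-in per-batch hyper-gradients
        sess.run(coef_train_op, feed_dict={gradient_buffer_ph: np.mean(grad_values)})
        print(sess.run(coef))                                 # 1.0 + 0.01 * mean(grad_values)
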