def __init__(self, env, policy, params=None, discrim_early_low_term=False, best=True, discrim_time_limit=None, discrim_online=True):
        utils.EzPickle.__init__(**locals())
        if not isinstance(env, Hierarchical):
            raise ValueError("Must be created with a Hierarchical Environment")
        # Get the discriminator params
        if not params:
            from bot_transfer.utils.loader import ModelParams
            discrim_params = ModelParams.load(policy)
        else:
            discrim_params = params
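        # Keep only the base environment name (strip the '_Low' / '_High' suffix).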
        discrim_params['env'] = discrim_params['env'].split('_')[0]
        # Avoid possible recursion
        if 'discrim_policy' in discrim_params['env_wrapper_args']:
            del discrim_params['env_wrapper_args']['discrim_policy']
        if not discrim_time_limit:
            discrim_time_limit = discrim_params['time_limit']
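        # Disable the env's built-in time limit; the wrapper tracks the limit itself
        # (see expert_time_limit / expert_time_step below).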
        discrim_params['time_limit'] = None
        # Get the discriminator environment.
        from bot_transfer.utils.loader import get_env
        self.expert_env = get_env(discrim_params)
        if not isinstance(self.expert_env, Hierarchical):
            raise ValueError("Expert Environment must also be Hierarchical")
        # Get the expert policy
        if isinstance(policy, str):
            from bot_transfer.utils.loader import load
            expert_model, _ = load(policy, discrim_params, load_env=False, best=best)
        else:
            expert_model = policy

        self.env = env
        self.observation_space = self.env.agent_state_space()
        self.action_space = self.env.action_space()
        # Option for the expert env to auto-terminate once the goal is reached.
        self.expert_env.early_low_termination = discrim_early_low_term
        self.expert_pred_fn = expert_model.predict
        self.expert_state = self.expert_env.agent_state_func(self.expert_env.state())
        self.expert_time_limit = discrim_time_limit
        self.expert_time_step = 0
        self.prev_agent_obs = None
        self.prev_expert_obs = None
        self.discrim_online = discrim_online


# ============================== Example 2 ==============================
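# Fine-tune a trained low-level policy against a fixed high-level policy by
# switching the environment to its *_LowFinetune variant.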
import os
import bot_transfer
import argparse
import stable_baselines

from bot_transfer.utils.loader import ModelParams, load

parser = argparse.ArgumentParser()

parser.add_argument("--low", "-l", type=str, required=True)
parser.add_argument("--high", "-m", type=str, required=True)
parser.add_argument("--time-limit", "-k", type=int)
parser.add_argument("--finetune-time-limit", "-f", type=int)
parser.add_argument("--timesteps", "-t", type=int, default=2000)

args = parser.parse_args()

params = ModelParams.load(args.low)
assert params['env'].endswith("_Low")

params['env_wrapper_args']['policy'] = args.high

if args.finetune_time_limit:
    params['env_wrapper_args'][
        'finetune_time_limit'] = args.finetune_time_limit
if args.time_limit:
    params['time_limit'] = args.time_limit

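# Switch to the corresponding LowFinetune variant of the low-level environment.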
params['env'] = '_'.join([params['env'].split('_')[0], 'LowFinetune'])

model, _ = load(args.low, params, load_env=False, best=True)
if isinstance(model, stable_baselines.SAC):
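    # Start SAC updates immediately; no random warm-up is needed when fine-tuning a pretrained model.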
    model.learning_starts = 0


def composition_sweep(low_names, high_names, env_name=None, k=None, num_ep=100, success=False):
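    # Evaluate every (low, high) policy pairing: run num_ep episodes per seed and
    # average either total episode reward or, if success=True, the success rate.
    # Results are also written to a timestamped file under ./data/.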
    import time
    f = open(os.getcwd() + '/data/comparison.results' + str(int(round(time.time() * 10000000))), 'w+')
    for high_name in high_names:
        data = list()
        for low_name in low_names:

            # Determine if we are dealing with a single seed directory or multiple seeds.
            try:
                test_load = ModelParams.load(low_name)
                test_load = ModelParams.load(high_name)
                multi_seed = False
            except ValueError:
                multi_seed = True

            if multi_seed:
                file_location_low = os.path.join(BASE, low_name) if not low_name.startswith('/') else low_name
                file_location_high = os.path.join(BASE, high_name) if not high_name.startswith('/') else high_name
                low_runs = sorted([os.path.join(low_name, run) for run in os.listdir(file_location_low) if not run.endswith('.log')])
                high_runs = sorted([os.path.join(high_name, run) for run in os.listdir(file_location_high) if not run.endswith('.log')])
                print(low_runs)
                print(high_runs)
                assert len(low_runs) == len(high_runs)
                run_list = zip(low_runs, high_runs)
            else:
                run_list = [(low_name, high_name)]

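            # Roll out each seed's composed policy for num_ep episodes.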
            seed_rewards = list()
            f.write("----------------------------------------------\n")
            for run_low, run_high in run_list:
                print("Composing", run_low, "with", run_high)
                params = compose_params(run_low, run_high, env_name=env_name, k=k)
                print("COMPOSED PARAMS", params)
                
                ep_rewards = list()
                model, env = load(run_high, params, best=True)
                obs = env.reset()
                for _ in range(num_ep):
                    rewards = list()
                    while True:
                        action, _states = model.predict(obs)
                        obs, reward, done, info = env.step(action)
                        rewards.append(reward)
                        if done:
                            if success:
                                val_to_add = 1.0 if sum(rewards) > 0 else 0.0
                            else:
                                val_to_add = sum(rewards)
                            ep_rewards.append(val_to_add)
                            obs = env.reset()
                            break
                env.close()
                del model
                del env
                seed_rew_mean = np.mean(ep_rewards)
                seed_rewards.append(seed_rew_mean)

                print("==============================")
                print("Run:", run_low, run_high, ":", seed_rew_mean)
                write_str = run_low + "\t" + run_high + "\t" + str(seed_rew_mean) + "\n"
                f.write(write_str)

            data.append((low_name, np.mean(seed_rewards), np.std(seed_rewards)))

        # print the resulting output
        print("=================================================")
        print("Results for high policy" + high_name)
        for name, score, std in data:
            print('{:<60} {:.2f} {:.2f}'.format(name[-55:], score, std))
        # Write it to a file
        f.write("==================== FINAL RESULTS ==========================\n")
        f.write("== Results for High Level: " + high_name + "\n")
        for name, score, std in data:
            f.write('{:<60} {:.2f} {:.2f}\n'.format(name[-55:], score, std))

    f.close()


def params_from_args_hrl(args):
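    # Build separate low- and high-level ModelParams from the parsed CLI arguments:
    # shared options are copied to both, while low_* / high_* options go to the matching level.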
    # Required Arguments
    low_params = ModelParams(args.env + '_Low', args.alg)
    high_params = ModelParams(args.env + '_High', args.alg)

    # Optional Arguments
    if not args.name is None:
        low_params['name'] = args.name
        high_params['name'] = args.name
    if not args.tensorboard is None:
        low_params['tensorboard'] = args.tensorboard
        high_params['tensorboard'] = args.tensorboard
    if not args.timesteps is None:
        high_params['timesteps'] = args.timesteps
        if not args.skip is None:
            low_params['timesteps'] = args.timesteps * args.skip
    if not args.time_limit is None:
        high_params["time_limit"] = args.time_limit
        if not args.skip is None:
            low_params["time_limit"] = args.skip
    if not args.num_proc is None:
        low_params['num_proc'] = args.num_proc
        high_params['num_proc'] = args.num_proc

    # Env Arguments
    if not args.delta_max is None:
        low_params['env_args']['delta_max'] = args.delta_max
        high_params['env_args']['delta_max'] = args.delta_max
    if not args.early_low_termination is None:
        low_params['env_args'][
            'early_low_termination'] = args.early_low_termination
        high_params['env_args'][
            'early_low_termination'] = args.early_low_termination
    if not args.action_penalty is None:
        low_params['env_args']['action_penalty'] = args.action_penalty
        high_params['env_args']['action_penalty'] = args.action_penalty
    if not args.skill_penalty is None:
        low_params['env_args']['skill_penalty'] = args.skill_penalty
        high_params['env_args']['skill_penalty'] = args.skill_penalty
    if not args.skip is None:
        low_params['env_args']['k'] = args.skip
        high_params['env_args']['k'] = args.skip
    if not args.agent_size is None:
        low_params['env_args']['agent_size'] = args.agent_size
        high_params['env_args']['agent_size'] = args.agent_size
    if not args.reset_prob is None:
        low_params['env_args']['reset_prob'] = args.reset_prob
        high_params['env_args']['reset_prob'] = args.reset_prob
    if not args.gear is None:
        low_params['env_args']['gear'] = args.gear
        high_params['env_args']['gear'] = args.gear
    if not args.max_sequential_low is None:
        low_params['env_args']['max_sequential_low'] = args.max_sequential_low
        high_params['env_args']['max_sequential_low'] = args.max_sequential_low
    if not args.ant_density is None:
        low_params['env_args']['ant_density'] = args.ant_density
        high_params['env_args']['ant_density'] = args.ant_density
    if not args.ant_mass is None:
        low_params['env_args']['ant_mass'] = args.ant_mass
        high_params['env_args']['ant_mass'] = args.ant_mass

    # Alg Args
    if not args.learning_starts is None:
        low_params['alg_args']['learning_starts'] = args.learning_starts
        high_params['alg_args']['learning_starts'] = args.learning_starts
    if not args.n_steps is None:
        low_params['alg_args']['n_steps'] = args.n_steps
        high_params['alg_args']['n_steps'] = args.n_steps
    # Low Alg Args
    if not args.low_layers is None:
        low_params['policy_args']['layers'] = args.low_layers
    if not args.low_learning_rate is None:
        low_params['alg_args']['learning_rate'] = args.low_learning_rate
    if not args.low_actor_lr is None:
        low_params['alg_args']['actor_lr'] = args.low_actor_lr
    if not args.low_critic_lr is None:
        low_params['alg_args']['critic_lr'] = args.low_critic_lr
    if not args.low_batch_size is None:
        low_params['alg_args']['batch_size'] = args.low_batch_size
    if not args.low_nminibatches is None:
        low_params['alg_args']['nminibatches'] = args.low_nminibatches
    if not args.low_noptepochs is None:
        low_params['alg_args']['noptepochs'] = args.low_noptepochs
    if not args.low_buffer_size is None:
        low_params['alg_args']['buffer_size'] = args.low_buffer_size
    if not args.low_gradient_steps is None:
        low_params['alg_args']['gradient_steps'] = args.low_gradient_steps
    # High Alg Args
    if not args.high_layers is None:
        high_params['policy_args']['layers'] = args.high_layers
    if not args.high_learning_rate is None:
        high_params['alg_args']['learning_rate'] = args.high_learning_rate
    if not args.high_actor_lr is None:
        high_params['alg_args']['actor_lr'] = args.high_actor_lr
    if not args.high_critic_lr is None:
        high_params['alg_args']['critic_lr'] = args.high_critic_lr
    if not args.high_batch_size is None:
        high_params['alg_args']['batch_size'] = args.high_batch_size
    if not args.high_nminibatches is None:
        high_params['alg_args']['nminibatches'] = args.high_nminibatches
    if not args.high_noptepochs is None:
        high_params['alg_args']['noptepochs'] = args.high_noptepochs
    if not args.high_buffer_size is None:
        high_params['alg_args']['buffer_size'] = args.high_buffer_size
    if not args.high_gradient_steps is None:
        high_params['alg_args']['gradient_steps'] = args.high_gradient_steps

    return low_params, high_params


def params_from_args(args):
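    # Build a single ModelParams object from the parsed CLI arguments; only
    # options that were explicitly provided override the defaults.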
    # Required Arguments
    params = ModelParams(args.env, args.alg)
    # Optional Arguments
    if not args.name is None:
        params['name'] = args.name
    if not args.tensorboard is None:
        params['tensorboard'] = args.tensorboard
    if not args.timesteps is None:
        params['timesteps'] = args.timesteps
    if not args.checkpoint_freq is None:
        params['checkpoint_freq'] = args.checkpoint_freq
    if not args.eval_freq is None:
        params['eval_freq'] = args.eval_freq
    if not args.time_limit is None:
        params["time_limit"] = args.time_limit
    if not args.seed is None:
        params["seed"] = args.seed
    if not args.policy is None:
        params['env_wrapper_args']['policy'] = args.policy
    if not args.params is None:
        params['env_wrapper_args']['params'] = args.params
    if not args.buffer_size is None:
        params['alg_args']['buffer_size'] = args.buffer_size
    if not args.layers is None:
        params['policy_args']['layers'] = args.layers
    if not args.skip is None:
        params['env_args']['k'] = args.skip
    if not args.learning_starts is None:
        params['alg_args']['learning_starts'] = args.learning_starts
    if not args.gradient_steps is None:
        params['alg_args']['gradient_steps'] = args.gradient_steps
    if not args.num_proc is None:
        params['num_proc'] = args.num_proc
    if not args.normalize is None:
        params['normalize'] = args.normalize
    if not args.delta_max is None:
        params['env_args']['delta_max'] = args.delta_max
    if not args.early_low_termination is None:
        params['env_args'][
            'early_low_termination'] = args.early_low_termination
    if not args.action_penalty is None:
        params['env_args']['action_penalty'] = args.action_penalty
    if not args.skill_penalty is None:
        params['env_args']['skill_penalty'] = args.skill_penalty
    if not args.agent_size is None:
        params['env_args']['agent_size'] = args.agent_size
    if not args.reset_prob is None:
        params['env_args']['reset_prob'] = args.reset_prob
    if not args.gear is None:
        params['env_args']['gear'] = args.gear
    if not args.max_sequential_low is None:
        params['env_args']['max_sequential_low'] = args.max_sequential_low
    if not args.ant_density is None:
        params['env_args']['ant_density'] = args.ant_density
    if not args.ant_mass is None:
        params['env_args']['ant_mass'] = args.ant_mass
    if not args.include_contacts is None:
        params['env_args']['include_contacts'] = args.include_contacts
        print("#####################")
        print("Contacts arg was", args.include_contacts)
    if not args.use_relative is None:
        params['env_args']['use_relative'] = args.use_relative
    if not args.remove_table is None:
        params['env_args']['remove_table'] = args.remove_table
    if not args.tau is None:
        params['alg_args']['tau'] = args.tau
    # Discrim args
    if not args.discrim_buffer_size is None:
        params['alg_args']['discrim_buffer_size'] = args.discrim_buffer_size
    if not args.discrim_layers is None:
        params['alg_args']['discrim_layers'] = args.discrim_layers
    if not args.discrim_learning_rate is None:
        params['alg_args'][
            'discrim_learning_rate'] = args.discrim_learning_rate
    if not args.discrim_weight is None:
        params['alg_args']['discrim_weight'] = args.discrim_weight
    if not args.discrim_clip is None:
        params['alg_args']['discrim_clip'] = args.discrim_clip
    if not args.discrim_batch_size is None:
        params['alg_args']['discrim_batch_size'] = args.discrim_batch_size
    if not args.discrim_time_limit is None:
        params['env_wrapper_args'][
            'discrim_time_limit'] = args.discrim_time_limit
    if not args.discrim_early_low_term is None:
        params['env_wrapper_args'][
            'discrim_early_low_term'] = args.discrim_early_low_term
    if not args.discrim_train_freq is None:
        params['alg_args']['discrim_train_freq'] = args.discrim_train_freq
    if not args.discrim_stop is None:
        params['alg_args']['discrim_stop'] = args.discrim_stop
    if not args.discrim_coef is None:
        params['alg_args']['discrim_coef'] = args.discrim_coef
    if not args.discrim_decay is None:
        params['alg_args']['discrim_decay'] = args.discrim_decay
    if not args.discrim_include_next_state is None:
        params['alg_args'][
            'discrim_include_next_state'] = args.discrim_include_next_state
    if not args.discrim_include_skill is None:
        params['alg_args'][
            'discrim_include_skill'] = args.discrim_include_skill
    if not args.discrim_online is None:
        params['env_wrapper_args']['discrim_online'] = args.discrim_online
    if not args.finetune_time_limit is None:
        params['env_wrapper_args'][
            'finetune_time_limit'] = args.finetune_time_limit
    if not args.random_exploration is None:
        params['alg_args']['random_exploration'] = args.random_exploration
    if not args.sample_goals is None:
        params['env_args']['sample_goals'] = args.sample_goals
    if not args.intermediate_steps is None:
        params['env_wrapper_args'][
            'intermediate_steps'] = args.intermediate_steps
    if not args.rand_low_init is None:
        params['env_args']['rand_low_init'] = args.rand_low_init
    if not args.use_velocity is None:
        params['env_args']['use_velocity'] = args.use_velocity
    if not args.add_extra_z is None:
        params['env_args']['add_extra_z'] = args.add_extra_z
    # KL Policy Args
    if not args.kl_policy is None:
        params['alg_args']['kl_policy'] = args.kl_policy
    if not args.kl_type is None:
        params['alg_args']['kl_type'] = args.kl_type
    if not args.kl_coef is None:
        params['alg_args']['kl_coef'] = args.kl_coef
    if not args.kl_decay is None:
        params['alg_args']['kl_decay'] = args.kl_decay
    if not args.kl_stop is None:
        params['alg_args']['kl_stop'] = args.kl_stop

    # Previously searchable args
    if not args.optim_stepsize is None:
        params['alg_args']['optim_stepsize'] = args.optim_stepsize
    if not args.learning_rate is None:
        params['alg_args']['learning_rate'] = args.learning_rate
    if not args.actor_lr is None:
        params['alg_args']['actor_lr'] = args.actor_lr
    if not args.critic_lr is None:
        params['alg_args']['critic_lr'] = args.critic_lr
    if not args.batch_size is None:
        params['alg_args']['batch_size'] = args.batch_size
    if not args.gamma is None:
        params['alg_args']['gamma'] = args.gamma
    if not args.noise is None:
        params['noise'] = args.noise
    if not args.nminibatches is None:
        params['alg_args']['nminibatches'] = args.nminibatches
    if not args.n_steps is None:
        params['alg_args']['n_steps'] = args.n_steps
    if not args.noptepochs is None:
        params['alg_args']['noptepochs'] = args.noptepochs
    if not args.vertical_bonus is None:
        params['env_args']['vertical_bonus'] = args.vertical_bonus
    if not args.reach_task is None:
        params['env_args']['reach_task'] = args.reach_task

    return params


def callback(_locals, _globals, data_dir, freq=None, low_level_data_dir=None, checkpoint_freq=None):
    """
    Callback called at each step (for DQN an others) or after n steps (see ACER or PPO2)
    :param _locals: (dict)
    :param _globals: (dict)
    """
    if not freq:
        freq = 100000
    global n_steps, best_mean_reward
    # Print stats every freq calls
    if (n_steps + 1) % freq == 0:
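        # With a low-level data dir, score the run from its monitor logs; otherwise roll out evaluation episodes.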
        if low_level_data_dir:
            x, y = ts2xy(load_results(data_dir), 'timesteps')
            if len(x) > 0:
                mean_reward = np.mean(y[-200:])
                print(x[-1], 'timesteps')
                print("Best 200 mean reward: {:.2f} - Last 2000 mean reward per episode: {:.2f}".format(best_mean_reward, mean_reward))

                # New best model, you could save the agent here
                if mean_reward > best_mean_reward:
                    best_mean_reward = mean_reward
                    # Example for saving best model
                    print("Saving new best model.")
                    _locals['self'].save(low_level_data_dir + '/best_model', data_dir + '/best_model')
        else:
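            # Evaluate the current policy directly by rolling out 4 episodes in a freshly built env.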
            params = ModelParams.load(data_dir)
            env = get_env(params)
            ep_rewards = list()
            for _ in range(4):
                rewards = list()
                obs = env.reset()
                while True:
                    ac = _locals['self'].predict(obs)
                    obs, reward, done, _ = env.step(ac[0])
                    rewards.append(reward)
                    if done:
                        break
                ep_rewards.append(sum(rewards))
            
            mean_reward = np.mean(ep_rewards)
            print("Best mean reward: {:.2f} - Last {} episode mean reward: {:.2f}".format(best_mean_reward, len(ep_rewards), mean_reward))
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                # Example for saving best model
                print("Saving new best model.")
                _locals['self'].save(data_dir + '/best_model')
            del env

        '''
        # Evaluate policy training performance
        x, y = ts2xy(load_results(data_dir), 'timesteps')
        if len(x) > 0:
            mean_reward = np.mean(y[-200:])
            print(x[-1], 'timesteps')
            print("Best 200 mean reward: {:.2f} - Last 2000 mean reward per episode: {:.2f}".format(best_mean_reward, mean_reward))

            # New best model, you could save the agent here
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                # Example for saving best model
                print("Saving new best model")
                if low_level_data_dir:
                    _locals['self'].save(low_level_data_dir + '/best_model', data_dir + '/best_model')
                else:
                    _locals['self'].save(data_dir + '/best_model')
        '''
    if checkpoint_freq is not None and (n_steps + 1) % checkpoint_freq == 0:
        print("Saving Model Checkpoint")
        name = "/checkpoint_" + str(n_steps + 1)
        if low_level_data_dir:
            _locals['self'].save(low_level_data_dir + name, data_dir + name)
        else:
            _locals['self'].save(data_dir + name)

    n_steps += 1
    return True
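

# Usage sketch (an assumption, not taken from this code): stable_baselines calls
# callbacks as callback(_locals, _globals), so the extra arguments above are
# presumably bound beforehand, e.g. with functools.partial, and the globals the
# callback mutates must be initialized first:
#
#   from functools import partial
#   n_steps, best_mean_reward = 0, -np.inf
#   model.learn(total_timesteps=params['timesteps'],
#               callback=partial(callback, data_dir=data_dir, freq=100000))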


# ============================== Example 7 ==============================
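# Compose a trained low-level policy (--low) with a trained high-level policy
# (--high) in the environment given by --env, overriding the high-level skip k
# and, optionally, delta_max.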
import os
import bot_transfer
import argparse

# ModelParams is in bot_transfer.utils.loader; load_hrl is assumed to live there as well.
from bot_transfer.utils.loader import ModelParams, load_hrl

parser = argparse.ArgumentParser()

parser.add_argument("--low", "-l", type=str, required=True)
parser.add_argument("--high", "-m", type=str, required=True)
parser.add_argument("--env", "-e", type=str, required=True)
parser.add_argument("--high-level-skip", "-k", type=int)
parser.add_argument("--timesteps", "-t", type=int, default=2000)
parser.add_argument("--delta-max", "-dm", type=float, default=None)

args = parser.parse_args()

high_params = ModelParams.load(args.high)
low_params = ModelParams.load(args.low)

print("LOADED PARAMS", high_params)
print("LOADED PARAMS", low_params)

high_params['env'] = args.env

high_params['env_args']['k'] = args.high_level_skip
low_params['env_args']['k'] = args.high_level_skip

if args.delta_max:
    high_params['env_args']['delta_max'] = args.delta_max
    low_params['env_args']['delta_max'] = args.delta_max

model, _ = load_hrl(args.low,