Example #1
def train_and_save_policy(env_name, seed, save_period, total_timesteps, hidden_dim=100):
    save_dir = "trained_policy/%s/seed_%d_hidden_%d" % (env_name, seed, hidden_dim)
    os.makedirs(save_dir, exist_ok=True)
    if len(glob.glob("%s/step_*.pkl" % save_dir)) > 0:
        print("already trained: %s" % save_dir)
        return

    def callback(_locals, _globals):
        # Invoked by model.learn() during training; saves a checkpoint every
        # save_period calls.
        global n_steps
        model_filepath = "%s/step_%d.pkl" % (save_dir, n_steps + 1)

        if (n_steps + 1) % save_period == 0:
            print('Saving a model to %s' % model_filepath)
            model.save(model_filepath)

        n_steps += 1
        return True

    global n_steps
    n_steps = 0

    env = gym.make(precise_env_name(env_name))
    env.seed(seed)
    set_global_seeds(seed)
    model = SAC(env, ent_coef='auto', seed=seed, hidden_dim=hidden_dim)

    model.learn(total_timesteps=total_timesteps, log_interval=10, seed=seed, callback=callback)
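A minimal usage sketch, assuming the helper above lives in a module such as train_policy (hypothetical name) and that precise_env_name resolves the short environment name:

from train_policy import train_and_save_policy  # assumed module name

# Train a SAC policy and checkpoint it every 100k steps.
train_and_save_policy("Hopper", seed=0, save_period=100000,
                      total_timesteps=1000000, hidden_dim=100)
# Checkpoints land under trained_policy/Hopper/seed_0_hidden_100/step_*.pkl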
Example #2
def load_trained_agent(env_name, trained_policy_seed, trained_policy_step, bias_offset=0, seed=0, hidden_dim=64):
    env = gym.make(precise_env_name(env_name))
    model_path = "trained_policy/%s/seed_%d_hidden_%d/step_%d.pkl" % (
        env_name, trained_policy_seed, hidden_dim, trained_policy_step)
    trained_agent = SAC.load(model_path, env, seed=seed, hidden_dim=hidden_dim)
    # Offset the bias of the actor's log-std output layer so the loaded policy's
    # action noise can be widened (or narrowed) before returning the agent.
    parameters = trained_agent.get_parameters()
    for i, parameter in enumerate(parameters):
        name, value = parameter
        if name == 'actor/f2_log_std/bias:0':
            parameters[i] = (name, value + bias_offset)
    trained_agent.load_parameters(parameters)
    return trained_agent
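A hedged usage sketch of load_trained_agent; the module name and the Hopper-v2 gym id are assumptions, and the checkpoint must have been produced by train_and_save_policy with matching arguments:

import gym
from train_policy import load_trained_agent  # assumed module name

# Load a checkpoint and add +0.5 to the actor's log-std bias (a noisier behavior policy).
agent = load_trained_agent("Hopper", trained_policy_seed=0, trained_policy_step=500000,
                           bias_offset=0.5, seed=0, hidden_dim=64)
env = gym.make("Hopper-v2")
obs = env.reset()
action, _ = agent.predict(obs, deterministic=False)  # stochastic action from the shifted policy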
Example #3
def make_vectorized_env(env_name, n_envs=multiprocessing.cpu_count()):
    def make_env(env_id, seed=0):
        def _init():
            env = gym.make(env_id)
            env.seed(seed)
            return env

        set_global_seeds(seed)
        return _init

    vec_env = SubprocVecEnv([make_env(precise_env_name(env_name), i) for i in range(n_envs)])
    return vec_env
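A hedged usage sketch of the returned SubprocVecEnv; because it spawns worker processes, it is safest to construct it under a main guard:

import numpy as np

if __name__ == "__main__":
    vec_env = make_vectorized_env("Hopper", n_envs=4)  # short env name is illustrative
    obs = vec_env.reset()                              # stacked observations, shape (4, obs_dim)
    actions = np.stack([vec_env.action_space.sample() for _ in range(4)])
    obs, rewards, dones, infos = vec_env.step(actions)  # one synchronized step across workers
    vec_env.close()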
Example #4
def generate_cluster(env_name,
                     trained_policy_seed,
                     trained_policy_step,
                     num_episodes,
                     num_clusters,
                     seed,
                     hidden_dim=64):
    """
    :return: cluster infos
    - list of [(obs, action, reward, next_obs, done), ... ]
    - len(trajectory): num_episodes
    - len(trajectory[0]): time steps of 0th episode
    """
    save_dir = "batch_trajectory/{}/seed_{}_hidden_{}/step_{}".format(
        env_name, trained_policy_seed, hidden_dim, trained_policy_step)
    os.makedirs(save_dir, exist_ok=True)

    trajectory_filepath = '%s/episode_%d_seed_%d.npy' % (save_dir,
                                                         num_episodes, seed)
    cluster_filepath = '%s/episode_%d_seed_%d_clusters_%d.npy' % (
        save_dir, num_episodes, seed, num_clusters)
    if os.path.exists(cluster_filepath):
        cluster_result = np.load(cluster_filepath, allow_pickle=True)[()]
        print('Clusters have already been generated: %s...' % cluster_filepath)
    else:
        print('%s does not exist... generating clusters...' % cluster_filepath)
        env = gym.make(precise_env_name(env_name))
        env.seed(seed)
        set_global_seeds(seed)

        trajectory = np.load(trajectory_filepath, allow_pickle=True)
        obs = []
        for traj in trajectory:
            for (o, a, r, no, d) in traj:
                obs.append(o)
        obs_mean = np.mean(obs, axis=0, keepdims=True)
        obs_std = np.std(obs, axis=0, keepdims=True) + 1e-3
        stan_obs = (obs - obs_mean) / obs_std
        np.random.shuffle(stan_obs)

        import time
        start_time = time.time()
        covertree = CoverTree(stan_obs[:10000],
                              scipy.spatial.distance.euclidean,
                              leafsize=10)
        print('cover tree build time: {}'.format(time.time() - start_time))
        print(covertree.root.ctr_idx)
        # Traverse the cover tree breadth-first: the centers of popped inner
        # nodes (and, once available, of leaf candidates) are collected as
        # cluster representatives until num_clusters indices are gathered.
        current_parents = [covertree.root]
        representatives = set([])
        candidates = []
        while len(representatives) < num_clusters:
            if not candidates:
                for child in current_parents[0].children:
                    if isinstance(child, CoverTree._LeafNode):
                        candidates.append(child)
                    else:
                        current_parents.append(child)
                representatives.add(current_parents.pop(0).ctr_idx)
            else:
                representatives.add(candidates.pop().ctr_idx)
        print(representatives)
        cluster_result = {'representatives': stan_obs[list(representatives)]}
        np.save(cluster_filepath, cluster_result)
    return cluster_result
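A hedged usage sketch; it assumes the matching trajectory file from generate_trajectory already exists, since generate_cluster loads it rather than collecting new data:

cluster_info = generate_cluster("Hopper", trained_policy_seed=0, trained_policy_step=500000,
                                num_episodes=100, num_clusters=20, seed=0, hidden_dim=64)
representatives = cluster_info['representatives']  # standardized states, ~ (num_clusters, obs_dim)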
Example #5
def run(env_name, trained_policy_seed, trained_policy_step, trajectory_episode, trajectory_seed, alg, total_timesteps, seed, alg_params={}):

    hidden_dim = 64 if alg_params.get('hidden_dim') is None else alg_params['hidden_dim']

    env = gym.make(precise_env_name(env_name))
    state_dim, action_dim, max_action = env.observation_space.shape[0], env.action_space.shape[0], float(env.action_space.high[0])
    trained_agent = load_trained_agent(env_name, trained_policy_seed, trained_policy_step, hidden_dim=hidden_dim)
    parameters = trained_agent.get_parameters()

    # Load trajectory & train/valid split
    split_ratio = 0.8
    trajectory_all = generate_trajectory(env_name, trained_policy_seed, trained_policy_step, trajectory_episode, trajectory_seed, hidden_dim=hidden_dim)
    trajectory_train = trajectory_all[:int(len(trajectory_all) * split_ratio)]
    trajectory_valid = trajectory_all[int(len(trajectory_all) * split_ratio):]

    log_interval = 10000  # max(100, total_timesteps // 300)

    alg_name, trajectory = alg, trajectory_all
    batch_trajectory = None
    # Load model
    if alg == 'bc':
        model = BC(state_dim, action_dim, max_action, hidden_dim=hidden_dim)
    elif alg == 'vaebc':
        model = VAEBC(state_dim, action_dim, max_action, hidden_dim=hidden_dim)
    elif alg == 'klac':
        kl_coef, gradient_norm_panelty, gradient_norm_limit = alg_params['kl_coef'], alg_params['gradient_norm_panelty'], alg_params['gradient_norm_limit']
        alg_name = 'klac_klcoef_{}_grad_norm_panelty_{}_grad_norm_limit_{}'.format(kl_coef, gradient_norm_panelty, gradient_norm_limit)
        model = KLAC(state_dim, action_dim, max_action, kl_coef=kl_coef, gradient_norm_panelty=gradient_norm_panelty, gradient_norm_limit=gradient_norm_limit, hidden_dim=hidden_dim)
        trajectory = trajectory_train
    elif alg == 'bopah_single':
        kl_coef, gradient_norm_panelty, gradient_norm_limit = alg_params['kl_coef'], alg_params['gradient_norm_panelty'], alg_params['gradient_norm_limit']
        alg_name = 'bopah_single_klcoef_{}_grad_norm_panelty_{}_grad_norm_limit_{}'.format(kl_coef, gradient_norm_panelty, gradient_norm_limit)
        model = BOPAHSingle(state_dim, action_dim, max_action, kl_coef=kl_coef, gradient_norm_panelty=gradient_norm_panelty, gradient_norm_limit=gradient_norm_limit, hidden_dim=hidden_dim)
        trajectory = trajectory_train
        batch_trajectory = trajectory_valid
    elif alg == 'bopah':
        kl_coef, gradient_norm_panelty, gradient_norm_limit, dependent_limit, num_clusters \
            = alg_params['kl_coef'], alg_params['gradient_norm_panelty'], alg_params['gradient_norm_limit'], alg_params['dependent_limit'], alg_params['num_clusters']
        alg_name = 'bopah_klcoef_{}_grad_norm_panelty_{}_grad_norm_limit_{}_dependent_limit_{}'.format(kl_coef, gradient_norm_panelty, gradient_norm_limit, dependent_limit)
        if alg_params.get('total_loss'):
            alg_name += '_total_loss'
        cluster_info = generate_cluster(env_name, trained_policy_seed, trained_policy_step, trajectory_episode, num_clusters, trajectory_seed, hidden_dim=hidden_dim)
        model = BOPAH(trajectory_train, trajectory_valid, state_dim, action_dim, max_action, kl_coef=kl_coef, gradient_norm_panelty=gradient_norm_panelty,
                      gradient_norm_limit=gradient_norm_limit, hidden_dim=hidden_dim, cluster_info=cluster_info,
                      dependent_limit=dependent_limit, seed=seed, total_loss=alg_params.get('total_loss'))
        trajectory = trajectory_train
        batch_trajectory = trajectory_valid
    elif alg == 'bcq':
        alg_name += '_perturb_{}'.format(alg_params['perturb'])
        model = BCQ(state_dim, action_dim, max_action, trajectory=trajectory_all, hidden_dim=hidden_dim, perturb=alg_params['perturb'])
    elif alg == 'bear':
        alg_name += '_thres_{}'.format(alg_params['thres'])
        model = BEAR(state_dim, action_dim, max_action, hidden_dim=hidden_dim, threshold=alg_params['thres'])
    else:
        raise NotImplementedError()

    # Set result path
    result_dir = "eval_results/%s/seed_%d/step_%d/trajectory_%d/seed_%d_hidden_%d/%s" % (env_name, trained_policy_seed, trained_policy_step, trajectory_episode, trajectory_seed, hidden_dim, alg_name)
    os.makedirs(result_dir, exist_ok=True)
    result_filepath = "%s/seed_%d.npy" % (result_dir, seed)
    if os.path.exists(result_filepath):
        print('Result file already exists: %s' % result_filepath)
        return np.load(result_filepath, allow_pickle=True)[()]

    # Run algorithm and save the result
    print('==============================================')
    print('Run: ', result_filepath)
    vec_env = make_vectorized_env(env_name)  # for policy evaluation
    eval_timesteps, evals, info_values = model.batch_learn(trajectory, vec_env, total_timesteps=total_timesteps, log_interval=log_interval, seed=seed,
                                                           result_filepath=result_filepath, valid_trajectory=batch_trajectory)
    result = {'eval_timesteps': eval_timesteps, 'evals': evals, 'info_values': info_values}
    np.save(result_filepath, result)
    os.remove(result_filepath + '.tmp.npy')

    return result
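A hedged usage sketch of run; the module name run_experiments and the concrete hyperparameter values are illustrative assumptions, not defaults from the source:

from run_experiments import run  # assumed module name

result = run("Hopper", trained_policy_seed=0, trained_policy_step=500000,
             trajectory_episode=100, trajectory_seed=0, alg='bcq',
             total_timesteps=500000, seed=0,
             alg_params={'perturb': 0.05, 'hidden_dim': 64})
print(result['eval_timesteps'][-1], result['evals'][-1])  # last evaluation point
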
def generate_trajectory(env_name,
                        trained_policy_seed,
                        trained_policy_step,
                        num_episodes,
                        seed,
                        hidden_dim=64):
    """
    :return: trajectory
    - list of [(obs, action, reward, next_obs, done), ... ]
    - len(trajectory): num_episodes
    - len(trajectory[0]): time steps of 0th episode
    """
    save_dir = "batch_trajectory/{}/seed_{}_hidden_{}/step_{}".format(
        env_name, trained_policy_seed, hidden_dim, trained_policy_step)
    os.makedirs(save_dir, exist_ok=True)

    trajectory_filepath = '%s/episode_%d_seed_%d.npy' % (save_dir,
                                                         num_episodes, seed)
    if os.path.exists(trajectory_filepath):
        trajectory_result = np.load(trajectory_filepath, allow_pickle=True)
        print('Trajectory has already been generated: %s...' %
              trajectory_filepath)
    else:
        print('%s does not exist... generating trajectories...' %
              trajectory_filepath)
        env = gym.make(precise_env_name(env_name))
        env.seed(seed)
        set_global_seeds(seed)
        if trained_policy_seed != 'uniform':
            trained_agent = load_trained_agent(env_name,
                                               trained_policy_seed,
                                               trained_policy_step,
                                               seed=seed,
                                               hidden_dim=hidden_dim)

        trajectory_result = []
        for episode in tqdm(range(num_episodes),
                            desc='generate_trajectory',
                            ncols=70):
            obs = env.reset()
            trajectory_one = []
            for t in range(10000):
                if trained_policy_seed != 'uniform':
                    action, _ = trained_agent.predict(obs, deterministic=False)
                else:
                    action = env.action_space.sample()
                next_obs, reward, done, info = env.step(action)

                # Do not treat time-limit truncation as a true environment terminal
                terminal = done
                if info.get('TimeLimit.truncated'):
                    terminal = False
                trajectory_one.append(
                    (obs, action, reward, next_obs, terminal))
                if done:
                    break
                obs = next_obs
            trajectory_result.append(trajectory_one)
        # Episode lengths differ, so store the trajectories as an object array.
        trajectory_result = np.array(trajectory_result, dtype=object)
        np.save(trajectory_filepath, trajectory_result)

    return trajectory_result
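A hedged sketch of how the returned structure is typically consumed (the short environment name is illustrative):

trajectory = generate_trajectory("Hopper", trained_policy_seed=0,
                                 trained_policy_step=500000, num_episodes=100, seed=0)
print(len(trajectory))  # num_episodes
obs, action, reward, next_obs, terminal = trajectory[0][0]  # first transition of episode 0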