Code example #1
def launch_job(tag, variant):
    seed, env, algo, optim, curv_type, lr, batch_size, cg_iters, cg_residual_tol, cg_prev_init_coef, \
        cg_precondition_empirical, cg_precondition_regu_coef, cg_precondition_exp,  \
        shrinkage_method, lanczos_amortization, lanczos_iters, approx_adaptive, betas, use_nn_policy, total_samples = variant
    beta1, beta2 = betas

    iters = int(total_samples / batch_size)
    save_dir = build_log_dir(tag, variant)

    print("Save: ", save_dir)
    save_dir = \
"/Users/trevorbarron/Documents/dev.nosync/thesis/adacurv/experiments/mjrl/results/final/best_runs/\
results/pybullet_sample_mode_bball_random_hoop/BasketballEnvRandomHoop-v0/trpo/natural_adam/optim_adaptive/\
curv_type_fisher/cg_iters_10/cg_residual_tol_1e-10/cg_prev_init_coef_0.5/cg_precondition_empirical_true/\
cg_precondition_regu_coef_0.001/cg_precondition_exp_0.75/shrunk_true/cg/nn_policy/adam_vfn_opt/\
total_samples_2000000/batch_size_2000/lr_0.01/betas0.9_0.9/1/iterations/"

    # /Users/trevorbarron/Documents/dev.nosync/thesis/adacurv/experiments/mjrl/results/results_serv_tmp/"
    policy_path = os.path.join(save_dir, 'best_policy.pickle')
    # policy_path = os.path.join(save_dir, 'iterations/best_policy.pickle')
    with open(policy_path, 'rb') as f:
        policy = pickle.load(f)
    print(policy)

    e = GymEnv('BasketballEnvRandomHoopRendered-v0')
    e.reset()
    input("Continue?")

    N = 100
    T = 250
    paths = base_sampler.do_rollout(N, policy, T, e, None)
    for p in paths:
        print(p['rewards'].sum())
Code example #2
def policy_rollout(
    num_traj,
    env,
    policy,
    fitted_model,
    init_state=None,
    eval_mode=False,
    horizon=1e6,
    env_kwargs=None,
    seed=None,
):

    # get the correct env behavior
    if type(env) == str:
        env = GymEnv(env)
    elif isinstance(env, GymEnv):
        env = env
    elif callable(env):
        env = env(**env_kwargs)
    else:
        print("Unsupported environment format")
        raise AttributeError

    if seed is not None:
        env.set_seed(seed)
        torch.manual_seed(seed)

    # get initial states
    if init_state is None:
        st = np.array([env.reset() for _ in range(num_traj)])
        st = torch.from_numpy(st).float()
    elif type(init_state) == np.ndarray:
        st = torch.from_numpy(init_state).float()
    elif type(init_state) == list:
        st = torch.from_numpy(np.array(init_state)).float()
    else:
        print("Unsupported format for init state")
        quit()

    # perform batched rollouts
    horizon = min(horizon, env.horizon)
    obs = []
    act = []
    for t in range(horizon):
        at = policy.model.forward(st)
        if eval_mode is not True:
            at = at + torch.randn(at.shape) * torch.exp(policy.log_std)
        stp1 = fitted_model.forward(st, at)
        obs.append(st.to('cpu').data.numpy())
        act.append(at.to('cpu').data.numpy())
        st = stp1

    obs = np.array(obs)
    obs = np.swapaxes(obs, 0, 1)  # (num_traj, horizon, state_dim)
    act = np.array(act)
    act = np.swapaxes(act, 0, 1)  # (num_traj, horizon, action_dim)
    paths = dict(observations=obs, actions=act)

    return paths
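
A minimal usage sketch for policy_rollout above (not part of the original file). The mjrl MLP policy exposes the .model and .log_std attributes the function relies on; IdentityModel is a hypothetical stand-in for a learned dynamics model, included only so the call is self-contained.

from mjrl.utils.gym_env import GymEnv
from mjrl.policies.gaussian_mlp import MLP

class IdentityModel:
    # hypothetical stand-in for a fitted dynamics model: next state = current state
    def forward(self, s, a):
        return s

env = GymEnv('Hopper-v2')                                  # any registered env id works
policy = MLP(env.spec, hidden_sizes=(32, 32), seed=0)
paths = policy_rollout(num_traj=4, env=env, policy=policy,
                       fitted_model=IdentityModel(),
                       eval_mode=True, horizon=100, seed=0)
print(paths['observations'].shape)                         # -> (4, 100, state_dim)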
Code example #3
File: mbac.py  Project: zivzone/mjrl
    def __init__(
        self,
        env_name,
        policy,
        expert_paths=None,  # for the initial seeding
        epochs=5,
        batch_size=64,
        lr=1e-3,
        optimizer=None,
        loss_type='MSE',  # can be 'MLE' or 'MSE'
        seed=123,
        buffer_size=50,  # measured in number of trajectories
        mpc_params=None,
        save_logs=True,
    ):

        super().__init__(
            expert_paths=expert_paths,
            policy=policy,
            epochs=epochs,
            batch_size=batch_size,
            lr=lr,
            optimizer=optimizer,
            loss_type=loss_type,
            save_logs=save_logs,
        )
        self.expert_paths = [] if self.expert_paths is None else self.expert_paths
        self.buffer_size = buffer_size

        # For the MPC policy
        self.env = GymEnv(env_name)
        self.env.reset(seed=seed)
        if mpc_params is None:
            mean = np.zeros(self.env.action_dim)
            sigma = 1.0 * np.ones(self.env.action_dim)
            filter_coefs = [sigma, 0.05, 0.0, 0.0]
            mpc_params = dict(env=GymEnv(env_name),
                              H=10,
                              paths_per_cpu=25,
                              num_cpu=1,
                              kappa=10.0,
                              gamma=1.0,
                              mean=mean,
                              filter_coefs=filter_coefs,
                              seed=seed)
        else:
            mpc_params['env'] = GymEnv(env_name)
            mpc_params['seed'] = seed

        self.mpc_params = mpc_params
        self.mpc_policy = MPCActor(**mpc_params)
Code example #4
File: sampling.py  Project: sashalambert/mjrl
def sample_paths(
    num_traj,
    env,
    policy,  # mpc policy on fitted model
    horizon=1e6,
    eval_mode=True,
    base_seed=None,
    noise_level=0.1,
):

    # get the correct env behavior
    if type(env) == str:
        env = GymEnv(env)
    elif isinstance(env, GymEnv):
        env = env
    elif callable(env):
        env = env()
    else:
        print("Unsupported environment format")
        raise AttributeError
    if base_seed is not None:
        env.set_seed(base_seed)
    horizon = min(horizon, env.horizon)
    paths = []
    for ep in range(num_traj):
        env.reset()
        observations = []
        actions = []
        rewards = []
        env_infos = []
        t = 0
        done = False
        while t < horizon and done is False:
            obs = env.get_obs()
            ifo = env.get_env_infos()
            act = policy.get_action(obs)
            if eval_mode is False and type(act) != list:
                act = act + np.random.uniform(
                    low=-noise_level, high=noise_level, size=act.shape[0])
            if type(act) == list:
                act = act[0] if eval_mode is False else act[1]['evaluation']
            next_obs, reward, done, _ = env.step(act)
            t = t + 1
            observations.append(obs)
            actions.append(act)
            rewards.append(reward)
            env_infos.append(ifo)
        path = dict(observations=np.array(observations),
                    actions=np.array(actions),
                    rewards=np.array(rewards),
                    terminated=done,
                    env_infos=tensor_utils.stack_tensor_dict_list(env_infos))
        paths.append(path)
    return paths
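
A brief usage sketch for sample_paths (illustrative, not from the original file); the policy only needs a get_action(obs) method, as with the MLP and MPC policies shown elsewhere on this page, and the env id is a placeholder.

import numpy as np
from mjrl.utils.gym_env import GymEnv
from mjrl.policies.gaussian_mlp import MLP

env = GymEnv('Hopper-v2')                                  # placeholder env id
policy = MLP(env.spec, hidden_sizes=(32, 32), seed=0)
paths = sample_paths(num_traj=5, env=env, policy=policy,
                     horizon=250, eval_mode=True, base_seed=123)
print(np.mean([p['rewards'].sum() for p in paths]))        # average return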
Code example #5
def main():
    # get args
    args = get_args()

    # load env
    if args.include != "":
        exec("import " + args.include)
    e = GymEnv(args.env_name)

    # load policy
    policy = args.policy
    if args.policy == "":
        pol = MLP(e.spec, init_log_std=-1)
        mode = "exploration"
    else:
        pol = pickle.load(open(policy, 'rb'))
        mode = "evaluation"

    # Visualized policy
    if args.render == "onscreen":
        # On screen
        e.env.env.visualize_policy(pol,
                                   horizon=e.horizon,
                                   num_episodes=args.num_episodes,
                                   mode=mode)
    else:
        # Offscreen buffer
        e.env.env.visualize_policy_offscreen(pol,
                                             horizon=100,
                                             num_episodes=args.num_episodes,
                                             mode=mode,
                                             filename=args.filename)

    # Close envs
    e.env.env.close_env()
Code example #6
File: visualize_demos.py  Project: yangdeai/hand_dapg
def demo_playback(env_name, demo_paths):
    e = GymEnv(env_name)
    e.reset()
    for path in demo_paths:
        e.set_env_state(path['init_state_dict'])
        actions = path['actions']
        for t in range(actions.shape[0]):
            e.step(actions[t])
            e.env.mj_render()
Code example #7
File: hand_dapg_random.py  Project: mihdalal/d4rl
def pol_playback(env_name, num_trajs=100):
    e = GymEnv(env_name)
    e.reset()

    obs_ = []
    act_ = []
    rew_ = []
    term_ = []
    info_qpos_ = []
    info_qvel_ = []

    ravg = []

    for n in range(num_trajs):
        e.reset()
        returns = 0
        for t in range(e._horizon):
            obs = e.get_obs()
            obs_.append(obs)
            info_qpos_.append(e.env.data.qpos.ravel().copy())
            info_qvel_.append(e.env.data.qvel.ravel().copy())
            action = e.action_space.sample()
            act_.append(action)

            _, rew, _, info = e.step(action)
            returns += rew
            rew_.append(rew)

            done = False
            if t == (e._horizon - 1):
                done = True
            term_.append(done)

            # e.env.mj_render() # this is much faster
            # e.render()
        ravg.append(returns)

    # write out hdf5 file
    obs_ = np.array(obs_).astype(np.float32)
    act_ = np.array(act_).astype(np.float32)
    rew_ = np.array(rew_).astype(np.float32)
    term_ = np.array(term_).astype(np.bool_)
    info_qpos_ = np.array(info_qpos_).astype(np.float32)
    info_qvel_ = np.array(info_qvel_).astype(np.float32)

    dataset = h5py.File("%s_random.hdf5" % env_name, "w")

    # dataset.create_dataset('observations', obs_.shape, dtype='f4')
    dataset.create_dataset("observations", data=obs_, compression="gzip")
    dataset.create_dataset("actions", data=act_, compression="gzip")
    dataset.create_dataset("rewards", data=rew_, compression="gzip")
    dataset.create_dataset("terminals", data=term_, compression="gzip")
    dataset.create_dataset("infos/qpos", data=info_qpos_, compression="gzip")
    dataset.create_dataset("infos/qvel", data=info_qvel_, compression="gzip")
Code example #8
def main(env_name, mode, path, iteration, job_name, horizon, run_no):
    env_kwargs = {}
    if path and ('.pickle' in path or 'pkl' in path):
        policy_path = path
    else:
        if job_name:
            path = os.path.join('../inverse_rl_dexterous_hand/training/Runs/',
                                job_name, 'run_' + str(run_no), 'iterations')
        if iteration:
            if iteration == 'last':
                checkpoint_file = get_last_iteration_checkpoint(path)
            else:
                checkpoint_file = "checkpoint_{}.pickle".format(iteration)
            policy_path = os.path.join(path, checkpoint_file)
        else:
            policy_path = os.path.join(path, "best_policy.pickle")
    if env_name is None:
        cfg_path = os.path.join(os.path.dirname(policy_path), "../..", "..",
                                "config.yaml")
        if not os.path.exists(cfg_path):
            cfg_path = os.path.join(os.path.dirname(cfg_path), "../..",
                                    "config.yaml")
        if not os.path.exists(cfg_path):
            cfg_path = None
        if cfg_path is not None:
            cfg = yamlreader.yaml_load(cfg_path)
            env_name = cfg['env']
            env_kwargs = cfg['env_kwargs']
        else:
            print(
                "Config file not found, cannot infer environment name. Please provide env_name parameter."
            )
            exit(1)
    e = GymEnv(env_name, **env_kwargs)
    print("Checkpoint path:", policy_path)
    policy = pickle.load(open(policy_path, 'rb'))
    if isinstance(policy, list):
        policy = policy[0]
    # render policy
    if horizon is None:
        horizon = e.horizon
    e.visualize_policy(policy, num_episodes=100, horizon=horizon, mode=mode)
Code example #9
def main(env_name, policy, mode, seed, episodes):
    e = GymEnv(env_name)
    e.set_seed(seed)
    if policy is not None:
        pi = pickle.load(open(policy, 'rb'))
    else:
        pi = MLP(e.spec, hidden_sizes=(32, 32), seed=seed, init_log_std=-1.0)
    # render policy
    e.visualize_policy(pi, num_episodes=episodes, horizon=e.horizon, mode=mode)
Code example #10
def main(file, seed, noise_level, num_episodes, config, device_path):
    exp_data = pickle.load(open(file, 'rb'))
    policy = exp_data['policy']
    model = exp_data['fitted_model']
    model = model[-1] if type(model) == list else model
    env_id = policy.env.env_id
    render = True

    # TODO(Aravind): Map to hardware if device_path is specified

    env = GymEnv(env_id)
    policy.env = env

    env.set_seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    if config is not None:
        try:
            with open(config, 'r') as f:
                config = eval(f.read())
        except:
            with open(config, 'r') as f:
                config = json.load(f)
        policy.plan_horizon = config['plan_horizon']
        policy.num_traj = config['plan_paths']
        policy.kappa = config['kappa']
        policy.filter_coefs = [
            config['filter_coefs'][k] for k in ['f1', 'f2', 'f3', 'f4']
        ]
        policy.omega = config['omega'] if 'omega' in config.keys() else 0.0

    # TODO(Aravind): Implement capability to set predicted state for rendering purposes
    # evaluate_policy(env, policy, model, noise_level, real_step=False, num_episodes=num_episodes, visualize=render)
    evaluate_policy(env,
                    policy,
                    model,
                    noise_level,
                    real_step=True,
                    num_episodes=num_episodes,
                    visualize=render)

    # final close out
    env.reset()
Code example #11
File: hand_dapg_random.py  Project: mihdalal/d4rl
def main(env_name, num_trajs):
    e = GymEnv(env_name)
    # render policy
    pol_playback(env_name, num_trajs)
Code example #12
def single_process(job):
    job_start_time = timer.time()

    # Allow process to parallelize things internally
    curr_proc = mp.current_process()
    curr_proc.daemon = False

    os.chdir(cwd)
    dirpath = os.path.join(job['save_dir'], job['job_name'])
    os.makedirs(dirpath, exist_ok=True)

    # start job
    os.chdir(cwd)
    job_start_time = timer.time()
    print('Started New Job : ', job['job_name'], '=======================')
    print('Job specifications : \n', job)

    # Make Env
    e = GymEnv(job['env_name'])

    # Make baseline
    baseline = MLPBaseline(e.spec)

    # save job details
    job['horizon'] = e.horizon
    job['ctrl_timestep'] = e.env.env.dt
    job['sim_timestep'] = e.env.env.model.opt.timestep
    # job['sim_skip'] = e.env.env.skip
    job_data_file = open(dirpath + '/job_data.txt', 'w')
    pprint.pprint(job, stream=job_data_file)

    job_data_file.close()

    # Make policy (???vik: sizes are hard coded)
    if 'init_policy' in job:
        policy = MLP(e.spec,
                     init_log_std=job['init_std'],
                     hidden_sizes=(32, 32),
                     seed=job['seed'])
        loaded_policy = pickle.load(open(job['init_policy'], 'rb'))
        loaded_params = loaded_policy.get_param_values()
        print('log std values in loaded policy = ')
        print(loaded_params[-policy.m:])
        # NOTE: if the log std is too small
        # (say <-2.0, it is problem dependent and intuition should be used)
        # then we need to bump it up so that it explores
        # loaded_params[-policy.m:] += 1.0
        policy.set_param_values(loaded_params)
        del job['init_policy']

    else:
        policy = MLP(e.spec,
                     init_log_std=job['init_std'],
                     hidden_sizes=(32, 32),
                     seed=job['seed'])
    # Agent
    agent = NPG(e, policy, baseline, seed=job['seed'], \
        normalized_step_size=job['normalized_step_size'], \
        save_logs=job['save_logs'], FIM_invert_args=job['FIM_invert_args'])

    # Train Agent
    train_agent(
        job_name=dirpath,
        agent=agent,
        seed=job['seed'],
        niter=job['niter'],
        gamma=job['gamma'],
        gae_lambda=job['gae_lambda'],
        num_cpu=job['num_cpu'],
        sample_mode=job['sample_mode'],
        num_traj=job['num_traj'],
        evaluation_rollouts=job['evaluation_rollouts'],
        save_freq=job['save_freq'],
        plot_keys={'stoc_pol_mean', 'stoc_pol_std'},
    )

    total_job_time = timer.time() - job_start_time
    print('Job', job['job_name'],
          'took %f seconds ==============' % total_job_time)
    return total_job_time
Code example #13
def get_environment(env_name=None, **kwargs):
    if env_name is None: print("Need to specify environment name")
    e = GymEnv(env_name)
    # can make procedural modifications here if needed using kwargs
    return e
Code example #14
def get_environment(env_name=None):
    if env_name is None: print("Need to specify environment name")
    return GymEnv(env_name)
Code example #15
File: train_agent.py  Project: Divye02/mjrl-1
def train_agent(
    job_name,
    agent,
    seed=0,
    niter=101,
    gamma=0.995,
    gae_lambda=None,
    num_cpu=1,
    sample_mode='trajectories',
    num_traj=50,
    num_samples=50000,  # has precedence, used with sample_mode = 'samples'
    save_freq=10,
    evaluation_rollouts=None,
    plot_keys=['stoc_pol_mean'],
):

    np.random.seed(seed)
    if os.path.isdir(job_name) == False:
        os.mkdir(job_name)
    previous_dir = os.getcwd()
    os.chdir(job_name)  # important! we are now in the directory to save data
    if os.path.isdir('iterations') == False: os.mkdir('iterations')
    if os.path.isdir('logs') == False and agent.save_logs == True:
        os.mkdir('logs')
    best_policy = copy.deepcopy(agent.policy)
    best_perf = -1e8
    train_curve = best_perf * np.ones(niter)
    mean_pol_perf = 0.0
    e = GymEnv(agent.env.env_id)

    for i in range(niter):
        print(
            "......................................................................................"
        )
        print("ITERATION : %i " % i)
        if train_curve[i - 1] > best_perf:
            best_policy = copy.deepcopy(agent.policy)
            best_perf = train_curve[i - 1]
        N = num_traj if sample_mode == 'trajectories' else num_samples
        args = dict(N=N,
                    sample_mode=sample_mode,
                    gamma=gamma,
                    gae_lambda=gae_lambda,
                    num_cpu=num_cpu)
        stats = agent.train_step(**args)
        train_curve[i] = stats[0]
        if evaluation_rollouts is not None and evaluation_rollouts > 0:
            print("Performing evaluation rollouts ........")
            eval_paths = sample_paths_parallel(N=evaluation_rollouts,
                                               policy=agent.policy,
                                               num_cpu=num_cpu,
                                               env_name=e.env_id,
                                               mode='evaluation',
                                               pegasus_seed=seed)
            mean_pol_perf = np.mean(
                [np.sum(path['rewards']) for path in eval_paths])
            if agent.save_logs:
                agent.logger.log_kv('eval_score', mean_pol_perf)
        if i % save_freq == 0 and i > 0:
            if agent.save_logs:
                agent.logger.save_log('logs/')
                make_train_plots(log=agent.logger.log,
                                 keys=plot_keys,
                                 save_loc='logs/')
            policy_file = 'policy_%i.pickle' % i
            baseline_file = 'baseline_%i.pickle' % i
            pickle.dump(agent.policy, open('iterations/' + policy_file, 'wb'))
            pickle.dump(agent.baseline,
                        open('iterations/' + baseline_file, 'wb'))
            pickle.dump(best_policy, open('iterations/best_policy.pickle',
                                          'wb'))
        # print results to console
        if i == 0:
            result_file = open('results.txt', 'w')
            print("Iter | Stoc Pol | Mean Pol | Best (Stoc) \n")
            result_file.write(
                "Iter | Sampling Pol | Evaluation Pol | Best (Sampled) \n")
            result_file.close()
        print("[ %s ] %4i %5.2f %5.2f %5.2f " %
              (timer.asctime(timer.localtime(
                  timer.time())), i, train_curve[i], mean_pol_perf, best_perf))
        result_file = open('results.txt', 'a')
        result_file.write("%4i %5.2f %5.2f %5.2f \n" %
                          (i, train_curve[i], mean_pol_perf, best_perf))
        result_file.close()
        if agent.save_logs:
            print_data = sorted(
                filter(lambda v: np.asarray(v[1]).size == 1,
                       agent.logger.get_current_log().items()))
            print(tabulate(print_data))

    # final save
    pickle.dump(best_policy, open('iterations/best_policy.pickle', 'wb'))
    if agent.save_logs:
        agent.logger.save_log('logs/')
        make_train_plots(log=agent.logger.log,
                         keys=plot_keys,
                         save_loc='logs/')
    os.chdir(previous_dir)
Code example #16
def main(env_name, mode, num_trajs, clip=True):
    e = GymEnv(env_name)
    policy = "./policies/" + env_name + ".pickle"
    pi = pickle.load(open(policy, "rb"))
    # render policy
    pol_playback(env_name, pi, num_trajs, clip=clip)
Code example #17
env_ids = pickle.load(f)
f.close()
e_unshuffled = {}
for task_id in range(num_tasks):
    size_factors = size_factors_list[task_id]
    env_id = env_ids[task_id]
    gym.envs.register(
        id=env_id,
        entry_point=
        'gym_extensions.continuous.mujoco.modified_half_cheetah:HalfCheetahModifiedBodyPartSizeEnv',
        max_episode_steps=1000,
        reward_threshold=3800.0,
        kwargs=dict(body_parts=['torso', 'fthigh', 'fshin', 'ffoot'],
                    size_scales=size_factors))
    e_unshuffled[task_id] = GymEnv(
        env_id
    )  # only do the environment here, so different files can create the same tasks

for i in range(num_seeds):
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    job_name_lpgftw_seed = job_name_lpgftw + '/seed_{}'.format(i)

    f = open(job_name_lpgftw_seed + '/task_order.pickle', 'rb')
    task_order = pickle.load(f)
    f.close()
    e = {}
    for task_id in range(num_tasks):
        e[task_id] = e_unshuffled[task_order[task_id]]
Code example #18
def main(env_name, snapshot_file, mode, num_trajs, clip=True):
    e = GymEnv(env_name)
    pi = pickle.load(gzip.open(snapshot_file, 'rb'))
    import pdb
    pdb.set_trace()
    pass
Code example #19
if job_data['sample_mode'] == 'trajectories':
    assert 'rl_num_traj' in job_data.keys()
    job_data['rl_num_samples'] = 0  # will be ignored
elif job_data['sample_mode'] == 'samples':
    assert 'rl_num_samples' in job_data.keys()
    job_data['rl_num_traj'] = 0  # will be ignored
else:
    print("Unknown sampling mode. Choose either trajectories or samples")
    exit()

# ===============================================================================
# Train Loop
# ===============================================================================

e = GymEnv(job_data['env'])
policy = MLP(e.spec,
             hidden_sizes=job_data['policy_size'],
             seed=job_data['seed'],
             init_log_std=job_data['init_log_std'])
baseline = MLPBaseline(e.spec,
                       reg_coef=1e-3,
                       batch_size=job_data['vf_batch_size'],
                       hidden_sizes=job_data['vf_hidden_size'],
                       epochs=job_data['vf_epochs'],
                       learn_rate=job_data['vf_learn_rate'])

# Construct the algorithm
if job_data['algorithm'] == 'NPG':
    # Other hyperparameters (like number of CG steps) can be specified in config for pass through
    # or default hyperparameters will be used
Code example #20
File: hopper_stl.py  Project: Lifelong-ML/LPG-FTW
from mjrl.utils.gym_env import GymEnv
from mjrl.policies.gaussian_linear import LinearPolicy
from mjrl.baselines.mlp_baseline import MLPBaseline
from mjrl.algos.npg_cg import NPG
from mjrl.utils.train_agent import train_agent
import time as timer

SEED = 500

e = GymEnv('Hopper-v2')
policy = LinearPolicy(e.spec, seed=SEED)
baseline = MLPBaseline(e.spec,
                       reg_coef=1e-3,
                       batch_size=64,
                       epochs=10,
                       learn_rate=1e-4)
agent = NPG(e,
            policy,
            baseline,
            normalized_step_size=0.1,
            seed=SEED,
            save_logs=True)

ts = timer.time()
train_agent(job_name='hopper_nominal',
            agent=agent,
            seed=SEED,
            niter=500,
            gamma=0.995,
            gae_lambda=0.97,
            num_cpu=4,
Code example #21
File: sampling.py  Project: sashalambert/mjrl
def policy_rollout(
        num_traj,
        env,
        policy,
        learned_model,
        init_state=None,
        eval_mode=False,
        horizon=1e6,
        env_kwargs=None,
        seed=None,
        s_min=None,
        s_max=None,
        a_min=None,
        a_max=None,
        large_value=float(1e2),
):

    # Only CPU rollouts are currently supported.
    # TODO(Aravind) : Extend GPU support

    # get the correct env behavior
    if type(env) == str:
        env = GymEnv(env)
    elif isinstance(env, GymEnv):
        env = env
    elif callable(env):
        env = env(**env_kwargs)
    else:
        print("Unsupported environment format")
        raise AttributeError

    if seed is not None:
        env.set_seed(seed)
        torch.manual_seed(seed)

    # get initial states
    if init_state is None:
        st = np.array([env.reset() for _ in range(num_traj)])
        st = torch.from_numpy(st).float()
    elif type(init_state) == np.ndarray:
        st = torch.from_numpy(init_state).float()
    elif type(init_state) == list:
        st = torch.from_numpy(np.array(init_state)).float()
    elif type(init_state) == torch.Tensor:
        assert init_state.device == torch.device('cpu')
        st = init_state.float()
    else:
        print("Unsupported format for init state")
        quit()

    # perform batched rollouts
    horizon = min(horizon, env.horizon)
    obs = []
    act = []
    for t in range(horizon):
        at = policy.model.forward(st)
        if eval_mode is not True:
            at = at + torch.randn(at.shape) * torch.exp(policy.log_std)
        # clamp states and actions to avoid blowup
        at = enforce_tensor_bounds(at, a_min, a_max, large_value)
        stp1 = learned_model.forward(st, at)
        stp1 = enforce_tensor_bounds(stp1, s_min, s_max, large_value)
        obs.append(st.to('cpu').data.numpy())
        act.append(at.to('cpu').data.numpy())
        st = stp1

    obs = np.array(obs)
    obs = np.swapaxes(obs, 0, 1)  # (num_traj, horizon, state_dim)
    act = np.array(act)
    act = np.swapaxes(act, 0, 1)  # (num_traj, horizon, action_dim)
    paths = dict(observations=obs, actions=act)

    return paths
Code example #22
File: dapg.py  Project: yuchen8807/hand_dapg
import time as timer
import pickle

SEED = 100

# ------------------------------
# Get demonstrations
print("========================================")
print("Collecting expert demonstrations")
print("========================================")
demo_paths = pickle.load(
    open('../demonstrations/relocate-v0_demos.pickle', 'rb'))

# ------------------------------
# Train BC
e = GymEnv('relocate-v0')
policy = MLP(e.spec, hidden_sizes=(32, 32), seed=SEED)
bc_agent = BC(demo_paths, policy=policy, epochs=5, batch_size=32, lr=1e-3)

ts = timer.time()
print("========================================")
print("Running BC with expert demonstrations")
print("========================================")
bc_agent.train()
print("========================================")
print("BC training complete !!!")
print("time taken = %f" % (timer.time() - ts))
print("========================================")

score = e.evaluate_policy(policy, num_episodes=10, mean_action=True)
print("Score with behavior cloning = %f" % score[0][0])
Code example #23
def single_process(job):
    job_start_time = timer.time()

    # Allow process to parallelize things internally
    curr_proc = mp.current_process()
    curr_proc.daemon = False

    # Create a directory for the job results.
    job_dir = os.path.join(job['output_dir'])
    if not os.path.isdir(job_dir):
        os.mkdir(job_dir)

    # start job
    job_start_time = timer.time()
    print('Started New Job : ', job['job_name'], '=======================')
    print('Job specifications : \n', job)

    # Make Env
    env_name = job['env_name']
    # adept_envs.global_config.set_config(env_name, {
    #     'robot_params': job['robot'],
    #     **job.get('env_params', {}),
    # })
    e = GymEnv(env_name)

    # Make baseline
    baseline = MLPBaseline(e.spec)

    # save job details
    job['horizon'] = e.horizon
    job['ctrl_timestep'] = e.env.env.dt
    job['sim_timestep'] = e.env.env.model.opt.timestep
    # job['sim_skip'] = e.env.env.skip

    with open(os.path.join(job_dir, 'job_data.txt'), 'w') as job_data_file:
        pprint.pprint(job, stream=job_data_file)

    if 'init_policy' in job:
        policy = MLP(e.spec, init_log_std=job['init_std'], hidden_sizes=(32,32), seed=job['seed'])
        loaded_policy = pickle.load(open(job['init_policy'], 'rb'))
        loaded_params = loaded_policy.get_param_values()
        print("log std values in loaded policy = ")
        print(loaded_params[-policy.m:])
        # NOTE: if the log std is too small 
        # (say <-2.0, it is problem dependent and intuition should be used)
        # then we need to bump it up so that it explores
        loaded_params[-policy.m:] += job['init_std']
        policy.set_param_values(loaded_params)
        del job['init_policy']

    else:
        policy = MLP(
            e.spec,
            init_log_std=job['init_std'],
            hidden_sizes=job['hidden_sizes'],
            # hidden_sizes=(32, 32),
            seed=job['seed'])

    # Agent
    agent = NPG(
        e,
        policy,
        baseline,
        seed=job['seed'],
        normalized_step_size=job['normalized_step_size'],
        save_logs=job['save_logs'],
        FIM_invert_args=job['FIM_invert_args'])

    # Train Agent
    train_agent(
        job_name=job['job_name'],
        agent=agent,
        # save_dir=job_dir,
        seed=job['seed'],
        niter=job['niter'],
        gamma=job['gamma'],
        gae_lambda=job['gae_lambda'],
        num_cpu=job['num_cpu'],
        sample_mode=job['sample_mode'],
        num_traj=job.get('num_traj'),
        num_samples=job.get('num_samples'),
        evaluation_rollouts=job['evaluation_rollouts'],
        save_freq=job['save_freq'],
        plot_keys={'stoc_pol_mean', 'stoc_pol_std'},
    )

    total_job_time = timer.time() - job_start_time
    print('Job', job['job_name'],
          'took %f seconds ==============' % total_job_time)
    return total_job_time
Code example #24
def pol_playback(env_name, num_trajs=100):
    e = GymEnv(env_name)
    e.reset()

    obs_ = []
    act_ = []
    rew_ = []
    term_ = []
    timeout_ = []
    info_qpos_ = []
    info_qvel_ = []
    info_env_state_ = []

    ravg = []
    
    for n in range(num_trajs):
        e.reset()
        returns = 0
        for t in range(e._horizon):
            obs = e.get_obs()
            obs_.append(obs)
            info_qpos_.append(e.env.data.qpos.ravel().copy())
            info_qvel_.append(e.env.data.qvel.ravel().copy())
            info_env_state_.append(e.get_env_state())
            action = e.action_space.sample()
            act_.append(action)

            _, rew, done, info = e.step(action)
            returns += rew
            rew_.append(rew)

            if t == (e._horizon-1):
                timeout = True
                done = False
            else:
                timeout = False

            term_.append(done)
            timeout_.append(timeout)

            if done or timeout:
                e.reset()

            #e.env.mj_render() # this is much faster
            # e.render()
        ravg.append(returns)

    # write out hdf5 file
    obs_ = np.array(obs_).astype(np.float32)
    act_ = np.array(act_).astype(np.float32)
    rew_ = np.array(rew_).astype(np.float32)
    term_ = np.array(term_).astype(np.bool_)
    timeout_ = np.array(timeout_).astype(np.bool_)
    info_qpos_ = np.array(info_qpos_).astype(np.float32)
    info_qvel_ = np.array(info_qvel_).astype(np.float32)

    dataset = h5py.File('%s_random.hdf5' % env_name, 'w')

    #dataset.create_dataset('observations', obs_.shape, dtype='f4')
    dataset.create_dataset('observations', data=obs_, compression='gzip')
    dataset.create_dataset('actions', data=act_, compression='gzip')
    dataset.create_dataset('rewards', data=rew_, compression='gzip')
    dataset.create_dataset('terminals', data=term_, compression='gzip')
    dataset.create_dataset('timeouts', data=timeout_, compression='gzip')
    dataset.create_dataset('infos/qpos', data=info_qpos_, compression='gzip')
    dataset.create_dataset('infos/qvel', data=info_qvel_, compression='gzip')
    dataset.create_dataset('infos/env_state', data=np.array(info_env_state_, dtype=np.float32), compression='gzip')
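
The HDF5 file written above can be read back with h5py; a quick inspection sketch (the filename assumes env_name was 'door-v0', purely for illustration):

import h5py

with h5py.File('door-v0_random.hdf5', 'r') as f:
    print(list(f.keys()))            # observations, actions, rewards, terminals, timeouts, infos
    obs = f['observations'][:]
    timeouts = f['timeouts'][:]
    print(obs.shape, obs.dtype, int(timeouts.sum()))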
Code example #25
File: walker_stl.py  Project: Lifelong-ML/LPG-FTW
from mjrl.utils.gym_env import GymEnv
from mjrl.policies.gaussian_linear import LinearPolicy
from mjrl.baselines.mlp_baseline import MLPBaseline
from mjrl.algos.npg_cg import NPG
from mjrl.utils.train_agent import train_agent
import time as timer

SEED = 500

e = GymEnv('Walker2d-v2')
policy = LinearPolicy(e.spec, seed=SEED)
baseline = MLPBaseline(e.spec,
                       reg_coef=1e-3,
                       batch_size=64,
                       epochs=2,
                       learn_rate=1e-3)
agent = NPG(e,
            policy,
            baseline,
            normalized_step_size=0.1,
            seed=SEED,
            save_logs=True)

ts = timer.time()
train_agent(job_name='walker_nominal',
            agent=agent,
            seed=SEED,
            niter=500,
            gamma=0.995,
            gae_lambda=0.97,
            num_cpu=4,
Code example #26
File: visualizer_test.py  Project: Divye02/mjrl-1
from mjrl.utils.gym_env import GymEnv
from mjrl.policies.gaussian_mlp import MLP
from mjrl.baselines.quadratic_baseline import QuadraticBaseline
from mjrl.baselines.mlp_baseline import MLPBaseline
from mjrl.algos.npg_cg import NPG
from mjrl.utils.train_agent import train_agent
import mjrl.envs
import time as timer
SEED = 500

e = GymEnv('mjrl_point_mass-v0')
policy = MLP(e.spec, hidden_sizes=(32, 32), seed=SEED)
baseline = QuadraticBaseline(e.spec)
agent = NPG(e,
            policy,
            baseline,
            normalized_step_size=0.2,
            seed=SEED,
            save_logs=True)

ts = timer.time()
train_agent(job_name='vis_exp',
            agent=agent,
            seed=SEED,
            niter=30,
            gamma=0.95,
            gae_lambda=0.97,
            num_cpu=1,
            sample_mode='trajectories',
            num_traj=100,
            save_freq=5,
Code example #27
File: mbac.py  Project: zivzone/mjrl
class MBAC(BC):
    def __init__(
        self,
        env_name,
        policy,
        expert_paths=None,  # for the initial seeding
        epochs=5,
        batch_size=64,
        lr=1e-3,
        optimizer=None,
        loss_type='MSE',  # can be 'MLE' or 'MSE'
        seed=123,
        buffer_size=50,  # measured in number of trajectories
        mpc_params=None,
        save_logs=True,
    ):

        super().__init__(
            expert_paths=expert_paths,
            policy=policy,
            epochs=epochs,
            batch_size=batch_size,
            lr=lr,
            optimizer=optimizer,
            loss_type=loss_type,
            save_logs=save_logs,
        )
        self.expert_paths = [] if self.expert_paths is None else self.expert_paths
        self.buffer_size = buffer_size

        # For the MPC policy
        self.env = GymEnv(env_name)
        self.env.reset(seed=seed)
        if mpc_params is None:
            mean = np.zeros(self.env.action_dim)
            sigma = 1.0 * np.ones(self.env.action_dim)
            filter_coefs = [sigma, 0.05, 0.0, 0.0]
            mpc_params = dict(env=GymEnv(env_name),
                              H=10,
                              paths_per_cpu=25,
                              num_cpu=1,
                              kappa=10.0,
                              gamma=1.0,
                              mean=mean,
                              filter_coefs=filter_coefs,
                              seed=seed)
        else:
            mpc_params['env'] = GymEnv(env_name)
            mpc_params['seed'] = seed

        self.mpc_params = mpc_params
        self.mpc_policy = MPCActor(**mpc_params)

    def collect_paths(self,
                      num_traj=10,
                      mode='policy',
                      horizon=None,
                      render=False):
        horizon = self.env.horizon if horizon is None else horizon
        paths = []
        for i in tqdm(range(num_traj)):
            self.env.reset()
            obs, act_pi, act_mpc, rew, states = [], [], [], [], []
            for t in range(horizon):
                o = self.env.get_obs()
                s = self.env.get_env_state()
                a_pi = self.policy.get_action(o)[0]
                a_mpc = self.mpc_policy.get_action(s)
                a = a_pi if mode == 'policy' else a_mpc
                next_o, r, done, _ = self.env.step(a)
                if render:
                    self.env.render()
                # store data
                obs.append(o)
                rew.append(r)
                states.append(s)
                act_pi.append(a_pi)
                act_mpc.append(a_mpc)
                # kill if done
                if done:
                    break
            path = dict(
                observations=np.array(obs),
                actions=np.array(act_pi),
                expert_actions=np.array(act_mpc),
                rewards=np.array(rew),
                states=states,
            )
            paths.append(path)
        return paths

    def add_paths_to_buffer(self, paths):
        for path in paths:
            self.expert_paths.append(path)
        if len(self.expert_paths) > self.buffer_size:
            # keep recent trajectories
            # TODO: Also consider keeping best performing trajectories
            self.expert_paths = self.expert_paths[-self.buffer_size:]
        if self.save_logs:
            self.logger.log_kv('buffer_size', len(self.expert_paths))

    def get_data_from_buffer(self):
        observations = np.concatenate(
            [path["observations"] for path in self.expert_paths])
        expert_actions = np.concatenate(
            [path["expert_actions"] for path in self.expert_paths])
        observations = torch.Tensor(observations).float()
        expert_actions = torch.Tensor(expert_actions).float()
        data = dict(observations=observations, expert_actions=expert_actions)
        return data

    def train_step(self, num_traj=10, **kwargs):
        # collect data using policy actions
        # fit policy to expert actions on these states
        new_paths = self.collect_paths(num_traj, mode='policy')
        self.add_paths_to_buffer(new_paths)
        data = self.get_data_from_buffer()
        self.fit(data, **kwargs)
        stoc_pol_perf = np.mean(
            [np.sum(path['rewards']) for path in new_paths])
        return stoc_pol_perf
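
A possible outer training loop for MBAC (a sketch, not from the original project; the env id, network sizes, and iteration counts are placeholders, and the fit step is assumed to come from the BC base class as in the code above):

from mjrl.utils.gym_env import GymEnv
from mjrl.policies.gaussian_mlp import MLP

env_name = 'relocate-v0'                      # placeholder env id
policy = MLP(GymEnv(env_name).spec, hidden_sizes=(64, 64), seed=123)
agent = MBAC(env_name, policy, epochs=5, batch_size=64, lr=1e-3)
for outer_iter in range(20):
    score = agent.train_step(num_traj=10)     # collect with the policy, aggregate, fit to MPC actions
    print(outer_iter, score)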
Code example #28
def do_rollout(num_traj,
               env,
               policy,
               eval_mode=False,
               horizon=1e6,
               base_seed=None,
               env_kwargs=None,
               init_states_per_cpu=None):
    """
    :param num_traj:    number of trajectories (int)
    :param env:         environment (env class, str with env_name, or factory function)
    :param policy:      policy to use for action selection
    :param eval_mode:   use evaluation mode for action computation (bool)
    :param horizon:     max horizon length for rollout (<= env.horizon)
    :param base_seed:   base seed for rollouts (int)
    :param env_kwargs:  dictionary with parameters, will be passed to env generator
    :param init_states_per_cpu: list of init states to initialize from for fixed evaluation
    :return:
    """

    # get the correct env behavior
    if type(env) == str:
        if isinstance(env_kwargs, dict):
            env = GymEnv(env, **env_kwargs)
        else:
            env = GymEnv(env)
    elif isinstance(env, GymEnv):
        env = env
    elif callable(env):
        env = env(**env_kwargs)
    else:
        print("Unsupported environment format")
        raise AttributeError

    if base_seed is not None:
        env.set_seed(base_seed)
        np.random.seed(base_seed)
    else:
        np.random.seed()
    horizon = min(horizon, env.horizon)
    paths = []

    for ep in range(num_traj):
        # seeding
        if base_seed is not None:
            seed = base_seed + ep
            env.set_seed(seed)
            np.random.seed(seed)

        observations = []
        actions = []
        rewards = []
        agent_infos = []
        env_infos = []

        o = env.reset()
        if init_states_per_cpu is not None:
            o = env.set_env_state(init_states_per_cpu[ep])
            assert o is not None, 'set_env_state of env ' + env.env_id + ' returns None, should return observation'
        done = False
        t = 0

        while t < horizon and done != True:
            a, agent_info = policy.get_action(o)
            if eval_mode:
                a = agent_info['evaluation']
            env_info_base = env.get_env_infos()
            next_o, r, done, env_info_step = env.step(a)
            # below is important to ensure correct env_infos for the timestep
            env_info = env_info_step if env_info_base == {} else env_info_base
            observations.append(o)
            actions.append(a)
            rewards.append(r)
            agent_infos.append(agent_info)
            env_infos.append(env_info)
            o = next_o
            t += 1

        path = dict(
            observations=np.array(observations),
            actions=np.array(actions),
            rewards=np.array(rewards),
            agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
            env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
            terminated=done)
        paths.append(path)

    del (env)
    return paths
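
A short usage sketch for do_rollout under the interface documented above (the env id is a placeholder; any policy whose get_action(obs) returns (action, agent_info) with an 'evaluation' entry works, e.g. the mjrl MLP policy):

from mjrl.utils.gym_env import GymEnv
from mjrl.policies.gaussian_mlp import MLP

env = GymEnv('Hopper-v2')                     # placeholder env id
policy = MLP(env.spec, hidden_sizes=(32, 32), seed=0)
paths = do_rollout(num_traj=4, env=env, policy=policy,
                   eval_mode=True, horizon=500, base_seed=123)
print([p['rewards'].sum() for p in paths])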
Code example #29
File: metaworld_er.py  Project: Lifelong-ML/LPG-FTW
e_unshuffled = {}

for task_id, (env_id, entry_point) in enumerate(env_dict.items()):
    kwargs = {'obs_type': 'plain'}
    if env_id == 'reach-v1':
        kwargs['task_type'] = 'reach'
    elif env_id == 'push-v1':
        kwargs['task_type'] = 'push'
    elif env_id == 'pick-place-v1':
        kwargs['task_type'] = 'pick_place'
    gym.envs.register(id=env_id,
                      entry_point='metaworld.envs.mujoco.sawyer_xyz.' +
                      entry_point,
                      max_episode_steps=150,
                      kwargs=kwargs)
    e_unshuffled[task_id] = GymEnv(env_id)

for i in range(num_seeds):
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    job_name_er_seed = job_name_er + '/seed_{}'.format(i)
    e = {}
    task_order = np.random.permutation(num_tasks)
    for task_id in range(num_tasks):
        e[task_id] = e_unshuffled[task_order[task_id]]

    baseline_mtl = {}
    forward_transfer_results = {}
    for task_id in range(num_tasks):
        iterdir = job_name_er_seed + '/iterations/task_{}/'.format(task_id)
        f = open(iterdir + 'policy_199.pickle', 'rb')
Code example #30
File: job_script.py  Project: yangdeai/hand_dapg
    job_data = eval(f.read())
assert 'algorithm' in job_data.keys()
assert any([job_data['algorithm'] == a for a in ['NPG', 'BCRL', 'DAPG']])
job_data['lam_0'] = 0.0 if 'lam_0' not in job_data.keys(
) else job_data['lam_0']
job_data['lam_1'] = 0.0 if 'lam_1' not in job_data.keys(
) else job_data['lam_1']
EXP_FILE = JOB_DIR + '/job_config.json'
with open(EXP_FILE, 'w') as f:
    json.dump(job_data, f, indent=4)

# ===============================================================================
# Train Loop
# ===============================================================================

e = GymEnv(job_data['env'])
policy = MLP(e.spec,
             hidden_sizes=job_data['policy_size'],
             seed=job_data['seed'])
baseline = MLPBaseline(e.spec,
                       reg_coef=1e-3,
                       batch_size=job_data['vf_batch_size'],
                       epochs=job_data['vf_epochs'],
                       learn_rate=job_data['vf_learn_rate'])

# Get demonstration data if necessary and behavior clone
if job_data['algorithm'] != 'NPG':
    print("========================================")
    print("Collecting expert demonstrations")
    print("========================================")
    demo_paths = pickle.load(open(job_data['demo_file'], 'rb'))