Example #1
def main(config):
    set_seed(config['seed'])
    #experiment_log_dir = setup_logger(config['env'], variant=config, exp_id=None,
    #                                  base_log_dir=config['base_log_dir'])

    baseline = globals()[config['baseline']]()  # instantiate baseline

    env = globals()[config['env']]() # instantiate env
    env = normalize(env) # apply normalize wrapper to env

    policy = MetaGaussianMLPPolicy(
            name="meta-policy",
            obs_dim=np.prod(env.observation_space.shape),
            action_dim=np.prod(env.action_space.shape),
            meta_batch_size=config['meta_batch_size'],
            hidden_sizes=config['hidden_sizes'],
        )

    sampler = MetaSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config['rollouts_per_meta_task'],  # This batch_size is confusing
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
    )

    sample_processor = MetaSampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
    )

    algo = ProMP(
        policy=policy,
        inner_lr=config['inner_lr'],
        meta_batch_size=config['meta_batch_size'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        learning_rate=config['learning_rate'],
        num_ppo_steps=config['num_promp_steps'],
        clip_eps=config['clip_eps'],
        target_inner_step=config['target_inner_step'],
        init_inner_kl_penalty=config['init_inner_kl_penalty'],
        adaptive_inner_kl_penalty=config['adaptive_inner_kl_penalty'],
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=config['n_itr'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
    )

    trainer.train()
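
The main() above is driven entirely by a config dictionary whose keys can all be read off the code. A hypothetical config covering those keys is sketched below; the class names and hyperparameter values are illustrative placeholders, not values taken from the original experiment.

# Hypothetical config for Example #1 -- keys match the lookups above,
# values are illustrative placeholders only.
config = {
    'seed': 1,
    'baseline': 'LinearFeatureBaseline',   # name of a baseline class visible in globals()
    'env': 'HalfCheetahRandDirecEnv',      # name of an env class visible in globals() (assumed example)
    'meta_batch_size': 40,
    'hidden_sizes': (64, 64),
    'rollouts_per_meta_task': 20,
    'max_path_length': 100,
    'parallel': True,
    'discount': 0.99,
    'gae_lambda': 1.0,
    'normalize_adv': True,
    'inner_lr': 0.1,
    'learning_rate': 1e-3,
    'num_promp_steps': 5,
    'clip_eps': 0.3,
    'target_inner_step': 0.01,
    'init_inner_kl_penalty': 5e-4,
    'adaptive_inner_kl_penalty': False,
    'num_inner_grad_steps': 1,
    'n_itr': 1000,
    'base_log_dir': '/tmp/promp',          # only read by the commented-out logger setup
}
main(config)
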
Example #2
def main(config):
    set_seed(config['seed'])

    reward_baseline = LinearTimeBaseline()  # the usual baseline
    return_baseline = LinearFeatureBaseline()  # the additional baseline for DICE

    env = globals()[config['env']]()  # instantiate env
    env = normalize(env)  # apply normalize wrapper to env

    meta_baseline = MetaNNBaseline(
        input_size=env.observation_space.shape[0])  # the meta baseline

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=config['meta_batch_size'],
        hidden_sizes=config['hidden_sizes'],
    )

    sampler = MetaSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config['rollouts_per_meta_task'],
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
    )

    sample_processor = TMAMLMetaSampleProcessor(
        baseline=reward_baseline,
        max_path_length=config['max_path_length'],
        discount=config['discount'],
        normalize_adv=config['normalize_adv'],
        positive_adv=config['positive_adv'],
        return_baseline=return_baseline,
        metabaseline=meta_baseline,
    )

    algo = TMAML(policy=policy,
                 max_path_length=config['max_path_length'],
                 meta_batch_size=config['meta_batch_size'],
                 num_inner_grad_steps=config['num_inner_grad_steps'],
                 inner_lr=config['inner_lr'],
                 learning_rate=config['learning_rate'])

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=config['n_itr'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
    )

    trainer.train()
Example #3
def main(config):
    # config['seed'] = 4
    experiment.set_name("short meta saving test")
    set_seed(config['seed'])
    experiment.log_parameters(config)
    # experiment.log_parameter("task limit size", 3)

    # experiment.log_metric("seed", config['seed'])
    baseline = globals()[config['baseline']]()  #instantiate baseline

    env = globals()[config['env']]()  # instantiate env
    env = normalize(env)  # apply normalize wrapper to env

    with open('/saved_policies/mjvel.policy', 'rb') as policy_file:
        policy = pickle.load(policy_file)
        print("policy loaded")

    sampler = MetaSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config[
            'rollouts_per_meta_task'],  # This batch_size is confusing
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
    )

    sample_processor = MetaSampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
    )

    algo = ProMP(
        policy=policy,
        inner_lr=config['inner_lr'],
        meta_batch_size=config['meta_batch_size'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        learning_rate=config['learning_rate'],
        num_ppo_steps=config['num_promp_steps'],
        clip_eps=config['clip_eps'],
        target_inner_step=config['target_inner_step'],
        init_inner_kl_penalty=config['init_inner_kl_penalty'],
        adaptive_inner_kl_penalty=config['adaptive_inner_kl_penalty'],
    )

    trainer = RLTrainer(algo=algo,
                        policy=policy,
                        env=env,
                        sampler=sampler,
                        sample_processor=sample_processor,
                        n_itr=config['n_itr'],
                        num_inner_grad_steps=config['num_inner_grad_steps'],
                        experiment=experiment)

    trainer.train()
Example #4
def main(config):
    set_seed(config['seed'])

    baseline = globals()[config['baseline']]()  #instantiate baseline

    env = globals()[config['env']]()  # instantiate env
    env = normalize(env)  # apply normalize wrapper to env

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=config['meta_batch_size'],
        hidden_sizes=config['hidden_sizes'],
    )

    sampler = MetaSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config[
            'rollouts_per_meta_task'],  # This batch_size is confusing
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
    )

    sample_processor = MetaSampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
    )

    algo = VPGMAML(
        policy=policy,
        learning_rate=config['learning_rate'],
        inner_type=config['inner_type'],
        inner_lr=config['inner_lr'],
        meta_batch_size=config['meta_batch_size'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        exploration=False,
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=config['n_itr'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
    )

    trainer.train()
Example #5
def __init__(self, env, meta_batch_size, envs_per_task, max_path_length):
    self.envs = []
    print("env:", env)
    if env is None:
        for _ in range(meta_batch_size * envs_per_task):
            env = terrainRLSim.getEnv(env_name="PD_Humanoid_3D_GRF_Mixed_1Sub_Imitate_30FPS_DenseState_v0", render=True)
            # env = globals()[config['env']]() # instantiate env
            env = normalize(env)  # apply normalize wrapper to env
            self.envs.append(env)
    else:
        self.envs = np.asarray([copy.deepcopy(env) for _ in range(meta_batch_size * envs_per_task)])
    self.ts = np.zeros(len(self.envs), dtype='int')  # time steps
    self.max_path_length = max_path_length
Example #6
def run_experiment(**kwargs):
    exp_dir = os.getcwd() + '/data/' + EXP_NAME
    logger.configure(dir=exp_dir,
                     format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last_gap',
                     snapshot_gap=50)
    json.dump(kwargs,
              open(exp_dir + '/params.json', 'w'),
              indent=2,
              sort_keys=True,
              cls=ClassEncoder)

    # Instantiate classes
    set_seed(kwargs['seed'])

    baseline = kwargs['baseline']()

    env = normalize(kwargs['env']())  # Wrappers?

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),  # Todo...?
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=kwargs['meta_batch_size'],
        hidden_sizes=kwargs['hidden_sizes'],
        learn_std=kwargs['learn_std'],
        hidden_nonlinearity=kwargs['hidden_nonlinearity'],
        output_nonlinearity=kwargs['output_nonlinearity'],
    )

    # Load policy here

    sampler = MAMLSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=kwargs['rollouts_per_meta_task'],
        meta_batch_size=kwargs['meta_batch_size'],
        max_path_length=kwargs['max_path_length'],
        parallel=kwargs['parallel'],
        envs_per_task=1,
    )

    sample_processor = MAMLSampleProcessor(
        baseline=baseline,
        discount=kwargs['discount'],
        gae_lambda=kwargs['gae_lambda'],
        normalize_adv=kwargs['normalize_adv'],
        positive_adv=kwargs['positive_adv'],
    )

    algo = TRPOMAML(
        policy=policy,
        step_size=kwargs['step_size'],
        inner_type=kwargs['experiment_tuple'][1],
        inner_lr=kwargs['inner_lr'],
        meta_batch_size=kwargs['meta_batch_size'],
        num_inner_grad_steps=kwargs['num_inner_grad_steps'],
        exploration=kwargs['experiment_tuple'][2],
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=kwargs['n_itr'],
        num_inner_grad_steps=kwargs['num_inner_grad_steps'],
    )

    trainer.train()
Example #7
def main(config):
    set_seed(config['seed'])

    baseline = globals()[config['baseline']]()  #instantiate baseline

    env = globals()[config['env']]()  # instantiate env
    env = normalize(env)  # apply normalize wrapper to env

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=config['meta_batch_size'],
        hidden_sizes=config['hidden_sizes'],
    )

    sampler = MetaSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config[
            'rollouts_per_meta_task'],  # This batch_size is confusing
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
    )

    sample_processor = MetaSampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
    )

    algo = ProMP(
        policy=policy,
        inner_lr=config['inner_lr'],
        meta_batch_size=config['meta_batch_size'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        learning_rate=config['learning_rate'],
        num_ppo_steps=config['num_promp_steps'],
        clip_eps=config['clip_eps'],
        target_inner_step=config['target_inner_step'],
        init_inner_kl_penalty=config['init_inner_kl_penalty'],
        adaptive_inner_kl_penalty=config['adaptive_inner_kl_penalty'],
    )

    gpu_config = tf.ConfigProto()
    gpu_config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=gpu_config)

    saver = tf.train.Saver(
        keep_checkpoint_every_n_hours=config['keep_checkpoint_every_n_hours'],
        max_to_keep=config['max_checkpoints_to_keep'])

    save_path = os.path.join(args.dump_path, 'model.ckpt')

    if config['restore_path'] is not None:
        logger.log('Restoring parameters from {}'.format(
            config['restore_path']))
        saver.restore(sess, config['restore_path'])
        logger.log('Restored')

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        saver=saver,
        save_path=save_path,
        save_steps=config['save_steps'],
        n_itr=config['n_itr'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        sess=sess,
    )

    trainer.train()
Example #8
def main(config):
    # config['seed'] = 4
    # experiment.set_name("pos task only, size = 15, logging vel")
    set_seed(config['seed'])
    # experiment.log_parameters(config)
    # experiment.log_parameter("task limit size", 3)

    # experiment.log_metric("seed", config['seed'])
    baseline = globals()[config['baseline']]()  #instantiate baseline

    env = globals()[config['env']]()  # instantiate env
    print("env: ", env.sample_tasks)
    TASKSL1 = np.array([0, -0.3])
    env.set_tasks(TASKSL1)
    env = normalize(env)  # apply normalize wrapper to env

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=config['meta_batch_size'],
        hidden_sizes=config['hidden_sizes'],
    )

    sampler = MetaSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config[
            'rollouts_per_meta_task'],  # This batch_size is confusing
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
    )

    sample_processor = MetaSampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
    )

    algo = ProMP(
        policy=policy,
        inner_lr=config['inner_lr'],
        meta_batch_size=config['meta_batch_size'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        learning_rate=config['learning_rate'],
        num_ppo_steps=config['num_promp_steps'],
        clip_eps=config['clip_eps'],
        target_inner_step=config['target_inner_step'],
        init_inner_kl_penalty=config['init_inner_kl_penalty'],
        adaptive_inner_kl_penalty=config['adaptive_inner_kl_penalty'],
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=config['n_itr'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
    )

    trainer.train()
Example #9
def main(config):

    experiment.set_name("short meta saving test")
    set_seed(config['seed'])
    experiment.log_parameters(config)

    baseline = globals()[config['baseline']]()  #instantiate baseline

    env = terrainRLSim.getEnv(env_name=None, render=False)
    # env = normalize(env) # apply normalize wrapper to env

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod((104, )),
        action_dim=np.prod((11, )),
        meta_batch_size=config['meta_batch_size'],
        hidden_sizes=config['hidden_sizes'],
    )

    sampler = MetaSampler(
        env=('terrianrlSim', config['env']),
        policy=policy,
        rollouts_per_meta_task=config[
            'rollouts_per_meta_task'],  # This batch_size is confusing
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
    )
    env = terrainRLSim.getEnv(env_name=config['env'], render=False)
    # env = globals()[config['env']]() # instantiate env
    env = normalize(env)  # apply normalize wrapper to env
    print("env.observation_space.shape: ", env.observation_space.shape)
    print("env.action_space.shape: ", env.action_space.shape)
    sampler.set_env(env)

    sample_processor = MetaSampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
    )

    algo = ProMP(
        policy=policy,
        inner_lr=config['inner_lr'],
        meta_batch_size=config['meta_batch_size'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        learning_rate=config['learning_rate'],
        num_ppo_steps=config['num_promp_steps'],
        clip_eps=config['clip_eps'],
        target_inner_step=config['target_inner_step'],
        init_inner_kl_penalty=config['init_inner_kl_penalty'],
        adaptive_inner_kl_penalty=config['adaptive_inner_kl_penalty'],
    )

    trainer = Trainer(algo=algo,
                      policy=policy,
                      env=env,
                      sampler=sampler,
                      sample_processor=sample_processor,
                      n_itr=config['n_itr'],
                      num_inner_grad_steps=config['num_inner_grad_steps'],
                      experiment=experiment)

    trainer.train()
Example #10
def worker(remote, parent_remote, env_pickle, n_envs, max_path_length, seed):
    """
    Instantiation of a parallel worker for collecting samples. It loops continuously,
    executing the commands that the remote sends to it.

    Args:
        remote (multiprocessing.Connection): worker end of the pipe, used to receive commands and send back results
        parent_remote (multiprocessing.Connection): parent end of the pipe, closed inside the worker
        env_pickle (pkl): pickled environment
        n_envs (int): number of environments per worker
        max_path_length (int): maximum path length of the task
        seed (int): random seed for the worker
    """
    parent_remote.close()
    # print ("env_pickle: ", env_pickle)
    # sys.exit()
    envs = []
    if type(env_pickle) is tuple:
        for _ in range(n_envs):
            if (env_pickle[0] == 'terrianrlSim'):
                env = terrainRLSim.getEnv(env_name=env_pickle[1], render=False)
                # env = globals()[config['env']]() # instantiate env
                env = normalize(env)  # apply normalize wrapper to env
                envs.append(env)
    else:
        envs = [pickle.loads(env_pickle) for _ in range(n_envs)]

    np.random.seed(seed)

    ts = np.zeros(n_envs, dtype='int')

    while True:
        # receive command and data from the remote
        cmd, data = remote.recv()

        # do a step in each of the environment of the worker
        if cmd == 'step':
            all_results = [env.step(a) for (a, env) in zip(data, envs)]
            obs, rewards, dones, infos = map(list, zip(*all_results))
            ts += 1
            for i in range(n_envs):
                if dones[i] or (ts[i] >= max_path_length):
                    dones[i] = True
                    obs[i] = envs[i].reset()
                    ts[i] = 0
            remote.send((obs, rewards, dones, infos))

        # reset all the environments of the worker
        elif cmd == 'reset':
            obs = [env.reset() for env in envs]
            ts[:] = 0
            remote.send(obs)

        # set the specified task for each of the environments of the worker
        elif cmd == 'set_task':
            for env in envs:
                env.set_task(data)
            remote.send(None)

        # close the remote and stop the worker
        elif cmd == 'close':
            remote.close()
            break

        else:
            raise NotImplementedError
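
The worker above speaks a small (cmd, data) protocol over a duplex pipe ('step', 'reset', 'set_task', 'close'). Below is a minimal, hypothetical parent-side sketch, not part of the original source, showing how such a worker could be spawned and driven; launch_worker and the concrete arguments are assumptions for illustration.

# Hypothetical parent-side driver for the worker above (illustrative only).
import pickle
from multiprocessing import Pipe, Process

def launch_worker(env, n_envs, max_path_length, seed):
    parent_conn, child_conn = Pipe()  # duplex pipe: parent_conn <-> child_conn
    proc = Process(target=worker,
                   args=(child_conn, parent_conn, pickle.dumps(env),
                         n_envs, max_path_length, seed))
    proc.start()
    child_conn.close()  # the worker process keeps its own handle to this end
    return parent_conn, proc

# Assumed usage with some already-constructed `env`:
# remote, proc = launch_worker(env, n_envs=2, max_path_length=100, seed=0)
# remote.send(('reset', None))                        # reset all envs in the worker
# obs = remote.recv()
# actions = [env.action_space.sample() for _ in obs]  # placeholder actions, one per env
# remote.send(('step', actions))
# obs, rewards, dones, infos = remote.recv()
# remote.send(('close', None))                        # stop the worker loop
# proc.join()
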
Example #11
    parser.add_argument('--video_filename', default=None)
    parser.add_argument('--num_trajs', type=int, default=10)
    args = parser.parse_args(sys.argv[1:])

    params_path = os.path.join(
        os.path.split(args.restore_path)[0], 'params.json')

    with open(params_path, 'r') as f:
        params = json.load(f)

    params.update(args.overrides)

    baseline = LinearFeatureBaseline()

    env = globals()[params['env']]()  # instantiate env
    env = normalize(env)  # apply normalize wrapper to env

    gpu_config = tf.ConfigProto()
    gpu_config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=gpu_config)

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=params['meta_batch_size'],
        hidden_sizes=params['hidden_sizes'],
        cell_size=params['cell_size'],
        rollouts_per_meta_task=params['rollouts_per_meta_task'],
        max_path_length=params['max_path_length'],
        use_betas=params['use_betas'],