コード例 #1
0
def main(config):
    """Assemble a ProMP training run from ``config`` and execute it.

    ``config`` is indexed like a dict. Keys read here: ``seed``, ``baseline``,
    ``env``, ``meta_batch_size``, ``hidden_sizes``, ``rollouts_per_meta_task``,
    ``max_path_length``, ``parallel``, ``discount``, ``gae_lambda``,
    ``normalize_adv``, ``inner_lr``, ``num_inner_grad_steps``,
    ``learning_rate``, ``num_promp_steps``, ``clip_eps``,
    ``target_inner_step``, ``init_inner_kl_penalty``,
    ``adaptive_inner_kl_penalty`` and ``n_itr``.

    ``config['baseline']`` and ``config['env']`` must name callables present
    in this module's globals; each one is called without arguments.
    """

    def from_config(*keys):
        # Map each key to config[key], in the order given, so the result can
        # be unpacked into a constructor as keyword arguments.
        return {key: config[key] for key in keys}

    set_seed(config['seed'])

    # Look the baseline and environment up by name, then instantiate them.
    baseline_cls = globals()[config['baseline']]
    baseline = baseline_cls()
    env_cls = globals()[config['env']]
    env = normalize(env_cls())  # wrap the raw env with `normalize`

    obs_dim = np.prod(env.observation_space.shape)
    action_dim = np.prod(env.action_space.shape)
    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=obs_dim,
        action_dim=action_dim,
        **from_config('meta_batch_size', 'hidden_sizes')
    )

    sampler = MetaSampler(
        env=env,
        policy=policy,
        **from_config('rollouts_per_meta_task', 'meta_batch_size',
                      'max_path_length', 'parallel')
    )

    sample_processor = MetaSampleProcessor(
        baseline=baseline,
        **from_config('discount', 'gae_lambda', 'normalize_adv')
    )

    algo = ProMP(
        policy=policy,
        **from_config('inner_lr', 'meta_batch_size', 'num_inner_grad_steps',
                      'learning_rate'),
        # The constructor keyword differs from the config key here.
        num_ppo_steps=config['num_promp_steps'],
        **from_config('clip_eps', 'target_inner_step',
                      'init_inner_kl_penalty', 'adaptive_inner_kl_penalty')
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        **from_config('n_itr', 'num_inner_grad_steps')
    )
    trainer.train()
コード例 #2
0
def main(config):
    """Wire up a TMAML training run from ``config`` and start it.

    ``config`` is indexed like a dict. Keys read here: ``seed``, ``env``,
    ``meta_batch_size``, ``hidden_sizes``, ``rollouts_per_meta_task``,
    ``max_path_length``, ``parallel``, ``discount``, ``normalize_adv``,
    ``positive_adv``, ``num_inner_grad_steps``, ``inner_lr``,
    ``learning_rate`` and ``n_itr``.

    ``config['env']`` must name a callable present in this module's globals;
    it is called without arguments.
    """

    def from_config(*keys):
        # Map each key to config[key], in the order given, so the result can
        # be unpacked into a constructor as keyword arguments.
        return {key: config[key] for key in keys}

    set_seed(config['seed'])

    # Two of the three baselines handed to the sample processor below; the
    # third (meta_baseline) needs the env and is built after it.
    reward_baseline = LinearTimeBaseline()
    return_baseline = LinearFeatureBaseline()

    env_cls = globals()[config['env']]
    env = normalize(env_cls())  # wrap the raw env with `normalize`

    # The meta baseline is sized from the first observation dimension.
    first_obs_dim = env.observation_space.shape[0]
    meta_baseline = MetaNNBaseline(input_size=first_obs_dim)

    obs_dim = np.prod(env.observation_space.shape)
    action_dim = np.prod(env.action_space.shape)
    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=obs_dim,
        action_dim=action_dim,
        **from_config('meta_batch_size', 'hidden_sizes')
    )

    sampler = MetaSampler(
        env=env,
        policy=policy,
        **from_config('rollouts_per_meta_task', 'meta_batch_size',
                      'max_path_length', 'parallel')
    )

    sample_processor = TMAMLMetaSampleProcessor(
        baseline=reward_baseline,
        **from_config('max_path_length', 'discount', 'normalize_adv',
                      'positive_adv'),
        return_baseline=return_baseline,
        metabaseline=meta_baseline,
    )

    algo = TMAML(
        policy=policy,
        **from_config('max_path_length', 'meta_batch_size',
                      'num_inner_grad_steps', 'inner_lr', 'learning_rate')
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        **from_config('n_itr', 'num_inner_grad_steps')
    )
    trainer.train()
コード例 #3
0
def main(config):
    """Continue a ProMP run from a pickled policy, logging to ``experiment``.

    ``config`` is indexed like a dict. Keys read here: ``seed``, ``baseline``,
    ``env``, ``rollouts_per_meta_task``, ``meta_batch_size``,
    ``max_path_length``, ``parallel``, ``discount``, ``gae_lambda``,
    ``normalize_adv``, ``inner_lr``, ``num_inner_grad_steps``,
    ``learning_rate``, ``num_promp_steps``, ``clip_eps``,
    ``target_inner_step``, ``init_inner_kl_penalty``,
    ``adaptive_inner_kl_penalty`` and ``n_itr``.

    The policy is not built here: it is unpickled from
    ``/saved_policies/mjvel.policy``. ``experiment`` is a free name that is
    not defined in this function (presumably module-level - TODO confirm).
    """

    def from_config(*keys):
        # Map each key to config[key], in the order given, so the result can
        # be unpacked into a constructor as keyword arguments.
        return {key: config[key] for key in keys}

    experiment.set_name("short meta saving test")
    set_seed(config['seed'])
    experiment.log_parameters(config)

    # Look the baseline and environment up by name, then instantiate them.
    baseline_cls = globals()[config['baseline']]
    baseline = baseline_cls()
    env_cls = globals()[config['env']]
    env = normalize(env_cls())  # wrap the raw env with `normalize`

    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # point this at policy files you trust.
    with open('/saved_policies/mjvel.policy', 'rb') as policy_file:
        policy = pickle.load(policy_file)
        print("policy loaded")

    sampler = MetaSampler(
        env=env,
        policy=policy,
        **from_config('rollouts_per_meta_task', 'meta_batch_size',
                      'max_path_length', 'parallel')
    )

    sample_processor = MetaSampleProcessor(
        baseline=baseline,
        **from_config('discount', 'gae_lambda', 'normalize_adv')
    )

    algo = ProMP(
        policy=policy,
        **from_config('inner_lr', 'meta_batch_size', 'num_inner_grad_steps',
                      'learning_rate'),
        # The constructor keyword differs from the config key here.
        num_ppo_steps=config['num_promp_steps'],
        **from_config('clip_eps', 'target_inner_step',
                      'init_inner_kl_penalty', 'adaptive_inner_kl_penalty')
    )

    trainer = RLTrainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        **from_config('n_itr', 'num_inner_grad_steps'),
        experiment=experiment,
    )
    trainer.train()
コード例 #4
0
def main(config):
    """Load a saved policy/env/baseline snapshot and run ``Tester`` on it.

    ``config`` is indexed like a dict. Keys read here: ``seed``,
    ``rollouts_per_meta_task``, ``meta_batch_size``, ``max_path_length``,
    ``parallel``, ``discount``, ``gae_lambda``, ``normalize_adv``,
    ``inner_lr``, ``num_inner_grad_steps``, ``learning_rate``,
    ``num_promp_steps``, ``clip_eps``, ``target_inner_step``,
    ``init_inner_kl_penalty``, ``adaptive_inner_kl_penalty`` and ``eff``.

    The snapshot is read from ``load_path + "/params.pkl"`` and must contain
    the entries ``'policy'``, ``'env'`` and ``'baseline'``. ``load_path`` is a
    free name that is not defined in this function (presumably module-level -
    TODO confirm).

    The TensorFlow session is always closed, including when loading the
    snapshot or the test run raises.
    """
    set_seed(config['seed'])
    sess = tf.Session()

    # try/finally releases the session on every exit path; previously
    # close() was only reached when everything above it succeeded.
    try:
        with sess.as_default():
            # NOTE(review): joblib.load unpickles arbitrary objects; only
            # load snapshots from a trusted source.
            data = joblib.load(load_path + "/params.pkl")
            policy = data['policy']
            env = data['env']
            baseline = data['baseline']

            sampler = MetaSampler(
                env=env,
                policy=policy,
                rollouts_per_meta_task=config['rollouts_per_meta_task'],
                meta_batch_size=config['meta_batch_size'],
                max_path_length=config['max_path_length'],
                parallel=config['parallel'],
            )

            sample_processor = MetaSampleProcessor(
                baseline=baseline,
                discount=config['discount'],
                gae_lambda=config['gae_lambda'],
                normalize_adv=config['normalize_adv'],
            )

            algo = ProMP(
                policy=policy,
                inner_lr=config['inner_lr'],
                meta_batch_size=config['meta_batch_size'],
                num_inner_grad_steps=config['num_inner_grad_steps'],
                learning_rate=config['learning_rate'],
                num_ppo_steps=config['num_promp_steps'],
                clip_eps=config['clip_eps'],
                target_inner_step=config['target_inner_step'],
                init_inner_kl_penalty=config['init_inner_kl_penalty'],
                adaptive_inner_kl_penalty=config['adaptive_inner_kl_penalty'],
            )

            # Tester takes `eff` where the training scripts pass `n_itr`.
            tester = Tester(
                algo=algo,
                policy=policy,
                env=env,
                sampler=sampler,
                sample_processor=sample_processor,
                eff=config['eff'],
                num_inner_grad_steps=config['num_inner_grad_steps'],
            )

            tester.train()
    finally:
        sess.close()
コード例 #5
0
ファイル: norm_vpg.py プロジェクト: MinorJerry/NG_MAML
def main(config):
    """Assemble a VPGMAML training run from ``config`` and execute it.

    ``config`` is indexed like a dict. Keys read here: ``seed``, ``baseline``,
    ``env``, ``meta_batch_size``, ``hidden_sizes``, ``rollouts_per_meta_task``,
    ``max_path_length``, ``parallel``, ``discount``, ``gae_lambda``,
    ``normalize_adv``, ``learning_rate``, ``inner_type``, ``inner_lr``,
    ``num_inner_grad_steps`` and ``n_itr``.

    ``config['baseline']`` and ``config['env']`` must name callables present
    in this module's globals; each one is called without arguments.
    """

    def from_config(*keys):
        # Map each key to config[key], in the order given, so the result can
        # be unpacked into a constructor as keyword arguments.
        return {key: config[key] for key in keys}

    set_seed(config['seed'])

    # Look the baseline and environment up by name, then instantiate them.
    baseline_cls = globals()[config['baseline']]
    baseline = baseline_cls()
    env_cls = globals()[config['env']]
    env = normalize(env_cls())  # wrap the raw env with `normalize`

    obs_dim = np.prod(env.observation_space.shape)
    action_dim = np.prod(env.action_space.shape)
    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=obs_dim,
        action_dim=action_dim,
        **from_config('meta_batch_size', 'hidden_sizes')
    )

    sampler = MetaSampler(
        env=env,
        policy=policy,
        **from_config('rollouts_per_meta_task', 'meta_batch_size',
                      'max_path_length', 'parallel')
    )

    sample_processor = MetaSampleProcessor(
        baseline=baseline,
        **from_config('discount', 'gae_lambda', 'normalize_adv')
    )

    algo = VPGMAML(
        policy=policy,
        **from_config('learning_rate', 'inner_type', 'inner_lr',
                      'meta_batch_size', 'num_inner_grad_steps'),
        exploration=False,  # fixed here rather than read from config
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        **from_config('n_itr', 'num_inner_grad_steps')
    )
    trainer.train()
コード例 #6
0
 def setUp(self):
     """Create the random env/policy, size constants, baseline and sampler.

     The sampler is built with positional arguments in the order
     (env, policy, batch_size, meta_batch_size, path_length) and with
     ``parallel=True``.
     """
     self.random_env = env = RandomEnv()
     self.random_policy = policy = RandomPolicy(1, 1)

     # Sizes shared by the tests in this case.
     self.meta_batch_size, self.batch_size, self.path_length = 2, 10, 100

     self.linear = LinearFeatureBaseline()

     sampler_args = (
         env,
         policy,
         self.batch_size,
         self.meta_batch_size,
         self.path_length,
     )
     self.sampler = MetaSampler(*sampler_args, parallel=True)
コード例 #7
0
    def setUp(self):
        """Build the env, baseline, policy, sampler, processor and ProMP fixtures.

        Every component is stored on ``self`` (``env``, ``baseline``,
        ``policy``, ``sampler``, ``sample_processor``, ``algo``) with fixed
        hyper-parameters; the meta batch size of 10 is shared by the policy,
        the sampler and the algorithm.
        """
        meta_batch_size = 10

        env = MetaPointEnv()
        self.env = env

        baseline = LinearFeatureBaseline()
        self.baseline = baseline

        policy_kwargs = dict(
            name="meta-policy",
            obs_dim=np.prod(env.observation_space.shape),
            action_dim=np.prod(env.action_space.shape),
            meta_batch_size=meta_batch_size,
            hidden_sizes=(16, 16),
            learn_std=True,
            hidden_nonlinearity=tf.tanh,
            output_nonlinearity=None,
        )
        policy = MetaGaussianMLPPolicy(**policy_kwargs)
        self.policy = policy

        sampler_kwargs = dict(
            env=env,
            policy=policy,
            rollouts_per_meta_task=2,
            meta_batch_size=meta_batch_size,
            max_path_length=50,
            parallel=False,
        )
        self.sampler = MetaSampler(**sampler_kwargs)

        processor_kwargs = dict(
            baseline=baseline,
            discount=0.99,
            gae_lambda=1.0,
            normalize_adv=True,
            positive_adv=False,
        )
        self.sample_processor = MetaSampleProcessor(**processor_kwargs)

        algo_kwargs = dict(
            policy=policy,
            inner_lr=0.1,
            meta_batch_size=meta_batch_size,
            num_inner_grad_steps=2,
            learning_rate=1e-3,
            num_ppo_steps=5,
            num_minibatches=1,
            clip_eps=0.5,
            target_inner_step=2e-2,
            init_inner_kl_penalty=1e-3,
        )
        self.algo = ProMP(**algo_kwargs)
コード例 #8
0
def main(config):
    """Run ProMP training with TensorFlow checkpoint saving and optional restore.

    ``config`` is indexed like a dict. Keys read here: ``seed``, ``baseline``,
    ``env``, ``meta_batch_size``, ``hidden_sizes``, ``rollouts_per_meta_task``,
    ``max_path_length``, ``parallel``, ``discount``, ``gae_lambda``,
    ``normalize_adv``, ``inner_lr``, ``num_inner_grad_steps``,
    ``learning_rate``, ``num_promp_steps``, ``clip_eps``,
    ``target_inner_step``, ``init_inner_kl_penalty``,
    ``adaptive_inner_kl_penalty``, ``keep_checkpoint_every_n_hours``,
    ``max_checkpoints_to_keep``, ``restore_path``, ``save_steps`` and
    ``n_itr``.

    When ``config['restore_path']`` is not ``None`` the saver restores the
    session from it before training. The checkpoint path is
    ``<args.dump_path>/model.ckpt``; ``args`` is a free name that is not
    defined in this function (presumably module-level - TODO confirm).
    """

    def from_config(*keys):
        # Map each key to config[key], in the order given, so the result can
        # be unpacked into a constructor as keyword arguments.
        return {key: config[key] for key in keys}

    set_seed(config['seed'])

    # Look the baseline and environment up by name, then instantiate them.
    baseline_cls = globals()[config['baseline']]
    baseline = baseline_cls()
    env_cls = globals()[config['env']]
    env = normalize(env_cls())  # wrap the raw env with `normalize`

    obs_dim = np.prod(env.observation_space.shape)
    action_dim = np.prod(env.action_space.shape)
    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=obs_dim,
        action_dim=action_dim,
        **from_config('meta_batch_size', 'hidden_sizes')
    )

    sampler = MetaSampler(
        env=env,
        policy=policy,
        **from_config('rollouts_per_meta_task', 'meta_batch_size',
                      'max_path_length', 'parallel')
    )

    sample_processor = MetaSampleProcessor(
        baseline=baseline,
        **from_config('discount', 'gae_lambda', 'normalize_adv')
    )

    algo = ProMP(
        policy=policy,
        **from_config('inner_lr', 'meta_batch_size', 'num_inner_grad_steps',
                      'learning_rate'),
        # The constructor keyword differs from the config key here.
        num_ppo_steps=config['num_promp_steps'],
        **from_config('clip_eps', 'target_inner_step',
                      'init_inner_kl_penalty', 'adaptive_inner_kl_penalty')
    )

    # Session configured with allow_growth enabled on its GPU options.
    session_options = tf.ConfigProto()
    session_options.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=session_options)

    keep_every_n_hours = config['keep_checkpoint_every_n_hours']
    max_to_keep = config['max_checkpoints_to_keep']
    saver = tf.train.Saver(
        keep_checkpoint_every_n_hours=keep_every_n_hours,
        max_to_keep=max_to_keep)

    save_path = os.path.join(args.dump_path, 'model.ckpt')

    restore_path = config['restore_path']
    if restore_path is not None:
        logger.log('Restoring parameters from {}'.format(restore_path))
        saver.restore(sess, restore_path)
        logger.log('Restored')

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        saver=saver,
        save_path=save_path,
        **from_config('save_steps', 'n_itr', 'num_inner_grad_steps'),
        sess=sess,
    )
    trainer.train()
コード例 #9
0
def main(config):
    """Run ProMP training on an env whose tasks are pinned to a fixed array.

    ``config`` is indexed like a dict. Keys read here: ``seed``, ``baseline``,
    ``env``, ``meta_batch_size``, ``hidden_sizes``, ``rollouts_per_meta_task``,
    ``max_path_length``, ``parallel``, ``discount``, ``gae_lambda``,
    ``normalize_adv``, ``inner_lr``, ``num_inner_grad_steps``,
    ``learning_rate``, ``num_promp_steps``, ``clip_eps``,
    ``target_inner_step``, ``init_inner_kl_penalty``,
    ``adaptive_inner_kl_penalty`` and ``n_itr``.

    ``config['baseline']`` and ``config['env']`` must name callables present
    in this module's globals; each one is called without arguments. Before
    the env is wrapped, ``env.set_tasks`` is called with
    ``np.array([0, -0.3])``.
    """

    def from_config(*keys):
        # Map each key to config[key], in the order given, so the result can
        # be unpacked into a constructor as keyword arguments.
        return {key: config[key] for key in keys}

    set_seed(config['seed'])

    baseline_cls = globals()[config['baseline']]
    baseline = baseline_cls()

    env_cls = globals()[config['env']]
    env = env_cls()
    print("env: ", env.sample_tasks)
    # Pin the task set on the raw env, then apply the `normalize` wrapper.
    fixed_tasks = np.array([0, -0.3])
    env.set_tasks(fixed_tasks)
    env = normalize(env)

    obs_dim = np.prod(env.observation_space.shape)
    action_dim = np.prod(env.action_space.shape)
    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=obs_dim,
        action_dim=action_dim,
        **from_config('meta_batch_size', 'hidden_sizes')
    )

    sampler = MetaSampler(
        env=env,
        policy=policy,
        **from_config('rollouts_per_meta_task', 'meta_batch_size',
                      'max_path_length', 'parallel')
    )

    sample_processor = MetaSampleProcessor(
        baseline=baseline,
        **from_config('discount', 'gae_lambda', 'normalize_adv')
    )

    algo = ProMP(
        policy=policy,
        **from_config('inner_lr', 'meta_batch_size', 'num_inner_grad_steps',
                      'learning_rate'),
        # The constructor keyword differs from the config key here.
        num_ppo_steps=config['num_promp_steps'],
        **from_config('clip_eps', 'target_inner_step',
                      'init_inner_kl_penalty', 'adaptive_inner_kl_penalty')
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        **from_config('n_itr', 'num_inner_grad_steps')
    )
    trainer.train()
コード例 #10
0
def main(args=None):
    """Parse arguments, dump the run configuration and train ProMP.

    Args:
        args: Passed straight to ``parse_args``; the parsed result replaces
            it and supplies every hyper-parameter as well as ``dump_path``.

    Raises:
        Exception: If the environment's action space is neither a
            ``gym.spaces.Box`` nor a ``gym.spaces.Discrete``.

    Side effects: configures ``logger`` to write under ``args.dump_path`` and
    writes the configuration to ``<args.dump_path>/params.json`` before
    training starts.
    """
    args = parse_args(args)

    config = {
        'seed': args.seed,
        'baseline': 'LinearFeatureBaseline',
        'env': 'ReachWorld',  # not used
        'rollouts_per_meta_task': args.rollout_per_meta_task,
        'max_path_length': args.max_path_length,  # 100
        'parallel': not args.seq,
        'discount': args.discount,
        'gae_lambda': args.gae_lambda,
        'normalize_adv': True,
        'hidden_sizes': args.hidden_sizes,
        # adaptation step size
        'inner_lr': args.inner_lr,
        # meta-policy gradient step size
        'learning_rate': args.learning_rate,
        # number of ProMp steps without re-sampling
        'num_promp_steps': args.num_promp_steps,
        # clipping range
        'clip_eps': args.clip_eps,
        'target_inner_step': args.target_inner_step,
        'init_inner_kl_penalty': args.init_inner_kl_penalty,
        # whether to use an adaptive or fixed KL-penalty coefficient
        'adaptive_inner_kl_penalty': args.adaptive_inner_kl_penalty,
        # number of overall training iterations
        'n_itr': args.n_itr,
        # number of sampled meta-tasks per iterations
        'meta_batch_size': args.meta_batch_size,
        # number of inner / adaptation gradient steps
        'num_inner_grad_steps': args.num_inner_grad_steps,
    }

    # configure logger
    logger.configure(dir=args.dump_path,
                     format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last_gap')

    # Dump the run configuration before starting training. The context
    # manager flushes and closes params.json instead of leaking the handle.
    with open(args.dump_path + '/params.json', 'w') as params_file:
        json.dump(config, params_file, cls=ClassEncoder)

    set_seed(config['seed'])

    baseline = globals()[config['baseline']]()  # instantiate baseline

    # The env comes from get_env(), not from config['env'], and is used
    # without the normalize wrapper.
    env = get_env()

    # Box spaces use the product of the shape; Discrete spaces use `n`.
    if isinstance(env.action_space, gym.spaces.Box):
        action_dim = np.prod(env.action_space.shape)
    elif isinstance(env.action_space, gym.spaces.Discrete):
        action_dim = env.action_space.n
    else:
        raise Exception('unknown action space, cannot get action dim')

    policy = MetaCategoricalMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=action_dim,
        meta_batch_size=config['meta_batch_size'],
        hidden_sizes=config['hidden_sizes'],
    )

    sampler = MetaSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config['rollouts_per_meta_task'],
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
    )

    sample_processor = MetaSampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
    )

    algo = ProMP(
        policy=policy,
        inner_lr=config['inner_lr'],
        meta_batch_size=config['meta_batch_size'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        learning_rate=config['learning_rate'],
        num_ppo_steps=config['num_promp_steps'],
        clip_eps=config['clip_eps'],
        target_inner_step=config['target_inner_step'],
        init_inner_kl_penalty=config['init_inner_kl_penalty'],
        adaptive_inner_kl_penalty=config['adaptive_inner_kl_penalty'],
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=config['n_itr'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
    )

    trainer.train()
コード例 #11
0
ファイル: pro-mp_run_humanoid2D.py プロジェクト: Neo-X/ProMP
def main(config):
    """Run ProMP training on a terrainRLSim environment, logging to ``experiment``.

    ``config`` is indexed like a dict. Keys read here: ``seed``, ``baseline``,
    ``env``, ``meta_batch_size``, ``hidden_sizes``, ``rollouts_per_meta_task``,
    ``max_path_length``, ``parallel``, ``discount``, ``gae_lambda``,
    ``normalize_adv``, ``inner_lr``, ``num_inner_grad_steps``,
    ``learning_rate``, ``num_promp_steps``, ``clip_eps``,
    ``target_inner_step``, ``init_inner_kl_penalty``,
    ``adaptive_inner_kl_penalty`` and ``n_itr``.

    The policy uses hard-coded sizes (observation 104, action 11). The
    sampler is first constructed with a ``('terrianrlSim', config['env'])``
    tuple in place of an env object and receives the real, normalize-wrapped
    env afterwards through ``sampler.set_env``. ``experiment`` is a free name
    that is not defined in this function (presumably module-level - TODO
    confirm).
    """

    def from_config(*keys):
        # Map each key to config[key], in the order given, so the result can
        # be unpacked into a constructor as keyword arguments.
        return {key: config[key] for key in keys}

    experiment.set_name("short meta saving test")
    set_seed(config['seed'])
    experiment.log_parameters(config)

    baseline_cls = globals()[config['baseline']]
    baseline = baseline_cls()

    # NOTE(review): this first getEnv result is replaced below before it is
    # ever used; kept because the call may have side effects - confirm.
    env = terrainRLSim.getEnv(env_name=None, render=False)

    obs_dim = np.prod((104, ))
    action_dim = np.prod((11, ))
    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=obs_dim,
        action_dim=action_dim,
        **from_config('meta_batch_size', 'hidden_sizes')
    )

    env_descriptor = ('terrianrlSim', config['env'])
    sampler = MetaSampler(
        env=env_descriptor,
        policy=policy,
        **from_config('rollouts_per_meta_task', 'meta_batch_size',
                      'max_path_length', 'parallel')
    )

    # Build the env named in config, wrap it with `normalize`, and hand it
    # to the already-constructed sampler.
    env = normalize(terrainRLSim.getEnv(env_name=config['env'], render=False))
    print("env.observation_space.shape: ", env.observation_space.shape)
    print("env.action_space.shape: ", env.action_space.shape)
    sampler.set_env(env)

    sample_processor = MetaSampleProcessor(
        baseline=baseline,
        **from_config('discount', 'gae_lambda', 'normalize_adv')
    )

    algo = ProMP(
        policy=policy,
        **from_config('inner_lr', 'meta_batch_size', 'num_inner_grad_steps',
                      'learning_rate'),
        # The constructor keyword differs from the config key here.
        num_ppo_steps=config['num_promp_steps'],
        **from_config('clip_eps', 'target_inner_step',
                      'init_inner_kl_penalty', 'adaptive_inner_kl_penalty')
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        **from_config('n_itr', 'num_inner_grad_steps'),
        experiment=experiment,
    )
    trainer.train()
コード例 #12
0
ファイル: visualize_policy.py プロジェクト: Zber5/MMAML-rl
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=params['meta_batch_size'],
        hidden_sizes=params['hidden_sizes'],
        cell_size=params['cell_size'],
        rollouts_per_meta_task=params['rollouts_per_meta_task'],
        max_path_length=params['max_path_length'],
        use_betas=params['use_betas'],
        shift_gammas=params['shift_gammas'],
    )

    sampler = MetaSampler(
        env=env,
        policy=policy,
        # This batch_size is confusing
        rollouts_per_meta_task=params['rollouts_per_meta_task'],
        meta_batch_size=params['meta_batch_size'],
        max_path_length=params['max_path_length'],
        parallel=params['parallel'],
    )

    sample_processor = MetaSampleProcessor(
        baseline=baseline,
        discount=params['discount'],
        gae_lambda=params['gae_lambda'],
        normalize_adv=params['normalize_adv'],
    )

    algo = ProMP(
        policy=policy,
        inner_lr=params['inner_lr'],