Example #1
    def __init__(self,
                 tasks=[{
                     'goal': np.array([0, 1.0, 0.05]),
                     'height': 0.06,
                     'obj_init_pos': np.array([0, 0.6, 0.04])
                 }],
                 hand_type='weiss_v2',
                 rewMode='orig',
                 **kwargs):

        self.quick_init(locals())
        self.hand_type = hand_type
        SawyerPushEnv.__init__(self,
                               tasks=tasks,
                               hand_type=hand_type,
                               **kwargs)
        #self.hand_init_pos = [-0.00434313 , 0.76608467 , 0.26081535]
        self.demo = False
        self.max_path_length = 120
        self.camera_name = 'angled_cam'
        self.info_logKeys = ['placingDist', 'pickRew', 'reachRew', 'placeRew']
        self.rewMode = rewMode

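        # 4-dimensional continuous action space, normalized to [-1, 1]; presumably 3D end-effector deltas plus a gripper command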
        self.action_space = Box(
            np.array([-1, -1, -1, -1]),
            np.array([1, 1, 1, 1]),
        )
Example #2
    def __init__(self,
                 tasks=[{
                     'goal': np.array([0, 0.7, 0.02]),
                     'height': 0.06,
                     'obj1_init_pos': np.array([0, 0.6, 0.02])
                 }],
                 liftThresh=0.04,
                 rewMode='orig',
                 **kwargs):

        self.quick_init(locals())
        SawyerPushEnv.__init__(self, tasks=tasks, **kwargs)
        self.info_logKeys = ['placingDist', 'pickRew']
        self.rewMode = rewMode
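        # target lift height: the object's height plus the lifting threshold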
        self.heightTarget = self.objHeight + liftThresh
        self.action_space = Box(
            np.array([-1, -1, -1, -1]),
            np.array([1, 1, 1, 1]),
        )
Example #3
from rllab.envs.normalized_env import normalize
from rllab.misc.instrument import stub, run_experiment_lite
from multiworld.envs.mujoco.sawyer_xyz.push.sawyer_push import SawyerPushEnv

from sandbox.rocky.tf.envs.base import TfEnv
from multiworld.core.flat_goal_env import FlatGoalEnv
from multiworld.core.finn_maml_env import FinnMamlEnv
from multiworld.core.wrapper_env import NormalizedBoxEnv

# Imports needed by the loop below (assumed to live at their usual rllab/sandbox locations)
from sandbox.rocky.tf.algos.trpo import TRPO
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline

stub(globals())
rate = 0.01
mode = 'local'

import tensorflow as tf
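# Set up one TRPO training run per goal index on the Sawyer push task (the launch call is truncated in this example).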
for goal in range(1, 100):
    baseEnv = FlatGoalEnv(SawyerPushEnv(tasks=None),
                          obs_keys=['state_observation'])
    env = TfEnv(NormalizedBoxEnv(FinnMamlEnv(baseEnv, reset_mode='task')))
    #env = WheeledEnvGoal()

    env = TfEnv(env)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_nonlinearity=tf.nn.relu,
                               hidden_sizes=(100, 100))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
Example #4
def experiment(variant):

    seed = variant['seed']
    n_parallel = variant['n_parallel']
    log_dir = variant['log_dir']

    setup(seed, n_parallel, log_dir)
    expertDataLoc = variant['expertDataLoc']
    expertDataItr = variant['expertDataItr']

    fast_learning_rate = variant['flr']

    fast_batch_size = variant['fbs']  # 10 works for [0.1, 0.2], 20 doesn't improve much for [0, 0.2]
    meta_batch_size = 20  # 10 also works, but much less stable, 20 is fairly stable, 40 is more stable
    max_path_length = 150
    num_grad_updates = 1
    meta_step_size = variant['mlr']

    regionSize = variant['regionSize']

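    # Select the pickled goal set that matches the requested sampling region.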
    if regionSize == '20X20':
        tasksFile = '/root/code/multiworld/multiworld/envs/goals/pickPlace_20X20_v1.pkl'

    else:
        assert regionSize == '60X30'
        tasksFile = '/root/code/multiworld/multiworld/envs/goals/PickPlace_60X30.pkl'

    tasks = pickle.load(open(tasksFile, 'rb'))
    envType = variant['envType']

    if envType == 'Push':
        baseEnv = SawyerPushEnv(tasks=tasks)
    else:
        assert envType == 'PickPlace'

        baseEnv = SawyerPickPlaceEnv(tasks=tasks)

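    # Flatten the goal-conditioned observation, add MAML-style task resets, then wrap for normalization and TensorFlow.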
    env = FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=['state_observation']))
    env = TfEnv(NormalizedBoxEnv(env))
    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        grad_step_size=fast_learning_rate,
        hidden_nonlinearity=tf.nn.relu,
        hidden_sizes=variant['hidden_sizes'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = MAMLTRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=fast_batch_size,  # number of trajs for grad update
        max_path_length=max_path_length,
        meta_batch_size=meta_batch_size,
        num_grad_updates=num_grad_updates,
        n_itr=1000,
        use_maml=True,
        step_size=meta_step_size,
        plot=False,
        numExpertPolicies=20,
        expertDataInfo={
            'expert_loc': expertDataLoc,
            'expert_itr': expertDataItr
        })

    algo.train()
Example #5
def experiment(variant, comet_exp_key=None):
    if comet_exp_key is not None:
        from rllab.misc.comet_logger import CometContinuedLogger, CometLogger
        from comet_ml import Experiment, ExistingExperiment
        # comet_log = CometContinuedLogger(api_key="KWwx7zh6I2uw6oQMkpEo3smu0", previous_experiment_key=variant['comet_exp_key'])
        comet_log = ExistingExperiment(api_key="KWwx7zh6I2uw6oQMkpEo3smu0", previous_experiment=variant['comet_exp_key'])
        # comet_log = CometLogger(api_key="KWwx7zh6I2uw6oQMkpEo3smu0",
        #                     project_name="ml4l3", workspace="glenb")
        comet_log.set_name("test seq train")
        # comet_log = comet_exp_key
        print(comet_log)
    else:
        comet_log = None
    print("loading libraries")
    from sandbox.rocky.tf.algos.maml_il import MAMLIL

    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from rllab.baselines.gaussian_mlp_baseline import GaussianMLPBaseline
    from rllab.baselines.maml_gaussian_mlp_baseline import MAMLGaussianMLPBaseline
    from rllab.baselines.zero_baseline import ZeroBaseline
    from rllab.envs.normalized_env import normalize
    from rllab.misc.instrument import stub, run_experiment_lite
    from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy import MAMLGaussianMLPPolicy as basic_policy
    # from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_adaptivestep import MAMLGaussianMLPPolicy as fullAda_basic_policy
    from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_adaptivestep_ppo import \
        MAMLGaussianMLPPolicy as PPO_policy
    from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_adaptivestep_biastransform import \
        MAMLGaussianMLPPolicy as fullAda_Bias_policy
    from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_biasonlyadaptivestep_biastransform import \
        MAMLGaussianMLPPolicy as biasAda_Bias_policy
    from sandbox.rocky.tf.policies.maml_minimal_conv_gauss_mlp_policy import MAMLGaussianMLPPolicy as conv_policy
    
    from sandbox.rocky.tf.optimizers.quad_dist_expert_optimizer import QuadDistExpertOptimizer
    from sandbox.rocky.tf.optimizers.first_order_optimizer import FirstOrderOptimizer
    from sandbox.rocky.tf.envs.base import TfEnv
    import sandbox.rocky.tf.core.layers as L
    
    from rllab.envs.mujoco.ant_env_rand_goal_ring import AntEnvRandGoalRing
    from multiworld.envs.mujoco.sawyer_xyz.push.sawyer_push import SawyerPushEnv
    from multiworld.envs.mujoco.sawyer_xyz.pickPlace.sawyer_pick_and_place import SawyerPickPlaceEnv
    from multiworld.envs.mujoco.sawyer_xyz.door.sawyer_door_open import SawyerDoorOpenEnv
    from multiworld.core.flat_goal_env import FlatGoalEnv
    from multiworld.core.finn_maml_env import FinnMamlEnv
    from multiworld.core.wrapper_env import NormalizedBoxEnv
    
    import tensorflow as tf
    import time
    from rllab.envs.gym_env import GymEnv
    
    from maml_examples.maml_experiment_vars import MOD_FUNC
    import numpy as np
    import random as rd
    import pickle
    
    print("Done loading libraries")
    
    seed = variant['seed']
    n_parallel = 1
    log_dir = variant['log_dir']

    setup(seed, n_parallel, log_dir)
    fast_batch_size = variant['fbs']
    meta_batch_size = variant['mbs']
    adam_steps = variant['adam_steps']
    max_path_length = variant['max_path_length']
    dagger = variant['dagger']
    expert_policy_loc = variant['expert_policy_loc']
    ldim = variant['ldim']
    init_flr = variant['init_flr']
    policyType = variant['policyType']
    use_maesn = variant['use_maesn']
    EXPERT_TRAJ_LOCATION = variant['expertDataLoc']
    envType = variant['envType']
    tasksFile = path_to_multiworld + 'multiworld/envs/goals/' + variant['tasksFile'] + '.pkl'
    all_tasks = pickle.load(open(tasksFile, 'rb'))
    assert meta_batch_size <= len(all_tasks), (
        "meta_batch_size (%d) exceeds the number of tasks (%d)"
        % (meta_batch_size, len(all_tasks)))
    tasks = all_tasks[:meta_batch_size]
    print("^^^^^^^^^^^^^^^^ meta_tasks: ", tasks, " ^^^^^^^^^^^^^^^^ ")

    use_images = 'conv' in policyType

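    # Build the base environment for the requested domain; conv policies operate on image observations.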
    if 'Push' == envType:
        baseEnv = SawyerPushEnv(tasks=tasks, image=use_images, mpl=max_path_length)

    elif envType == 'sparsePush':
        baseEnv = SawyerPushEnv(tasks=tasks, image=use_images, mpl=max_path_length, rewMode='l2Sparse')


    elif 'PickPlace' in envType:
        baseEnv = SawyerPickPlaceEnv(tasks=tasks, image=use_images, mpl=max_path_length)

    elif 'Door' in envType:
        baseEnv = SawyerDoorOpenEnv(tasks=tasks, image=use_images, mpl=max_path_length)

    elif 'Ant' in envType:
        env = TfEnv(normalize(AntEnvRandGoalRing()))

    elif 'claw' in envType:
        env = TfEnv(DClawScrewRandGoal())

    else:
        raise AssertionError('Unknown envType: ' + envType)

    if envType in ['Push', 'PickPlace', 'Door']:
        if use_images:
            obs_keys = ['img_observation']
        else:
            obs_keys = ['state_observation']
        env = TfEnv(NormalizedBoxEnv(FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=obs_keys), reset_mode='idx')))

    algoClass = MAMLIL
    baseline = LinearFeatureBaseline(env_spec=env.spec)

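    # If a saved policy checkpoint is supplied, the algorithm loads it; otherwise a fresh policy of the requested type is built below.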
    load_policy = variant['load_policy']

    if load_policy is not None:
        policy = None
        load_policy = variant['load_policy']
        # if 'conv' in load_policy:
        #     baseline = ZeroBaseline(env_spec=env.spec)

    elif 'fullAda_PPO' in policyType:

        policy = PPO_policy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            init_flr_full=init_flr,
            latent_dim=ldim
        )
        
    elif 'fullAda_Bias' in policyType:

        policy = fullAda_Bias_policy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            init_flr_full=init_flr,
            latent_dim=ldim
        )

    elif 'biasAda_Bias' in policyType:

        policy = biasAda_Bias_policy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            init_flr_full=init_flr,
            latent_dim=ldim
        )

    elif 'basic' in policyType:
        policy = basic_policy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        )


    elif 'conv' in policyType:

        baseline = ZeroBaseline(env_spec=env.spec)

        policy = conv_policy(
            name="policy",
            latent_dim=ldim,
            policyType=policyType,
            env_spec=env.spec,
            init_flr=init_flr,

            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        )

    print("|||||||||||||||||||||||||||||||||||||||||||||||", variant['n_itr'])
    
    beta_steps = 1
    meta_step_size = 0.01
    num_grad_updates = 1
    pre_std_modifier = 1.0
    post_std_modifier = 0.00001
    limit_demos_num = None

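    # use_corr_term, test_on_training_goals, l2loss_std_mult, extra_input and
    # extra_input_dim are module-level settings defined outside this snippet.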
    algo = algoClass(
        env=env,
        policy=policy,
        load_policy=load_policy,
        baseline=baseline,
        batch_size=fast_batch_size,  # number of trajs for alpha grad update
        max_path_length=max_path_length,
        meta_batch_size=meta_batch_size,  # number of tasks sampled for beta grad update
        num_grad_updates=num_grad_updates,  # number of alpha grad updates
        n_itr=variant['n_itr'],
        make_video=False,
        use_maml=True,
        use_pooled_goals=True,
        use_corr_term=use_corr_term,
        test_on_training_goals=test_on_training_goals,
        metalearn_baseline=False,
        # metalearn_baseline=False,
        limit_demos_num=limit_demos_num,
        test_goals_mult=1,
        step_size=meta_step_size,
        plot=False,
        beta_steps=beta_steps,
        adam_curve=None,
        adam_steps=adam_steps,
        pre_std_modifier=pre_std_modifier,
        l2loss_std_mult=l2loss_std_mult,
        importance_sampling_modifier=MOD_FUNC[''],
        post_std_modifier=post_std_modifier,
        expert_trajs_dir=EXPERT_TRAJ_LOCATION,
        expert_trajs_suffix='',
        seed=seed,
        extra_input=extra_input,
        extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        plotDirPrefix=None,
        latent_dim=ldim,
        dagger=dagger,
        expert_policy_loc=expert_policy_loc,
        comet_logger=comet_log,
        outerIteration=variant['outer_Iteration'],
        use_ppo=True
    )

    algo.train()
Example #6
for limit_demos_num in limit_demos_num_list:
    for l2loss_std_mult in l2loss_std_mult_list:
        for post_std_modifier_train in post_std_modifier_train_list:
            for post_std_modifier_test in post_std_modifier_test_list:
                for pre_std_modifier in pre_std_modifier_list:
                    for fast_learning_rate in fast_learning_rates:
                        for beta_steps, adam_steps in beta_adam_steps_list:
                            for bas in baselines:
                                stub(globals())
                                tf.set_random_seed(seed)
                                np.random.seed(seed)
                                rd.seed(seed)

                                baseEnv = FlatGoalEnv(
                                    SawyerPushEnv(tasks=None),
                                    obs_keys=['state_observation'])
                                env = TfEnv(
                                    NormalizedBoxEnv(
                                        FinnMamlEnv(baseEnv, reset_mode='task')))

                                exp_name = str(
                                    'R7_IL'
                                    # +time.strftime("%D").replace("/", "")[0:4]
                                    + goals_suffix + "_" + str(seed)
Example #7
for l2loss_std_mult in l2loss_std_mult_list:
    for post_std_modifier_train in post_std_modifier_train_list:
        for post_std_modifier_test in post_std_modifier_test_list:
            for pre_std_modifier in pre_std_modifier_list:
                for beta_steps, adam_steps in beta_adam_steps_list:
                    for adam_curve in adam_curves:
                        for bas in baselines:
                            stub(globals())
                            tf.set_random_seed(seed)
                            np.random.seed(seed)
                            rd.seed(seed)
                            baseEnv = FlatGoalEnv(
                                SawyerPushEnv(tasks=tasks),
                                obs_keys=['state_observation'])
                            env = TfEnv(
                                NormalizedBoxEnv(
                                    FinnMamlEnv(baseEnv, reset_mode='task')))
                            # env = TfEnv(normalize(Reacher7DofMultitaskEnv()))
                            exp_name = str(
                                'PU_IL'
                                # +time.strftime("%D").replace("/", "")[0:4]
Example #8
def experiment(variant, comet_exp_key=None):
    comet_logger = None
    if comet_exp_key is not None:
        # from rllab.misc.comet_logger import CometContinuedLogger, CometLogger
        # from comet_ml import Experiment, ExistingExperiment
        # comet_log = CometContinuedLogger(api_key="KWwx7zh6I2uw6oQMkpEo3smu0", previous_experiment_key=variant['comet_exp_key'])
        comet_logger = ExistingExperiment(
            api_key="KWwx7zh6I2uw6oQMkpEo3smu0",
            previous_experiment=variant['comet_exp_key'])
        # comet_log = CometLogger(api_key="KWwx7zh6I2uw6oQMkpEo3smu0",
        #                     project_name="ml4l3", workspace="glenb")
        comet_logger.set_name("test seq train")
        # comet_log = comet_exp_key
        print("RL!: ", comet_logger)
    print("%%%%%%%%%%%%%%%%%", comet_logger)
    seed = variant['seed']
    log_dir = variant['log_dir']
    n_parallel = variant['n_parallel']

    setup(seed, n_parallel, log_dir)

    init_file = variant['init_file']
    taskIndex = variant['taskIndex']
    n_itr = variant['n_itr']
    default_step = variant['default_step']
    policyType = variant['policyType']
    envType = variant['envType']

    tasksFile = path_to_multiworld + '/multiworld/envs/goals/' + variant[
        'tasksFile'] + '.pkl'
    tasks = pickle.load(open(tasksFile, 'rb'))

    max_path_length = variant['max_path_length']

    use_images = 'conv' in policyType
    print("$$$$$$$$$$$$$$$ RL-TASK: ", str(tasks[taskIndex]),
          " $$$$$$$$$$$$$$$")
    if 'MultiDomain' in envType:
        baseEnv = Sawyer_MultiDomainEnv(tasks=tasks,
                                        image=use_images,
                                        mpl=max_path_length)

    elif 'Push' in envType:
        baseEnv = SawyerPushEnv(tasks=tasks,
                                image=use_images,
                                mpl=max_path_length)

    elif 'PickPlace' in envType:
        baseEnv = SawyerPickPlaceEnv(tasks=tasks,
                                     image=use_images,
                                     mpl=max_path_length)

    elif 'Door' in envType:
        baseEnv = SawyerDoorOpenEnv(tasks=tasks,
                                    image=use_images,
                                    mpl=max_path_length)

    elif 'Ant' in envType:
        env = TfEnv(normalize(AntEnvRandGoalRing()))

    elif 'Coffee' in envType:
        baseEnv = SawyerCoffeeEnv(mpl=max_path_length)

    else:
        raise AssertionError('Unknown envType: ' + envType)

    if envType in ['Push', 'PickPlace', 'Door']:
        if use_images:
            obs_keys = ['img_observation']
        else:
            obs_keys = ['state_observation']
        env = TfEnv(
            NormalizedBoxEnv(
                FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=obs_keys),
                            reset_mode='idx')))

    baseline = ZeroBaseline(env_spec=env.spec)
    # baseline = LinearFeatureBaseline(env_spec = env.spec)
    batch_size = variant['batch_size']

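    # Each branch adapts the meta-learned initialization (init_file) to the single task taskIndex.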
    if policyType == 'fullAda_Bias':

        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = vpg_fullADA(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            # noise_opt = True,
            default_step=default_step,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),

            # reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir,
            comet_logger=comet_logger,
            outer_iteration=variant['outer_iteration'])

    elif policyType == 'biasAda_Bias':

        algo = vpg_biasADA(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            # noise_opt = True,
            default_step=default_step,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            # reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir)

    elif policyType == 'basic':

        algo = vpg_basic(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,
            max_path_length=max_path_length,
            n_itr=n_itr,
            # step_size=10.0,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            reset_arg=taskIndex,
            optimizer=None,
            optimizer_args={
                'init_learning_rate': default_step,
                'tf_optimizer_args': {
                    'learning_rate': 0.5 * default_step
                },
                'tf_optimizer_cls': tf.train.GradientDescentOptimizer
            },
            log_dir=log_dir
            # extra_input="onehot_exploration", # added by RK 6/19
            # extra_input_dim=5, # added by RK 6/19
        )

    elif 'conv' in policyType:

        algo = vpg_conv(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            # noise_opt = True,
            default_step=default_step,
            # reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir)

    else:
        raise AssertionError('Unknown policyType: ' + policyType)

    algo.train()
Example #9
def experiment(variant):

    seed = variant['seed']
    log_dir = variant['log_dir']
    n_parallel = variant['n_parallel']

    setup(seed, n_parallel, log_dir)

    init_file = variant['init_file']
    taskIndex = variant['taskIndex']
    n_itr = variant['n_itr']
    default_step = variant['default_step']
    policyType = variant['policyType']
    envType = variant['envType']

    tasksFile = path_to_multiworld + '/multiworld/envs/goals/' + variant[
        'tasksFile'] + '.pkl'
    tasks = pickle.load(open(tasksFile, 'rb'))

    max_path_length = variant['max_path_length']

    use_images = 'conv' in policyType

    if 'MultiDomain' in envType:
        baseEnv = Sawyer_MultiDomainEnv(tasks=tasks,
                                        image=use_images,
                                        mpl=max_path_length)

    elif 'Push' in envType:
        baseEnv = SawyerPushEnv(tasks=tasks,
                                image=use_images,
                                mpl=max_path_length)

    elif 'PickPlace' in envType:
        baseEnv = SawyerPickPlaceEnv(tasks=tasks,
                                     image=use_images,
                                     mpl=max_path_length)

    elif 'Door' in envType:
        baseEnv = SawyerDoorOpenEnv(tasks=tasks,
                                    image=use_images,
                                    mpl=max_path_length)

    elif 'Ant' in envType:
        env = TfEnv(normalize(AntEnvRandGoalRing()))

    elif 'Coffee' in envType:
        baseEnv = SawyerCoffeeEnv(mpl=max_path_length)

    else:
        raise AssertionError('Unknown envType: ' + envType)

    if envType in ['Push', 'PickPlace', 'Door']:
        if use_images:
            obs_keys = ['img_observation']
        else:
            obs_keys = ['state_observation']
        env = TfEnv(
            NormalizedBoxEnv(
                FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=obs_keys),
                            reset_mode='idx')))

    baseline = ZeroBaseline(env_spec=env.spec)
    #baseline = LinearFeatureBaseline(env_spec = env.spec)
    batch_size = variant['batch_size']

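    # Dispatch on policyType; every VPG variant fine-tunes the loaded meta-policy on task taskIndex.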
    if policyType == 'fullAda_Bias':

        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = vpg_fullADA(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            #noise_opt = True,
            default_step=default_step,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),

            #reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir)

    elif policyType == 'biasAda_Bias':

        algo = vpg_biasADA(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            #noise_opt = True,
            default_step=default_step,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            #reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir)

    elif policyType == 'basic':

        algo = vpg_basic(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,
            max_path_length=max_path_length,
            n_itr=n_itr,
            #step_size=10.0,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            reset_arg=taskIndex,
            optimizer=None,
            optimizer_args={
                'init_learning_rate': default_step,
                'tf_optimizer_args': {
                    'learning_rate': 0.5 * default_step
                },
                'tf_optimizer_cls': tf.train.GradientDescentOptimizer
            },
            log_dir=log_dir
            # extra_input="onehot_exploration", # added by RK 6/19
            # extra_input_dim=5, # added by RK 6/19
        )

    elif 'conv' in policyType:

        algo = vpg_conv(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            #noise_opt = True,
            default_step=default_step,
            #reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir)

    else:
        raise AssertionError('Unknown policyType: ' + policyType)

    algo.train()
Example #10
    def render(self):
        SawyerEnv.render(self)
Example #11
def experiment(variant):


  seed = variant['seed']
  n_parallel = variant['n_parallel']
  log_dir = variant['log_dir']
  setup(seed, n_parallel, log_dir)
  fast_learning_rate = variant['flr']
  fast_batch_size = variant['fbs']
  meta_batch_size = variant['mbs']
  envClass = variant['envClass']

  beta_steps = 1
  adam_steps = variant['adam_steps']
  updateMode = 'vec'
  adam_curve = None
  
  env_option = ''
  
  extra_input = "onehot_exploration" # "onehot_exploration" "gaussian_exploration"
  # extra_input = None
  extra_input_dim = 5

  
  num_grad_updates = 1
  meta_step_size = 0.01
  pre_std_modifier = 1.0
  post_std_modifier_train = 0.00001
  post_std_modifier_test = 0.00001
  l2loss_std_mult = 1.0
  ism = ''
 
  limit_demos_num = 40  # 40
  test_goals_mult = 1
  bas_lr = 0.01 # baseline learning rate
  momentum=0.5
  bas_hnl = tf.nn.relu
  hidden_layers = (100,100)

  basas = 60 # baseline adam steps
  use_corr_term = True
  # seeds = [1,2,3,4,5,6,7]  #,2,3,4,5,6,7,8] #, 2,3,4,5,6,7,8]
  
  use_maml = True
  test_on_training_goals = False
  env = None
  
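  # Pick the environment and the matching directory of expert demonstration trajectories.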
  if envClass == 'Ant':
    env = TfEnv(normalize(AntEnvRandGoalRing())) 
    max_path_length = 200  
    EXPERT_TRAJ_LOCATION_DICT = '/root/code/rllab/saved_expert_traj/Expert_trajs_dense_ant/'

  elif envClass == 'SawyerPusher':
   
    baseEnv = FlatGoalEnv(SawyerPushEnv(tasks=None), obs_keys=['state_observation'])
    env = TfEnv(NormalizedBoxEnv(FinnMamlEnv(baseEnv, reset_mode='task')))
    max_path_length = 150
    EXPERT_TRAJ_LOCATION_DICT = '/root/code/maml_gps/saved_expert_traj/Expert_trajs_sawyer_pusher/'

  else:
    raise AssertionError('Env must be either Ant or SawyerPusher')

 
  policy = MAMLGaussianMLPPolicy(
      name="policy",
      env_spec=env.spec,
      grad_step_size=fast_learning_rate,
      hidden_nonlinearity=tf.nn.relu,
      hidden_sizes=(100, 100),
      std_modifier=pre_std_modifier,
      # metalearn_baseline=(bas == "MAMLGaussianMLP"),
      extra_input_dim=(0 if extra_input is None else extra_input_dim),
      updateMode=updateMode,
      num_tasks=meta_batch_size
  )
 
  
  baseline = LinearFeatureBaseline(env_spec=env.spec)
 
  algo = MAMLIL(
      env=env,
      policy=policy,
      #policy=None,
      #oad_policy='/home/alvin/maml_rl/data/local/R7-IL-0918/R7_IL_200_40_1_1_dem40_ei5_as50_basl_1809_04_27/itr_24.pkl',
      baseline=baseline,
      batch_size=fast_batch_size,  # number of trajs for alpha grad update
      max_path_length=max_path_length,
      meta_batch_size=meta_batch_size,  # number of tasks sampled for beta grad update
      num_grad_updates=num_grad_updates,  # number of alpha grad updates
      n_itr=200, #100
      make_video=False,
      use_maml=use_maml,
      use_pooled_goals=True,
      use_corr_term=use_corr_term,
      test_on_training_goals=test_on_training_goals,
      metalearn_baseline=False,
      # metalearn_baseline=False,
      limit_demos_num=limit_demos_num,
      test_goals_mult=test_goals_mult,
      step_size=meta_step_size,
      plot=False,
      beta_steps=beta_steps,
      adam_curve=adam_curve,
      adam_steps=adam_steps,
      pre_std_modifier=pre_std_modifier,
      l2loss_std_mult=l2loss_std_mult,
      importance_sampling_modifier=MOD_FUNC[ism],
      post_std_modifier_train=post_std_modifier_train,
      post_std_modifier_test=post_std_modifier_test,
      expert_trajs_dir=EXPERT_TRAJ_LOCATION_DICT,
      #[env_option+"."+mode+goals_suffix],
      expert_trajs_suffix="",
      seed=seed,
      extra_input=extra_input,
      extra_input_dim=(0 if extra_input is None else extra_input_dim),
      updateMode=updateMode
  )
  algo.train()
Example #12
def experiment(variant):

    seed = variant['seed']
    tf.set_random_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    initial_params_file = variant['initial_params_file']
    goalIndex = variant['goalIndex']
    init_step_size = variant['init_step_size']

    regionSize = variant['regionSize']

    mode = variant['mode']

    if 'docker' in mode:
        taskFilePrefix = '/root/code'
    else:
        taskFilePrefix = '/home/russellm'

    if variant['valRegionSize'] is not None:
        valRegionSize = variant['valRegionSize']

        tasksFile = taskFilePrefix + '/multiworld/multiworld/envs/goals/pickPlace_' + valRegionSize + '_val.pkl'

    else:
        tasksFile = taskFilePrefix + '/multiworld/multiworld/envs/goals/pickPlace_' + regionSize + '.pkl'

    tasks = pickle.load(open(tasksFile, 'rb'))

    envType = variant['envType']
    if envType == 'Push':
        baseEnv = SawyerPushEnv(tasks=tasks)
    else:
        assert envType == 'PickPlace'
        baseEnv = SawyerPickPlaceEnv(tasks=tasks)

    env = FinnMamlEnv(
        FlatGoalEnv(baseEnv,
                    obs_keys=['state_observation', 'state_desired_goal']))
    env = TfEnv(NormalizedBoxEnv(env))
    baseline = LinearFeatureBaseline(env_spec=env.spec)

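    # Fine-tune the loaded meta-learned parameters on the single goal goalIndex with vanilla policy gradient.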
    algo = VPG(
        env=env,
        policy=None,
        load_policy=initial_params_file,
        baseline=baseline,
        batch_size=7500,  # 2x
        max_path_length=150,
        n_itr=10,
        reset_arg=goalIndex,
        optimizer_args={
            'init_learning_rate': init_step_size,
            'tf_optimizer_args': {
                'learning_rate': 0.1 * init_step_size
            },
            'tf_optimizer_cls': tf.train.GradientDescentOptimizer
        })
    import os
    saveDir = variant['saveDir']
    currPath = ''
    for _dir in saveDir.split('/'):
        currPath += _dir + '/'
        if not os.path.isdir(currPath):
            os.mkdir(currPath)

    logger.set_snapshot_dir(saveDir)
    logger.add_tabular_output(saveDir + 'progress.csv')
    algo.train()
Example #13
def experiment(variant, comet_logger=comet_logger):

    from sandbox.rocky.tf.algos.maml_il import MAMLIL
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from rllab.baselines.gaussian_mlp_baseline import GaussianMLPBaseline
    from rllab.baselines.maml_gaussian_mlp_baseline import MAMLGaussianMLPBaseline
    from rllab.baselines.zero_baseline import ZeroBaseline
    from rllab.envs.normalized_env import normalize
    from rllab.misc.instrument import stub, run_experiment_lite
    from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy import MAMLGaussianMLPPolicy as basic_policy
    #from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_adaptivestep import MAMLGaussianMLPPolicy as fullAda_basic_policy
    from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_adaptivestep_biastransform import MAMLGaussianMLPPolicy as fullAda_Bias_policy
    from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_biasonlyadaptivestep_biastransform import MAMLGaussianMLPPolicy as biasAda_Bias_policy
    from sandbox.rocky.tf.policies.maml_minimal_conv_gauss_mlp_policy import MAMLGaussianMLPPolicy as conv_policy

    from sandbox.rocky.tf.optimizers.quad_dist_expert_optimizer import QuadDistExpertOptimizer
    from sandbox.rocky.tf.optimizers.first_order_optimizer import FirstOrderOptimizer
    from sandbox.rocky.tf.envs.base import TfEnv
    import sandbox.rocky.tf.core.layers as L

    from rllab.envs.mujoco.ant_env_rand_goal_ring import AntEnvRandGoalRing
    from multiworld.envs.mujoco.sawyer_xyz.push.sawyer_push import SawyerPushEnv
    from multiworld.envs.mujoco.sawyer_xyz.pickPlace.sawyer_pick_and_place import SawyerPickPlaceEnv
    from multiworld.envs.mujoco.sawyer_xyz.door.sawyer_door_open import SawyerDoorOpenEnv
    from multiworld.core.flat_goal_env import FlatGoalEnv
    from multiworld.core.finn_maml_env import FinnMamlEnv
    from multiworld.core.wrapper_env import NormalizedBoxEnv

    import tensorflow as tf
    import time
    from rllab.envs.gym_env import GymEnv

    from maml_examples.maml_experiment_vars import MOD_FUNC
    import numpy as np
    import random as rd
    import pickle

    import rllab.misc.logger as logger
    from rllab.misc.ext import set_seed
    import os

    seed = variant['seed']
    n_parallel = 1
    log_dir = variant['log_dir']

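    # Local helper: seed everything, optionally start the parallel sampler, and point rllab's logger at log_dir.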
    def setup(seed, n_parallel, log_dir):

        if seed is not None:
            set_seed(seed)

        if n_parallel > 0:
            from rllab.sampler import parallel_sampler
            parallel_sampler.initialize(n_parallel=n_parallel)
            if seed is not None:
                parallel_sampler.set_seed(seed)

        if not os.path.isdir(log_dir):
            os.makedirs(log_dir, exist_ok=True)

        logger.set_snapshot_dir(log_dir)
        logger.add_tabular_output(log_dir + '/progress.csv')

    setup(seed, n_parallel, log_dir)

    fast_batch_size = variant['fbs']
    meta_batch_size = variant['mbs']
    adam_steps = variant['adam_steps']
    max_path_length = variant['max_path_length']

    dagger = variant['dagger']
    expert_policy_loc = variant['expert_policy_loc']

    ldim = variant['ldim']
    init_flr = variant['init_flr']
    policyType = variant['policyType']
    use_maesn = variant['use_maesn']
    EXPERT_TRAJ_LOCATION = variant['expertDataLoc']
    envType = variant['envType']

    tasksFile = path_to_multiworld + 'multiworld/envs/goals/' + variant[
        'tasksFile'] + '.pkl'

    all_tasks = pickle.load(open(tasksFile, 'rb'))
    assert meta_batch_size <= len(all_tasks)
    tasks = all_tasks[:meta_batch_size]

    use_images = 'conv' in policyType

    if 'Push' == envType:
        baseEnv = SawyerPushEnv(tasks=tasks,
                                image=use_images,
                                mpl=max_path_length)

    elif envType == 'sparsePush':
        baseEnv = SawyerPushEnv(tasks=tasks,
                                image=use_images,
                                mpl=max_path_length,
                                rewMode='l2Sparse')

    elif 'PickPlace' in envType:
        baseEnv = SawyerPickPlaceEnv(tasks=tasks,
                                     image=use_images,
                                     mpl=max_path_length)

    elif 'Door' in envType:
        baseEnv = SawyerDoorOpenEnv(tasks=tasks,
                                    image=use_images,
                                    mpl=max_path_length)

    elif 'Ant' in envType:
        env = TfEnv(normalize(AntEnvRandGoalRing()))

    elif 'claw' in envType:
        env = TfEnv(DClawScrewRandGoal())

    else:
        raise AssertionError('Unknown envType: ' + envType)

    if envType in ['Push', 'PickPlace', 'Door']:
        if use_images:
            obs_keys = ['img_observation']
        else:
            obs_keys = ['state_observation']
        env = TfEnv(
            NormalizedBoxEnv(
                FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=obs_keys),
                            reset_mode='idx')))

    algoClass = MAMLIL
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    load_policy = variant['load_policy']

    if load_policy is not None:
        policy = None
        load_policy = variant['load_policy']
        # if 'conv' in load_policy:
        #     baseline = ZeroBaseline(env_spec=env.spec)

    elif 'fullAda_Bias' in policyType:

        policy = fullAda_Bias_policy(name="policy",
                                     env_spec=env.spec,
                                     grad_step_size=init_flr,
                                     hidden_nonlinearity=tf.nn.relu,
                                     hidden_sizes=(100, 100),
                                     init_flr_full=init_flr,
                                     latent_dim=ldim)

    elif 'biasAda_Bias' in policyType:

        policy = biasAda_Bias_policy(name="policy",
                                     env_spec=env.spec,
                                     grad_step_size=init_flr,
                                     hidden_nonlinearity=tf.nn.relu,
                                     hidden_sizes=(100, 100),
                                     init_flr_full=init_flr,
                                     latent_dim=ldim)

    elif 'basic' in policyType:
        policy = basic_policy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        )

    elif 'conv' in policyType:

        baseline = ZeroBaseline(env_spec=env.spec)

        policy = conv_policy(
            name="policy",
            latent_dim=ldim,
            policyType=policyType,
            env_spec=env.spec,
            init_flr=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        )

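    # num_grad_updates, meta_step_size, beta_steps, the std modifiers, limit_demos_num,
    # use_corr_term, test_on_training_goals, extra_input and extra_input_dim are
    # module-level settings defined outside this snippet.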
    algo = algoClass(
        env=env,
        policy=policy,
        load_policy=load_policy,
        baseline=baseline,
        batch_size=fast_batch_size,  # number of trajs for alpha grad update
        max_path_length=max_path_length,
        meta_batch_size=meta_batch_size,  # number of tasks sampled for beta grad update
        num_grad_updates=num_grad_updates,  # number of alpha grad updates
        n_itr=variant['iterations'],
        make_video=False,
        use_maml=True,
        use_pooled_goals=True,
        use_corr_term=use_corr_term,
        test_on_training_goals=test_on_training_goals,
        metalearn_baseline=False,
        # metalearn_baseline=False,
        limit_demos_num=limit_demos_num,
        test_goals_mult=1,
        step_size=meta_step_size,
        plot=False,
        beta_steps=beta_steps,
        adam_curve=None,
        adam_steps=adam_steps,
        pre_std_modifier=pre_std_modifier,
        l2loss_std_mult=l2loss_std_mult,
        importance_sampling_modifier=MOD_FUNC[''],
        post_std_modifier=post_std_modifier,
        expert_trajs_dir=EXPERT_TRAJ_LOCATION,
        expert_trajs_suffix='',
        seed=seed,
        extra_input=extra_input,
        extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        plotDirPrefix=None,
        latent_dim=ldim,
        dagger=dagger,
        expert_policy_loc=expert_policy_loc,
        comet_logger=comet_logger)

    algo.train()
    tf.reset_default_graph()
Example #14
def experiment(variant):

    seed = variant['seed']
    n_parallel = 1
    log_dir = variant['log_dir']

    setup(seed, n_parallel, log_dir)

    fast_batch_size = variant['fbs']
    meta_batch_size = variant['mbs']
    adam_steps = variant['adam_steps']
    max_path_length = variant['max_path_length']

    dagger = variant['dagger']
    expert_policy_loc = variant['expert_policy_loc']

    ldim = variant['ldim']
    init_flr = variant['init_flr']
    policyType = variant['policyType']
    use_maesn = variant['use_maesn']
    EXPERT_TRAJ_LOCATION = variant['expertDataLoc']
    envType = variant['envType']

    tasksFile = path_to_multiworld + 'multiworld/envs/goals/' + variant[
        'tasksFile'] + '.pkl'

    all_tasks = pickle.load(open(tasksFile, 'rb'))
    assert meta_batch_size <= len(all_tasks)
    tasks = all_tasks[:meta_batch_size]

    use_images = 'conv' in policyType

    if 'Push' == envType:
        baseEnv = SawyerPushEnv(tasks=tasks,
                                image=use_images,
                                mpl=max_path_length)

    elif envType == 'sparsePush':
        baseEnv = SawyerPushEnv(tasks=tasks,
                                image=use_images,
                                mpl=max_path_length,
                                rewMode='l2Sparse')

    elif 'PickPlace' in envType:
        baseEnv = SawyerPickPlaceEnv(tasks=tasks,
                                     image=use_images,
                                     mpl=max_path_length)

    elif 'Door' in envType:
        baseEnv = SawyerDoorOpenEnv(tasks=tasks,
                                    image=use_images,
                                    mpl=max_path_length)

    elif 'Ant' in envType:
        env = TfEnv(normalize(AntEnvRandGoalRing()))

    elif 'claw' in envType:
        env = TfEnv(DClawScrewRandGoal())

    else:
        raise AssertionError('Unknown envType: ' + envType)

    if envType in ['Push', 'PickPlace', 'Door']:
        if use_images:
            obs_keys = ['img_observation']
        else:
            obs_keys = ['state_observation']
        env = TfEnv(
            NormalizedBoxEnv(
                FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=obs_keys),
                            reset_mode='idx')))

    algoClass = MAMLIL
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    load_policy = variant['load_policy']

    if load_policy is not None:
        policy = None
        load_policy = variant['load_policy']
        # if 'conv' in load_policy:
        #     baseline = ZeroBaseline(env_spec=env.spec)

    elif 'fullAda_Bias' in policyType:

        policy = fullAda_Bias_policy(name="policy",
                                     env_spec=env.spec,
                                     grad_step_size=init_flr,
                                     hidden_nonlinearity=tf.nn.relu,
                                     hidden_sizes=(100, 100),
                                     init_flr_full=init_flr,
                                     latent_dim=ldim)

    elif 'biasAda_Bias' in policyType:

        policy = biasAda_Bias_policy(name="policy",
                                     env_spec=env.spec,
                                     grad_step_size=init_flr,
                                     hidden_nonlinearity=tf.nn.relu,
                                     hidden_sizes=(100, 100),
                                     init_flr_full=init_flr,
                                     latent_dim=ldim)

    elif 'basic' in policyType:
        policy = basic_policy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        )

    elif 'conv' in policyType:

        baseline = ZeroBaseline(env_spec=env.spec)

        policy = conv_policy(
            name="policy",
            latent_dim=ldim,
            policyType=policyType,
            env_spec=env.spec,
            init_flr=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        )

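    # As in the previous example, several hyperparameters used below (num_grad_updates,
    # beta_steps, the std modifiers, extra_input, ...) are module-level settings not shown here.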
    algo = algoClass(
        env=env,
        policy=policy,
        load_policy=load_policy,
        baseline=baseline,
        batch_size=fast_batch_size,  # number of trajs for alpha grad update
        max_path_length=max_path_length,
        meta_batch_size=meta_batch_size,  # number of tasks sampled for beta grad update
        num_grad_updates=num_grad_updates,  # number of alpha grad updates
        n_itr=1,  #100
        make_video=False,
        use_maml=True,
        use_pooled_goals=True,
        use_corr_term=use_corr_term,
        test_on_training_goals=test_on_training_goals,
        metalearn_baseline=False,
        # metalearn_baseline=False,
        limit_demos_num=limit_demos_num,
        test_goals_mult=1,
        step_size=meta_step_size,
        plot=False,
        beta_steps=beta_steps,
        adam_curve=None,
        adam_steps=adam_steps,
        pre_std_modifier=pre_std_modifier,
        l2loss_std_mult=l2loss_std_mult,
        importance_sampling_modifier=MOD_FUNC[''],
        post_std_modifier=post_std_modifier,
        expert_trajs_dir=EXPERT_TRAJ_LOCATION,
        expert_trajs_suffix='',
        seed=seed,
        extra_input=extra_input,
        extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        plotDirPrefix=None,
        latent_dim=ldim,
        dagger=dagger,
        expert_policy_loc=expert_policy_loc)

    algo.train()
Example #15
num_grad_updates = 1
meta_step_size = 0.01
num_imSteps = 50
use_maml = True

ratio = '_5_1'
expertDataLoc = '/home/russellm/mri_onPolicy/expertPolicyWeights/TRPO-push-20X20-v1/'
expertDataItr = 300

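# meta_batch_sizes, fast_learning_rates and fast_batch_sizes are sweep lists defined earlier in the launcher script (not shown).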
for meta_batch_size in meta_batch_sizes:
    for fast_learning_rate in fast_learning_rates:
        for fast_batch_size in fast_batch_sizes:

            stub(globals())

            baseEnv = SawyerPushEnv(tasks=None)
            env = FinnMamlEnv(
                FlatGoalEnv(baseEnv, obs_keys=['state_observation']))
            env = TfEnv(NormalizedBoxEnv(env))

            policy = MAMLGaussianMLPPolicy(
                name="policy",
                env_spec=env.spec,
                grad_step_size=fast_learning_rate,
                hidden_nonlinearity=tf.nn.relu,
                hidden_sizes=(100, 100),
            )

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            algo = MAMLTRPO(
Example #16
def experiment(variant, comet_logger=None):
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from rllab.baselines.zero_baseline import ZeroBaseline
    from rllab.envs.normalized_env import normalize
    from rllab.misc.instrument import stub, run_experiment_lite
    
    from sandbox.rocky.tf.algos.vpg import VPG as vpg_basic
    from sandbox.rocky.tf.algos.vpg_biasADA import VPG as vpg_biasADA
    from sandbox.rocky.tf.algos.vpg_fullADA import VPG as vpg_fullADA
    from sandbox.rocky.tf.algos.vpg_conv import VPG as vpg_conv
    from sandbox.rocky.tf.algos.ppo import PPO as ppo
    
    # from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_adaptivestep_biastransform import MAMLGaussianMLPPolicy as fullAda_Bias_policy
    from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_biasonlyadaptivestep_biastransform import \
        MAMLGaussianMLPPolicy as biasAda_Bias_policy
    
    from multiworld.envs.mujoco.sawyer_xyz.push.sawyer_push import SawyerPushEnv
    from multiworld.envs.mujoco.sawyer_xyz.pickPlace.sawyer_pick_and_place import SawyerPickPlaceEnv
    from multiworld.envs.mujoco.sawyer_xyz.door.sawyer_door_open import SawyerDoorOpenEnv
    from multiworld.envs.mujoco.sawyer_xyz.multi_domain.push_door import Sawyer_MultiDomainEnv
    from multiworld.envs.mujoco.sawyer_xyz.pickPlace.sawyer_coffee import SawyerCoffeeEnv
    
    from rllab.envs.mujoco.ant_env_rand_goal_ring import AntEnvRandGoalRing
    from multiworld.core.flat_goal_env import FlatGoalEnv
    from multiworld.core.finn_maml_env import FinnMamlEnv
    from multiworld.core.wrapper_env import NormalizedBoxEnv
    from sandbox.rocky.tf.samplers.vectorized_sampler import VectorizedSampler
    # import gym
    
    from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_adaptivestep_ppo import \
            MAMLGaussianMLPPolicy as PPO_policy
    
    import pickle
    import argparse
    from sandbox.rocky.tf.envs.base import TfEnv
    import csv
    import joblib
    import numpy as np
    import pickle
    import tensorflow as tf
    
    print("%%%%%%%%%%%%%%%%%", comet_logger)
    seed = variant['seed']
    log_dir = variant['log_dir']
    n_parallel = variant['n_parallel']

    setup(seed, n_parallel, log_dir)

    init_file = variant['init_file']
    taskIndex = variant['taskIndex']
    n_itr = variant['n_itr']
    default_step = variant['default_step']
    policyType = variant['policyType']
    envType = variant['envType']

    tasksFile = path_to_multiworld + '/multiworld/envs/goals/' + variant['tasksFile'] + '.pkl'
    tasks = pickle.load(open(tasksFile, 'rb'))

    max_path_length = variant['max_path_length']

    use_images = 'conv' in policyType
    print("$$$$$$$$$$$$$$$ RL-TASK: ", str(tasks[taskIndex]), " $$$$$$$$$$$$$$$")
    if 'MultiDomain' in envType:
        baseEnv = Sawyer_MultiDomainEnv(tasks=tasks, image=use_images, mpl=max_path_length)

    elif 'Push' in envType:
        baseEnv = SawyerPushEnv(tasks=tasks, image=use_images, mpl=max_path_length)


    elif 'PickPlace' in envType:
        baseEnv = SawyerPickPlaceEnv(tasks=tasks, image=use_images, mpl=max_path_length)

    elif 'Door' in envType:
        baseEnv = SawyerDoorOpenEnv(tasks=tasks, image=use_images, mpl=max_path_length)

    elif 'Ant' in envType:
        env = TfEnv(normalize(AntEnvRandGoalRing()))
    elif 'Biped' in envType:
        # import terrainRLSim
        # from simAdapter import terrainRLSim
        import simAdapter
        import gym
        env = gym.make("PD_Biped2D_Gaps_Terrain-v0")
        env = TfEnv(normalize(env))
    elif 'Coffee' in envType:
        baseEnv = SawyerCoffeeEnv(mpl=max_path_length)

    else:
        raise AssertionError('Unknown envType: ' + envType)

    if envType in ['Push', 'PickPlace', 'Door']:
        if use_images:
            obs_keys = ['img_observation']
        else:
            obs_keys = ['state_observation']
        env = TfEnv(NormalizedBoxEnv(FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=obs_keys), reset_mode='idx')))

    # baseline = ZeroBaseline(env_spec=env.spec)
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    batch_size = variant['batch_size']

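    # Choose the adaptation algorithm by policyType; the PPO branch builds its policy explicitly, the other branches load one from init_file.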
    if policyType == 'fullAda_Bias':

        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = vpg_fullADA(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            # noise_opt = True,
            default_step=default_step,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),

            # reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir,
            comet_logger=comet_logger
        )

    elif policyType == 'biasAda_Bias':

        algo = vpg_biasADA(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            # noise_opt = True,
            default_step=default_step,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            # reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir
        )
        
    elif policyType == 'PPO':

        policy = PPO_policy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=variant['init_flr'],
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(128, 128),
            init_flr_full=variant['init_flr'],
            latent_dim=variant['ldim'],
            learn_std=False
        )
        
        algo = ppo(
            env=env,
            policy=policy,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            # noise_opt = True,
            default_step=default_step,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            # reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir,
            comet_logger=comet_logger
        )

    elif policyType == 'basic':

        algo = vpg_basic(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,
            max_path_length=max_path_length,
            n_itr=n_itr,
            # step_size=10.0,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),

            reset_arg=taskIndex,
            optimizer=None,
            optimizer_args={'init_learning_rate': default_step,
                            'tf_optimizer_args': {'learning_rate': 0.5 * default_step},
                            'tf_optimizer_cls': tf.train.GradientDescentOptimizer},
            log_dir=log_dir
            # extra_input="onehot_exploration", # added by RK 6/19
            # extra_input_dim=5, # added by RK 6/19
        )


    elif 'conv' in policyType:

        algo = vpg_conv(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            # noise_opt = True,
            default_step=default_step,
            # reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir

        )

    else:
        raise AssertionError('Unknown policyType: ' + policyType)

    algo.train()
Example #17
def experiment(variant):

    seed = variant['seed']

    tf.set_random_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    fast_learning_rate = variant['flr']

    fast_batch_size = variant['fbs']  # 10 works for [0.1, 0.2], 20 doesn't improve much for [0, 0.2]
    meta_batch_size = 20  # 10 also works, but much less stable, 20 is fairly stable, 40 is more stable
    max_path_length = 150
    num_grad_updates = 1
    meta_step_size = variant['mlr']

    regionSize = variant['regionSize']

    if regionSize == '20X20':

        tasksFile = '/root/code/multiworld/multiworld/envs/goals/pickPlace_20X20_6_8.pkl'

    else:
        assert regionSize == '60X30'

        tasksFile = '/root/code/multiworld/multiworld/envs/goals/pickPlace_60X30.pkl'

    tasks = pickle.load(open(tasksFile, 'rb'))

    envType = variant['envType']

    if envType == 'Push':

        baseEnv = SawyerPushEnv(tasks=tasks)
    else:
        assert envType == 'PickPlace'

        baseEnv = SawyerPickPlaceEnv(tasks=tasks)
    env = FinnMamlEnv(
        FlatGoalEnv(baseEnv,
                    obs_keys=['state_observation', 'state_desired_goal']))

    env = TfEnv(NormalizedBoxEnv(env))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

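    # Meta-train with MAML-TRPO, warm-starting from the provided initial parameter file.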
    algo = MAMLTRPO(
        env=env,
        policy=None,
        load_policy=variant['init_param_file'],
        baseline=baseline,
        batch_size=fast_batch_size,  # number of trajs for grad update
        max_path_length=max_path_length,
        meta_batch_size=meta_batch_size,
        num_grad_updates=num_grad_updates,
        n_itr=1000,
        use_maml=True,
        step_size=meta_step_size,
        plot=False,
    )

    import os

    saveDir = variant['saveDir']

    if not os.path.isdir(saveDir):
        os.mkdir(saveDir)

    logger.set_snapshot_dir(saveDir)
    logger.add_tabular_output(saveDir + 'progress.csv')

    algo.train()