Example #1
 def __init__(self, *args, **kwargs):
     self.quick_init(locals())
     sawyer_env = SawyerEnv(*args, **kwargs)
     FlatGoalEnv.__init__(self,
                          sawyer_env,
                          obs_keys=['state_observation'],
                          goal_keys=['state_desired_goal'])
Example #2
 def __init__(self, *args, **kwargs):
     self.quick_init(locals())
     sawyer_env = SawyerEnv(
         obj_low=(-0.0, 0.5, 0.02),
         obj_high=(0.0, 0.5, 0.02),
         goal_low=(-0.2, 0.6, 0.02),
         goal_high=(0.2, 0.8, 0.02),
         rew_mode='posPlace',
         *args, **kwargs)
     FlatGoalEnv.__init__(self, sawyer_env, obs_keys=['state_observation'], goal_keys=['state_desired_goal'])
Example #3
def build_env(env_id):
    assert env_id != "", "Unspecified environment."
    env = gym.make(env_id)
    if env_id == "SawyerPushAndReachEnvEasy-v0":
        env = FlatGoalEnv(ImageEnv(env, transpose=True),
                          obs_keys=['image_observation'],
                          append_goal_to_obs=True)
        env._max_episode_steps = 50

    return env
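A minimal usage sketch for the factory above, assuming the multiworld Sawyer environments were registered with gym before build_env is called:

env = build_env("SawyerPushAndReachEnvEasy-v0")
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())  # flat observation with the goal appended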
Example #4
def run_sac(base_expl_env, base_eval_env, variant):
    expl_env = FlatGoalEnv(base_expl_env, append_goal_to_obs=True)
    eval_env = FlatGoalEnv(base_eval_env, append_goal_to_obs=True)
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant["layer_size"]
    num_hidden = variant["num_hidden_layers"]
    qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     hidden_sizes=[M] * num_hidden)
    qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     hidden_sizes=[M] * num_hidden)
    target_qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            hidden_sizes=[M] * num_hidden)
    target_qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            hidden_sizes=[M] * num_hidden)
    policy = TanhGaussianPolicy(obs_dim=obs_dim,
                                action_dim=action_dim,
                                hidden_sizes=[M] * num_hidden)
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant["replay_buffer_size"],
        expl_env,
    )
    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant["trainer_kwargs"])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"])
    algorithm.train()
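A hedged sketch of the variant dictionary that run_sac indexes above. The top-level keys follow directly from the function body; the concrete values and the inner keys are illustrative assumptions based on rlkit's SACTrainer and TorchBatchRLAlgorithm signatures:

variant = dict(
    layer_size=256,
    num_hidden_layers=2,
    replay_buffer_size=int(1e6),
    trainer_kwargs=dict(
        discount=0.99,
        soft_target_tau=5e-3,
        policy_lr=3e-4,
        qf_lr=3e-4,
        use_automatic_entropy_tuning=True,
    ),
    algorithm_kwargs=dict(
        num_epochs=100,
        batch_size=256,
        max_path_length=50,
        num_trains_per_train_loop=1000,
        num_expl_steps_per_train_loop=1000,
        num_eval_steps_per_epoch=1000,
        min_num_steps_before_training=1000,
    ),
)
# run_sac(expl_env, eval_env, variant)  # expl_env / eval_env: goal-conditioned multiworld-style envs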
Example #5
def train_distilled_policy(num_tasks,
                           policies=None,
                           epochs_per_task=500,
                           batch_size=100,
                           lr=1e-3):
    """
    Trains a distilled policy (using an optimal expert or a trained expert).
    Saves the policy in a .pkl file along with the env and the loss history.

    :param num_tasks: Number of tasks/policies to distill.
    :param policies: A list of length `num_tasks` containing all the individual experts.
    :param epochs_per_task: Number of training epochs per task.
    :param batch_size: Batch sample size per update step.
    :param lr: Learning rate of the optimizer.
    :return: The trained policy and the environment.
    """
    base_env = PointMassEnv(n=num_tasks)
    env = FlatGoalEnv(base_env, append_goal_to_obs=True)
    obs_dim = env.observation_space.low.size
    act_dim = env.action_space.low.size

    policy = TanhMlpPolicy(input_size=obs_dim,
                           output_size=act_dim,
                           hidden_sizes=[64, 64]
                           # hidden_sizes=[64, 64, 64]
                           )

    loss_history = []
    criterion = nn.MSELoss()
    optim = Adam(policy.parameters(), lr=lr)
    for epoch in range(epochs_per_task * num_tasks):
        if policies:
            assert len(policies) == num_tasks, "Number of expert policies needs " \
                                               "to be equal to the number of tasks"
        obs, act_labels = get_batch(env, batch_size, policies)

        obs_var, act_labels_var = Variable(torch.from_numpy(obs)).float(), \
                                  Variable(torch.from_numpy(act_labels)).float()
        acts = policy(obs_var)

        optim.zero_grad()
        loss = criterion(acts, act_labels_var)
        loss.backward()
        optim.step()

        loss_val = loss.data.item()
        loss_history.append(loss_val)
        if epoch % 50 == 0:
            print("epoch: {0} \t loss: {1}".format(epoch, loss_val))

    print("FINAL loss: {1}".format(epoch, loss.data.item()))
    out = dict(policy=policy, env=env, loss_history=loss_history)
    appended_path = "-from_expert_policies" if policies else ""
    path = "./logs/policy-distillation/model-{0}{1}.pkl".format(
        num_tasks, appended_path)
    with open(path, "wb") as f:
        pickle.dump(out, f, protocol=pickle.HIGHEST_PROTOCOL)

    return policy, env
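train_distilled_policy relies on a get_batch helper that is not shown in this example. A purely hypothetical sketch of what it might look like, assuming each expert exposes rlkit's get_action interface, that the wrapped env has a task_id attribute, and that an analytic expert helper exists when no policies are passed (the real helper may sample quite differently):

import numpy as np

def get_batch(env, batch_size, policies=None):
    # Hypothetical sketch: roll the flat env forward and label every observation
    # with the expert action for the currently active task.
    obs_batch, act_batch = [], []
    obs = env.reset()
    for _ in range(batch_size):
        if policies:
            task_id = env.wrapped_env.task_id            # assumed attribute of PointMassEnv
            act, _ = policies[task_id].get_action(obs)   # rlkit-style policy interface
        else:
            act = env.wrapped_env.optimal_action(obs)    # assumed analytic expert helper
        obs_batch.append(obs)
        act_batch.append(act)
        obs, _, done, _ = env.step(act)
        if done:
            obs = env.reset()
    return np.array(obs_batch), np.array(act_batch)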
Example #6
def flatten_multiworld_env(env):
    from multiworld.core.flat_goal_env import FlatGoalEnv
    flat_env = FlatGoalEnv(env,
                           obs_keys=['image_observation'],
                           goal_keys=['image_desired_goal'],
                           append_goal_to_obs=True)
    env = GymAdapter(env=flat_env)
    return env
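A minimal usage sketch, assuming image_env is a multiworld environment already wrapped in ImageEnv (so the 'image_observation' and 'image_desired_goal' keys exist) and that softlearning's GymAdapter exposes the usual reset/step interface:

flat_env = flatten_multiworld_env(image_env)
obs = flat_env.reset()                                   # flattened image observation with the image goal appended
obs, rew, done, info = flat_env.step(flat_env.action_space.sample())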
Example #7
    def __init__(self, use_hand_cam=False):
        self.use_hand_cam = use_hand_cam

        # Transformation matrix from camera's frame -> base frame
        # self.TRANSFORMATION_MATRIX = np.array([[0.11491126, 0.88002959, -0.46080724, 1.0704251219017176],
        #                                        [0.99326509, -0.0948642, 0.06652247, 0.02981537521689703],
        #                                        [0.01482763, -0.46534793, -0.88500364, 0.6268248987975156],
        #                                        [0., 0., 0., 1.]])
        self.TRANSFORMATION_MATRIX = np.array(
            [[-0.15316623, 0.86485568, -0.47808446, 1.06231099],
             [0.97058596, 0.22259649, 0.09172615, -0.08591922],
             [0.18574981, -0.44997272, -0.87351105, 0.62519807],
             [0., 0., 0., 1.]])

        self.angle_defaul_cam = [
            -0.9347021484375, -0.066611328125, -2.09948828125,
            -2.4536884765625, -1.90233984375, -2.909759765625, -2.622689453125
        ]
        self.angle_init_for_grasp = [
            0.51219827, -0.35472363, -0.69057131, 1.43175006, -2.19978213,
            -0.83249319, -1.90052831
        ]
        self.angle_for_place_object = [
            -0.34514549, 0.24693164, -1.2170068, 1.22242475, 1.65923345,
            1.15603614, 0.06596191
        ]
        self.msg_close = True
        self.msg_open = False

        env = SawyerReachXYZEnv(
            action_mode='position',
            position_action_scale=0.1,
            config_name='austri_config',
            reset_free=False,
            max_speed=0.05,
            fix_goal=True,
        )
        self.env = FlatGoalEnv(env, append_goal_to_obs=True)

        os.system('clear')
        print('[AIM-INFO] Initializing robotic grasping...')
        for _ in range(5):
            self.move_to_angle(angle=self.angle_init_for_grasp, duration=2)
        print('[AIM-INFO] Initialization done.')
Example #8
def experiment(variant):
    env = Point2DEnv(**variant['env_kwargs'])
    env = FlatGoalEnv(env)
    env = NormalizedBoxEnv(env)

    action_dim = int(np.prod(env.action_space.shape))
    obs_dim = int(np.prod(env.observation_space.shape))

    qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     **variant['qf_kwargs'])
    qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     **variant['qf_kwargs'])
    target_qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            **variant['qf_kwargs'])
    target_qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            **variant['qf_kwargs'])
    policy = TanhGaussianPolicy(obs_dim=obs_dim,
                                action_dim=action_dim,
                                **variant['policy_kwargs'])
    eval_env = expl_env = env

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = TwinSACTrainer(env=eval_env,
                             policy=policy,
                             qf1=qf1,
                             qf2=qf2,
                             target_qf1=target_qf1,
                             target_qf2=target_qf2,
                             **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #9
def create_image_48_sawyer_push_forward_v0():
    from multiworld.core.flat_goal_env import FlatGoalEnv
    from multiworld.core.image_env import ImageEnv
    from multiworld.envs.mujoco.cameras import sawyer_pusher_camera_upright_v2
    image_env = ImageEnv(
        wrapped_env=gym.make('BaseSawyerPushForwardEnv-v0'),
        imsize=48,
        init_camera=sawyer_pusher_camera_upright_v2,
        normalize=True,
        )
    return FlatGoalEnv(image_env, obs_keys=['image_observation'])
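Factories like the one above are typically exposed through gym registration. A hedged sketch (the id shown here is hypothetical; multiworld's own registration code may differ):

import gym
from gym.envs.registration import register

register(
    id='Image48SawyerPushForwardFlat-v0',                 # hypothetical id
    entry_point=create_image_48_sawyer_push_forward_v0,   # gym also accepts a callable entry point
)
env = gym.make('Image48SawyerPushForwardFlat-v0')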
Example #10
def create_image_48_sawyer_pick_and_place_v0():
    from multiworld.core.flat_goal_env import FlatGoalEnv
    from multiworld.core.image_env import ImageEnv
    from multiworld.envs.mujoco.cameras import sawyer_pick_and_place_camera_zoomed
    wrapped_env = gym.make('BaseSawyerPickAndPlaceEnv-v0')
    state_desired_goal = wrapped_env.fixed_goal
    goal_dim = len(state_desired_goal)
    imsize = 48
    image_env = ImageEnv(
        wrapped_env=wrapped_env,
        imsize=imsize,
        init_camera=sawyer_pick_and_place_camera_zoomed,
        normalize=True,
        presampled_goals={'state_desired_goal': state_desired_goal.reshape(1,goal_dim),
                          'image_desired_goal': np.zeros((1, imsize*imsize*3))},
        )
    return FlatGoalEnv(image_env, obs_keys=['image_observation'])
Example #11
def run_task(*_):
    env = FlatGoalEnv(SawyerPickEnv(), obs_keys=["state_observation"])
    env = TfEnv(normalize(env))

    policy = GaussianMLPPolicy(name="policy",
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=4000,
                max_path_length=500,
                n_itr=500,
                discount=0.99,
                step_size=0.01,
                plot=True)
    algo.train()
Example #12
def run_task(*_):
    with LocalRunner() as runner:
        env = FlatGoalEnv(SawyerReachXYZEnv(), obs_keys=["state_observation"])
        env = TfEnv(normalize(env))

        policy = GaussianMLPPolicy(name="policy",
                                   env_spec=env.spec,
                                   hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env=env,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=500,
                    discount=0.99,
                    max_kl_step=0.01)

        runner.setup(algo, env)
        runner.train(n_epochs=500, batch_size=4000, plot=True)
Example #13
def create_image_48_sawyer_door_pull_hook_v0():
    from multiworld.core.flat_goal_env import FlatGoalEnv
    from multiworld.core.image_env import ImageEnv
    from multiworld.envs.mujoco.cameras import sawyer_door_env_camera_v0
    import numpy as np

    wrapped_env = gym.make('BaseSawyerDoorHookEnv-v0')
    imsize = 48
    imsize_flat = imsize * imsize * 3
    image_env = ImageEnv(
        wrapped_env=wrapped_env,
        imsize=imsize,
        init_camera=sawyer_door_env_camera_v0,
        normalize=True,
        presampled_goals={
            'state_desired_goal': np.expand_dims(wrapped_env.fixed_goal, axis=0),
            'image_desired_goal': np.zeros((1, imsize_flat)),
        },
        non_presampled_goal_img_is_garbage=True,
    )
    return FlatGoalEnv(image_env, obs_keys=['image_observation'])
Example #14
 def __init__(self,
              steps_needed_to_solve,
              planning_horizon,
              task_horizon_factor=2):
     env = gym.make("PointmassUWallTrainEnvBig-v1")
     env.action_scale = self.PATH_LENGTH_TO_SOLVE / steps_needed_to_solve
     env = FlatGoalEnv(env, append_goal_to_obs=True)
     PointmassUWallConfigModule.TASK_HORIZON = int(task_horizon_factor *
                                                   steps_needed_to_solve)
     PointmassUWallConfigModule.PLAN_HOR = planning_horizon
     PointmassUWallConfigModule.NROLLOUTS_PER_ITER = math.ceil(
         PointmassUWallConfigModule.NUM_STEPS_TOTAL /
         (PointmassUWallConfigModule.TASK_HORIZON *
          PointmassUWallConfigModule.NTRAIN_ITERS))
     print('-------------')
     print("task horizon", PointmassUWallConfigModule.TASK_HORIZON)
     print("plan horizon", PointmassUWallConfigModule.PLAN_HOR)
     print("nrolls per iter", PointmassUWallConfigModule.NROLLOUTS_PER_ITER)
     print("action_scale", env.wrapped_env.action_scale)
     print('-------------')
     self.ENV = env
     cfg = tf.ConfigProto()
     cfg.gpu_options.allow_growth = True
     self.SESS = tf.Session(config=cfg)
     self.NN_TRAIN_CFG = {"epochs": 2}
     self.OPT_CFG = {
         "Random": {
             "popsize": 10
         },
         "CEM": {
             "popsize": 5,
             "num_elites": 2,
             "max_iters": 2,
             "alpha": 0.1,
         }
     }
     self.UPDATE_FNS = []
Example #15
def experiment(variant, comet_exp_key=None):
    if comet_exp_key is not None:
        from rllab.misc.comet_logger import CometContinuedLogger, CometLogger
        from comet_ml import Experiment, ExistingExperiment
        # comet_log = CometContinuedLogger(api_key="KWwx7zh6I2uw6oQMkpEo3smu0", previous_experiment_key=variant['comet_exp_key'])
        comet_log = ExistingExperiment(api_key="KWwx7zh6I2uw6oQMkpEo3smu0", previous_experiment=variant['comet_exp_key'])
        # comet_log = CometLogger(api_key="KWwx7zh6I2uw6oQMkpEo3smu0",
        #                     project_name="ml4l3", workspace="glenb")
        comet_log.set_name("test seq train")
        # comet_log = comet_exp_key
        print(comet_log)
    else:
        comet_log = None
    print("loading libraries")
    from sandbox.rocky.tf.algos.maml_il import MAMLIL

    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from rllab.baselines.gaussian_mlp_baseline import GaussianMLPBaseline
    from rllab.baselines.maml_gaussian_mlp_baseline import MAMLGaussianMLPBaseline
    from rllab.baselines.zero_baseline import ZeroBaseline
    from rllab.envs.normalized_env import normalize
    from rllab.misc.instrument import stub, run_experiment_lite
    from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy import MAMLGaussianMLPPolicy as basic_policy
    # from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_adaptivestep import MAMLGaussianMLPPolicy as fullAda_basic_policy
    from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_adaptivestep_ppo import \
        MAMLGaussianMLPPolicy as PPO_policy
    from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_adaptivestep_biastransform import \
        MAMLGaussianMLPPolicy as fullAda_Bias_policy
    from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_biasonlyadaptivestep_biastransform import \
        MAMLGaussianMLPPolicy as biasAda_Bias_policy
    from sandbox.rocky.tf.policies.maml_minimal_conv_gauss_mlp_policy import MAMLGaussianMLPPolicy as conv_policy
    
    from sandbox.rocky.tf.optimizers.quad_dist_expert_optimizer import QuadDistExpertOptimizer
    from sandbox.rocky.tf.optimizers.first_order_optimizer import FirstOrderOptimizer
    from sandbox.rocky.tf.envs.base import TfEnv
    import sandbox.rocky.tf.core.layers as L
    
    from rllab.envs.mujoco.ant_env_rand_goal_ring import AntEnvRandGoalRing
    from multiworld.envs.mujoco.sawyer_xyz.push.sawyer_push import SawyerPushEnv
    from multiworld.envs.mujoco.sawyer_xyz.pickPlace.sawyer_pick_and_place import SawyerPickPlaceEnv
    from multiworld.envs.mujoco.sawyer_xyz.door.sawyer_door_open import SawyerDoorOpenEnv
    from multiworld.core.flat_goal_env import FlatGoalEnv
    from multiworld.core.finn_maml_env import FinnMamlEnv
    from multiworld.core.wrapper_env import NormalizedBoxEnv
    
    import tensorflow as tf
    import time
    from rllab.envs.gym_env import GymEnv
    
    from maml_examples.maml_experiment_vars import MOD_FUNC
    import numpy as np
    import random as rd
    import pickle
    
    print ("Done loading libraries")
    
    seed = variant['seed'];
    n_parallel = 1;
    log_dir = variant['log_dir']

    x=0
    setup(seed, n_parallel, log_dir)
    fast_batch_size = variant['fbs'];
    meta_batch_size = variant['mbs']
    adam_steps = variant['adam_steps'];
    max_path_length = variant['max_path_length']
    dagger = variant['dagger'];
    expert_policy_loc = variant['expert_policy_loc']
    ldim = variant['ldim'];
    init_flr = variant['init_flr'];
    policyType = variant['policyType'];
    use_maesn = variant['use_maesn']
    EXPERT_TRAJ_LOCATION = variant['expertDataLoc']
    envType = variant['envType']
    tasksFile = path_to_multiworld + 'multiworld/envs/goals/' + variant['tasksFile'] + '.pkl'
    all_tasks = pickle.load(open(tasksFile, 'rb'))
    assert meta_batch_size <= len(all_tasks), \
        "meta_batch_size ({}) must not exceed the number of tasks ({})".format(
            meta_batch_size, len(all_tasks))
    tasks = all_tasks[:meta_batch_size]
    print("^^^^^^^^^^^^^^^^ meta_tasks: ", tasks, " ^^^^^^^^^^^^^^^^ ")

    use_images = 'conv' in policyType

    if 'Push' == envType:
        baseEnv = SawyerPushEnv(tasks=tasks, image=use_images, mpl=max_path_length)

    elif envType == 'sparsePush':
        baseEnv = SawyerPushEnv(tasks=tasks, image=use_images, mpl=max_path_length, rewMode='l2Sparse')


    elif 'PickPlace' in envType:
        baseEnv = SawyerPickPlaceEnv(tasks=tasks, image=use_images, mpl=max_path_length)

    elif 'Door' in envType:
        baseEnv = SawyerDoorOpenEnv(tasks=tasks, image=use_images, mpl=max_path_length)

    elif 'Ant' in envType:
        env = TfEnv(normalize(AntEnvRandGoalRing()))

    elif 'claw' in envType:
        env = TfEnv(DClawScrewRandGoal())

    else:
        assert False, 'Unsupported envType: ' + envType

    if envType in ['Push', 'PickPlace', 'Door']:
        if use_images:
            obs_keys = ['img_observation']
        else:
            obs_keys = ['state_observation']
        env = TfEnv(NormalizedBoxEnv(FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=obs_keys), reset_mode='idx')))

    algoClass = MAMLIL
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    load_policy = variant['load_policy']

    if load_policy != None:
        policy = None
        load_policy = variant['load_policy']
        # if 'conv' in load_policy:
        #     baseline = ZeroBaseline(env_spec=env.spec)

    elif 'fullAda_PPO' in policyType:

        policy = PPO_policy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            init_flr_full=init_flr,
            latent_dim=ldim
        )
        
    elif 'fullAda_Bias' in policyType:

        policy = fullAda_Bias_policy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            init_flr_full=init_flr,
            latent_dim=ldim
        )

    elif 'biasAda_Bias' in policyType:

        policy = biasAda_Bias_policy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            init_flr_full=init_flr,
            latent_dim=ldim
        )

    elif 'basic' in policyType:
        policy = basic_policy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        )


    elif 'conv' in policyType:

        baseline = ZeroBaseline(env_spec=env.spec)

        policy = conv_policy(
            name="policy",
            latent_dim=ldim,
            policyType=policyType,
            env_spec=env.spec,
            init_flr=init_flr,

            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        )

    print("|||||||||||||||||||||||||||||||||||||||||||||||", variant['n_itr'])
    
    beta_steps = 1
    meta_step_size = 0.01
    num_grad_updates = 1
    pre_std_modifier = 1.0
    post_std_modifier = 0.00001
    limit_demos_num = None

    algo = algoClass(
        env=env,
        policy=policy,
        load_policy=load_policy,
        baseline=baseline,
        batch_size=fast_batch_size,  # number of trajs for alpha grad update
        max_path_length=max_path_length,
        meta_batch_size=meta_batch_size,  # number of tasks sampled for beta grad update
        num_grad_updates=num_grad_updates,  # number of alpha grad updates
        n_itr=variant['n_itr'],
        make_video=False,
        use_maml=True,
        use_pooled_goals=True,
        use_corr_term=use_corr_term,
        test_on_training_goals=test_on_training_goals,
        metalearn_baseline=False,
        # metalearn_baseline=False,
        limit_demos_num=limit_demos_num,
        test_goals_mult=1,
        step_size=meta_step_size,
        plot=False,
        beta_steps=beta_steps,
        adam_curve=None,
        adam_steps=adam_steps,
        pre_std_modifier=pre_std_modifier,
        l2loss_std_mult=l2loss_std_mult,
        importance_sampling_modifier=MOD_FUNC[''],
        post_std_modifier=post_std_modifier,
        expert_trajs_dir=EXPERT_TRAJ_LOCATION,
        expert_trajs_suffix='',
        seed=seed,
        extra_input=extra_input,
        extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        plotDirPrefix=None,
        latent_dim=ldim,
        dagger=dagger,
        expert_policy_loc=expert_policy_loc,
        comet_logger=comet_log,
        outerIteration=variant['outer_Iteration'],
        use_ppo=True
    )

    algo.train()
Example #16
 def __init__(self):
     super().__init__()
     env = gym.make('Point2DFixedGoalEnv-v0')
     env = FlatGoalEnv(env, append_goal_to_obs=False)
     self.ENV = env
Example #17
from rllab.envs.normalized_env import normalize
from rllab.misc.instrument import stub, run_experiment_lite
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from multiworld.envs.mujoco.sawyer_xyz.push.sawyer_push import SawyerPushEnv

from sandbox.rocky.tf.envs.base import TfEnv
# The loop below also uses GaussianMLPPolicy and TRPO; these imports (from the rllab
# sandbox, assuming the usual module layout) appear to be what the snippet expects.
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.algos.trpo import TRPO
from multiworld.core.flat_goal_env import FlatGoalEnv
from multiworld.core.finn_maml_env import FinnMamlEnv
from multiworld.core.wrapper_env import NormalizedBoxEnv

stub(globals())
rate = 0.01
mode = 'local'

import tensorflow as tf
for goal in range(1, 100):
    baseEnv = FlatGoalEnv(SawyerPushEnv(tasks=None),
                          obs_keys=['state_observation'])
    env = TfEnv(NormalizedBoxEnv(FinnMamlEnv(baseEnv, reset_mode='task')))
    #env = WheeledEnvGoal()

    env = TfEnv(env)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_nonlinearity=tf.nn.relu,
                               hidden_sizes=(100, 100))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=20000,
Example #18
def encoder_wrapped_env(variant):
    representation_size = 128
    output_classes = 20

    model_class = variant.get('model_class', TimestepPredictionModel)
    model = model_class(
        representation_size,
        # decoder_output_activation=decoder_activation,
        output_classes=output_classes,
        **variant['model_kwargs'],
    )
    # model = torch.nn.DataParallel(model)

    model_path = variant.get("model_path")
    # model = load_local_or_remote_file(model_path)
    state_dict = torch.load(model_path)
    model.load_state_dict(state_dict)
    model.to(ptu.device)
    model.eval()

    traj = np.load(variant.get("desired_trajectory"), allow_pickle=True)[0]

    goal_image = traj["observations"][-1]["image_observation"]
    goal_image = goal_image.reshape(1, 3, 500, 300).transpose([0, 1, 3, 2]) / 255.0
    # goal_image = goal_image.reshape(1, 300, 500, 3).transpose([0, 3, 1, 2]) / 255.0 # BECAUSE RLBENCH DEMOS ARENT IMAGE_ENV WRAPPED
    # goal_image = goal_image[:, :, :240, 60:500]
    goal_image = goal_image[:, :, 60:, 60:500]
    goal_image_pt = ptu.from_numpy(goal_image)
    save_image(goal_image_pt.data.cpu(), 'gitignore/goal.png', nrow=1)
    goal_latent = model.encode(goal_image_pt).detach().cpu().numpy().flatten()

    initial_image = traj["observations"][0]["image_observation"]
    initial_image = initial_image.reshape(1, 3, 500, 300).transpose([0, 1, 3, 2]) / 255.0
    # initial_image = initial_image.reshape(1, 300, 500, 3).transpose([0, 3, 1, 2]) / 255.0
    # initial_image = initial_image[:, :, :240, 60:500]
    initial_image = initial_image[:, :, 60:, 60:500]
    initial_image_pt = ptu.from_numpy(initial_image)
    save_image(initial_image_pt.data.cpu(), 'gitignore/initial.png', nrow=1)
    initial_latent = model.encode(initial_image_pt).detach().cpu().numpy().flatten()

    # Move these to td3_bc and bc_v3 (or at least type for reward_params)
    reward_params = dict(
        goal_latent=goal_latent,
        initial_latent=initial_latent,
        type=variant["reward_params_type"],
    )

    config_params = variant.get("config_params")

    env = variant['env_class'](**variant['env_kwargs'])
    env = ImageEnv(env,
        recompute_reward=False,
        transpose=True,
        image_length=450000,
        reward_type="image_distance",
        # init_camera=sawyer_pusher_camera_upright_v2,
    )
    env = EncoderWrappedEnv(
        env,
        model,
        reward_params,
        config_params,
        **variant.get("encoder_wrapped_env_kwargs", dict())
    )
    env = FlatGoalEnv(env, obs_keys=["state_observation", ])

    return env
Example #19
def _pointmass_fixed_goal_experiment(vae_latent_size,
                                     replay_buffer_size,
                                     cnn_kwargs,
                                     vae_kwargs,
                                     policy_kwargs,
                                     qf_kwargs,
                                     e2e_trainer_kwargs,
                                     sac_trainer_kwargs,
                                     algorithm_kwargs,
                                     eval_path_collector_kwargs=None,
                                     expl_path_collector_kwargs=None,
                                     **kwargs):
    if expl_path_collector_kwargs is None:
        expl_path_collector_kwargs = {}
    if eval_path_collector_kwargs is None:
        eval_path_collector_kwargs = {}
    from multiworld.core.image_env import ImageEnv
    from multiworld.envs.pygame.point2d import Point2DEnv
    from multiworld.core.flat_goal_env import FlatGoalEnv
    env = Point2DEnv(
        images_are_rgb=True,
        render_onscreen=False,
        show_goal=False,
        ball_radius=2,
        render_size=48,
        fixed_goal=(0, 0),
    )
    env = ImageEnv(env, imsize=env.render_size, transpose=True, normalize=True)
    env = FlatGoalEnv(env)  #, append_goal_to_obs=True)
    input_width, input_height = env.image_shape

    action_dim = int(np.prod(env.action_space.shape))
    vae = ConvVAE(
        representation_size=vae_latent_size,
        input_channels=3,
        imsize=input_width,
        decoder_output_activation=nn.Sigmoid(),
        # decoder_distribution='gaussian_identity_variance',
        **vae_kwargs)
    encoder = Vae2Encoder(vae)

    def make_cnn():
        return networks.CNN(input_width=input_width,
                            input_height=input_height,
                            input_channels=3,
                            output_conv_channels=True,
                            output_size=None,
                            **cnn_kwargs)

    def make_qf():
        return networks.MlpQfWithObsProcessor(
            obs_processor=nn.Sequential(encoder, networks.Flatten()),
            output_size=1,
            input_size=action_dim + vae_latent_size,
            **qf_kwargs)

    qf1 = make_qf()
    qf2 = make_qf()
    target_qf1 = make_qf()
    target_qf2 = make_qf()
    action_dim = int(np.prod(env.action_space.shape))
    policy_cnn = make_cnn()
    policy = TanhGaussianPolicyAdapter(
        nn.Sequential(policy_cnn, networks.Flatten()),
        policy_cnn.conv_output_flat_size, action_dim, **policy_kwargs)
    eval_env = expl_env = env

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(eval_env, eval_policy,
                                           **eval_path_collector_kwargs)
    replay_buffer = EnvReplayBuffer(
        replay_buffer_size,
        expl_env,
    )
    vae_trainer = VAETrainer(vae)
    sac_trainer = SACTrainer(env=eval_env,
                             policy=policy,
                             qf1=qf1,
                             qf2=qf2,
                             target_qf1=target_qf1,
                             target_qf2=target_qf2,
                             **sac_trainer_kwargs)
    trainer = End2EndSACTrainer(
        sac_trainer=sac_trainer,
        vae_trainer=vae_trainer,
        **e2e_trainer_kwargs,
    )
    expl_path_collector = MdpPathCollector(expl_env, policy,
                                           **expl_path_collector_kwargs)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **algorithm_kwargs)
    algorithm.to(ptu.device)
    algorithm.train()
Example #20
class ObjectGrasping:
    """
    This class provides the utilities to grasp object including:
        - Find object's position based on /ar_pose_marker service
        - Move joints to specific angles
        - Move end-effector to specific position by using policy learned by RL algorithm
    """
    def __init__(self, use_hand_cam=False):
        self.use_hand_cam = use_hand_cam

        # Transformation matrix from camera's frame -> base frame
        # self.TRANSFORMATION_MATRIX = np.array([[0.11491126, 0.88002959, -0.46080724, 1.0704251219017176],
        #                                        [0.99326509, -0.0948642, 0.06652247, 0.02981537521689703],
        #                                        [0.01482763, -0.46534793, -0.88500364, 0.6268248987975156],
        #                                        [0., 0., 0., 1.]])
        self.TRANSFORMATION_MATRIX = np.array(
            [[-0.15316623, 0.86485568, -0.47808446, 1.06231099],
             [0.97058596, 0.22259649, 0.09172615, -0.08591922],
             [0.18574981, -0.44997272, -0.87351105, 0.62519807],
             [0., 0., 0., 1.]])

        self.angle_defaul_cam = [
            -0.9347021484375, -0.066611328125, -2.09948828125,
            -2.4536884765625, -1.90233984375, -2.909759765625, -2.622689453125
        ]
        self.angle_init_for_grasp = [
            0.51219827, -0.35472363, -0.69057131, 1.43175006, -2.19978213,
            -0.83249319, -1.90052831
        ]
        self.angle_for_place_object = [
            -0.34514549, 0.24693164, -1.2170068, 1.22242475, 1.65923345,
            1.15603614, 0.06596191
        ]
        self.msg_close = True
        self.msg_open = False

        env = SawyerReachXYZEnv(
            action_mode='position',
            position_action_scale=0.1,
            config_name='austri_config',
            reset_free=False,
            max_speed=0.05,
            fix_goal=True,
        )
        self.env = FlatGoalEnv(env, append_goal_to_obs=True)

        os.system('clear')
        print('[AIM-INFO] Initializing robotic grasping...')
        for _ in range(5):
            self.move_to_angle(angle=self.angle_init_for_grasp, duration=2)
        print('[AIM-INFO] Initialization done.')

    def go_to_camera_view_position(self):
        duration = 2
        self.move_to_angle(self.angle_defaul_cam, duration)

    def go_to_place_position(self):
        duration = 7
        self.move_to_angle(self.angle_for_place_object, duration)

    def move_to_angle(self, angle, duration):
        rospy.wait_for_service('angle_action')
        try:
            execute_action = rospy.ServiceProxy('angle_action',
                                                angle_action,
                                                persistent=True)
            execute_action(angle, duration)
            return None
        except rospy.ServiceException as e:
            print('[AIM-ERROR] Error when moving to angle:', angle, '-', e)

    def locate_object(self):
        service_name = "/locate_object"
        service = rospy.ServiceProxy(service_name, target)
        service.wait_for_service()
        print("[AIM-INFO] Connect to service {} successfully.".format(
            service_name))

        while True:
            req = targetRequest()
            req.data = 0
            resp = service.call(req)
            if resp.pose:
                print('[AIM-INFO] Object detected')
                break
            elif self.use_hand_cam:
                print('[AIM-INFO] Cannot detect object...')
                self.go_to_camera_view_position()
            else:
                print('[AIM-INFO] Cannot detect object...')

        return resp.pose

    def get_object_location(self):
        obj_pos_cam_frame = self.locate_object()  # w.r.t. camera frame

        print(
            "[AIM-DEBUG] Object in camera frame: (%.4f, %.4f, %.4f)" %
            (obj_pos_cam_frame[0], obj_pos_cam_frame[1], obj_pos_cam_frame[2]))
        if self.use_hand_cam:
            obj_pos_based_frame = list(obj_pos_cam_frame)
        else:
            obj_pos_homo = np.hstack([obj_pos_cam_frame, 1])
            obj_pos_based_frame = np.matmul(self.TRANSFORMATION_MATRIX,
                                            obj_pos_homo)

        print("[AIM-DEBUG] Object in based frame: (%.4f, %.4f, %.4f)" %
              (obj_pos_based_frame[0], obj_pos_based_frame[1],
               obj_pos_based_frame[2]))
        obj_pos_based_frame[2] = obj_pos_based_frame[2] + 0.15
        print(
            "[AIM-DEBUG] Object in based frame with offset: (%.4f, %.4f, %.4f)"
            % (obj_pos_based_frame[0], obj_pos_based_frame[1],
               obj_pos_based_frame[2]))

        return list(obj_pos_based_frame[:3])

    def request_grasp(self, data):
        rospy.wait_for_service('grasping')
        execute_action = rospy.ServiceProxy('grasping',
                                            grasping,
                                            persistent=True)
        execute_action(data)

    def move_to_pos(self, goal):
        self.env.wrapped_env._state_goal = np.array(goal)
        print('[AIM-INFO] Moving to reset position...')
        for _ in range(5):
            self.env.reset()
        print('[AIM-INFO] Starting move to target position...')
        run_policy(self.env, get_action, 15, 1, False, grasp=True)
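A hedged usage sketch of the class above, assuming the angle_action, grasping, and /locate_object ROS services are running and that get_action has been loaded as in the evaluation script shown in Example #28:

grasper = ObjectGrasping(use_hand_cam=False)
target_pos = grasper.get_object_location()    # object position in the robot base frame (z offset applied)
grasper.move_to_pos(target_pos)               # the learned reaching policy drives the end-effector
grasper.request_grasp(grasper.msg_close)      # close the gripper
grasper.go_to_place_position()
grasper.request_grasp(grasper.msg_open)       # release the object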
Example #21
def experiment(variant):
    base_expl_env = PointMassEnv(n=variant["num_tasks"],
                                 reward_type=variant["reward_type"])
    expl_env = FlatGoalEnv(base_expl_env, append_goal_to_obs=True)

    base_eval_env = PointMassEnv(n=variant["num_tasks"],
                                 reward_type=variant["reward_type"])
    eval_env = FlatGoalEnv(base_eval_env, append_goal_to_obs=True)
    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size

    print(expl_env.observation_space, expl_env.action_space)
    qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     **variant['qf_kwargs'])
    qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     **variant['qf_kwargs'])
    target_qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            **variant['qf_kwargs'])
    target_qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            **variant['qf_kwargs'])
    policy = TanhMlpPolicy(input_size=obs_dim,
                           output_size=action_dim,
                           **variant['policy_kwargs'])
    target_policy = TanhMlpPolicy(input_size=obs_dim,
                                  output_size=action_dim,
                                  **variant['policy_kwargs'])
    es = GaussianStrategy(
        action_space=expl_env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # Constant sigma
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    eval_path_collector = MdpPathCollector(
        eval_env,
        policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        exploration_policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = TD3Trainer(policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         target_policy=target_policy,
                         **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.train()
Example #22
def experiment(variant):

    seed = variant['seed']
    n_parallel = variant['n_parallel']
    log_dir = variant['log_dir']
    setup(seed, n_parallel, log_dir)

    fast_learning_rate = variant['flr']

    fast_batch_size = variant[
        'fbs']  # 10 works for [0.1, 0.2], 20 doesn't improve much for [0,0.2]
    meta_batch_size = 20  # 10 also works, but much less stable, 20 is fairly stable, 40 is more stable
    max_path_length = 150
    num_grad_updates = 1
    meta_step_size = variant['mlr']

    tasksFile = '/root/code/multiworld/multiworld/envs/goals/Door_60X20X20.pkl'

    tasks = pickle.load(open(tasksFile, 'rb'))

    baseEnv = SawyerDoorOpenEnv(tasks=tasks)

    env = FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=['state_observation']))

    env = TfEnv(NormalizedBoxEnv(env))

    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        grad_step_size=fast_learning_rate,
        hidden_nonlinearity=tf.nn.relu,
        hidden_sizes=variant['hidden_sizes'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = MAMLTRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=fast_batch_size,  # number of trajs for grad update
        max_path_length=max_path_length,
        meta_batch_size=meta_batch_size,
        num_grad_updates=num_grad_updates,
        n_itr=1000,
        use_maml=True,
        step_size=meta_step_size,
        plot=False,
    )

    # import os

    # saveDir = variant['saveDir']

    # if os.path.isdir(saveDir)==False:
    #     os.mkdir(saveDir)

    # logger.set_snapshot_dir(saveDir)
    # #logger.set_snapshot_gap(20)
    # logger.add_tabular_output(saveDir+'progress.csv')

    algo.train()
Example #23
def experiment(variant):

    seed = variant['seed']
    n_parallel = variant['n_parallel']
    log_dir = variant['log_dir']

    setup(seed, n_parallel, log_dir)
    expertDataLoc = variant['expertDataLoc']
    expertDataItr = variant['expertDataItr']

    fast_learning_rate = variant['flr']

    fast_batch_size = variant[
        'fbs']  # 10 works for [0.1, 0.2], 20 doesn't improve much for [0,0.2]
    meta_batch_size = 20  # 10 also works, but much less stable, 20 is fairly stable, 40 is more stable
    max_path_length = 150
    num_grad_updates = 1
    meta_step_size = variant['mlr']

    regionSize = variant['regionSize']

    if regionSize == '20X20':
        tasksFile = '/root/code/multiworld/multiworld/envs/goals/pickPlace_20X20_v1.pkl'

    else:
        assert regionSize == '60X30'
        tasksFile = '/root/code/multiworld/multiworld/envs/goals/PickPlace_60X30.pkl'

    tasks = pickle.load(open(tasksFile, 'rb'))
    envType = variant['envType']

    if envType == 'Push':
        baseEnv = SawyerPushEnv(tasks=tasks)
    else:
        assert envType == 'PickPlace'

        baseEnv = SawyerPickPlaceEnv(tasks=tasks)

    env = FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=['state_observation']))
    env = TfEnv(NormalizedBoxEnv(env))
    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        grad_step_size=fast_learning_rate,
        hidden_nonlinearity=tf.nn.relu,
        hidden_sizes=variant['hidden_sizes'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = MAMLTRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=fast_batch_size,  # number of trajs for grad update
        max_path_length=max_path_length,
        meta_batch_size=meta_batch_size,
        num_grad_updates=num_grad_updates,
        n_itr=1000,
        use_maml=True,
        step_size=meta_step_size,
        plot=False,
        numExpertPolicies=20,
        expertDataInfo={
            'expert_loc': expertDataLoc,
            'expert_itr': expertDataItr
        })

    algo.train()
Example #24
def experiment(variant, comet_exp_key=None):
    comet_logger = None
    if comet_exp_key is not None:
        # from rllab.misc.comet_logger import CometContinuedLogger, CometLogger
        # from comet_ml import Experiment, ExistingExperiment
        # comet_log = CometContinuedLogger(api_key="KWwx7zh6I2uw6oQMkpEo3smu0", previous_experiment_key=variant['comet_exp_key'])
        comet_logger = ExistingExperiment(
            api_key="KWwx7zh6I2uw6oQMkpEo3smu0",
            previous_experiment=variant['comet_exp_key'])
        # comet_log = CometLogger(api_key="KWwx7zh6I2uw6oQMkpEo3smu0",
        #                     project_name="ml4l3", workspace="glenb")
        comet_logger.set_name("test seq train")
        # comet_log = comet_exp_key
        print("RL!: ", comet_logger)
    print("%%%%%%%%%%%%%%%%%", comet_logger)
    seed = variant['seed']
    log_dir = variant['log_dir']
    n_parallel = variant['n_parallel']

    setup(seed, n_parallel, log_dir)

    init_file = variant['init_file']
    taskIndex = variant['taskIndex']
    n_itr = variant['n_itr']
    default_step = variant['default_step']
    policyType = variant['policyType']
    envType = variant['envType']

    tasksFile = path_to_multiworld + '/multiworld/envs/goals/' + variant[
        'tasksFile'] + '.pkl'
    tasks = pickle.load(open(tasksFile, 'rb'))

    max_path_length = variant['max_path_length']

    use_images = 'conv' in policyType
    print("$$$$$$$$$$$$$$$ RL-TASK: ", str(tasks[taskIndex]),
          " $$$$$$$$$$$$$$$")
    if 'MultiDomain' in envType:
        baseEnv = Sawyer_MultiDomainEnv(tasks=tasks,
                                        image=use_images,
                                        mpl=max_path_length)

    elif 'Push' in envType:
        baseEnv = SawyerPushEnv(tasks=tasks,
                                image=use_images,
                                mpl=max_path_length)

    elif 'PickPlace' in envType:
        baseEnv = SawyerPickPlaceEnv(tasks=tasks,
                                     image=use_images,
                                     mpl=max_path_length)

    elif 'Door' in envType:
        baseEnv = SawyerDoorOpenEnv(tasks=tasks,
                                    image=use_images,
                                    mpl=max_path_length)

    elif 'Ant' in envType:
        env = TfEnv(normalize(AntEnvRandGoalRing()))

    elif 'Coffee' in envType:
        baseEnv = SawyerCoffeeEnv(mpl=max_path_length)

    else:
        raise AssertionError('Unsupported envType: ' + envType)

    if envType in ['Push', 'PickPlace', 'Door']:
        if use_images:
            obs_keys = ['img_observation']
        else:
            obs_keys = ['state_observation']
        env = TfEnv(
            NormalizedBoxEnv(
                FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=obs_keys),
                            reset_mode='idx')))

    baseline = ZeroBaseline(env_spec=env.spec)
    # baseline = LinearFeatureBaseline(env_spec = env.spec)
    batch_size = variant['batch_size']

    if policyType == 'fullAda_Bias':

        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = vpg_fullADA(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            # noise_opt = True,
            default_step=default_step,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),

            # reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir,
            comet_logger=comet_logger,
            outer_iteration=variant['outer_iteration'])

    elif policyType == 'biasAda_Bias':

        algo = vpg_biasADA(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            # noise_opt = True,
            default_step=default_step,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            # reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir)

    elif policyType == 'basic':

        algo = vpg_basic(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,
            max_path_length=max_path_length,
            n_itr=n_itr,
            # step_size=10.0,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            reset_arg=taskIndex,
            optimizer=None,
            optimizer_args={
                'init_learning_rate': default_step,
                'tf_optimizer_args': {
                    'learning_rate': 0.5 * default_step
                },
                'tf_optimizer_cls': tf.train.GradientDescentOptimizer
            },
            log_dir=log_dir
            # extra_input="onehot_exploration", # added by RK 6/19
            # extra_input_dim=5, # added by RK 6/19
        )

    elif 'conv' in policyType:

        algo = vpg_conv(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            # noise_opt = True,
            default_step=default_step,
            # reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir)

    else:
        raise AssertionError(
            'policyType must be fullAda_Bias, biasAda_Bias, basic, or conv')

    algo.train()
Example #25
def experiment(variant):

    seed = variant['seed']
    n_parallel = 1
    log_dir = variant['log_dir']

    setup(seed, n_parallel, log_dir)

    fast_batch_size = variant['fbs']
    meta_batch_size = variant['mbs']
    adam_steps = variant['adam_steps']
    max_path_length = variant['max_path_length']

    dagger = variant['dagger']
    expert_policy_loc = variant['expert_policy_loc']

    ldim = variant['ldim']
    init_flr = variant['init_flr']
    policyType = variant['policyType']
    use_maesn = variant['use_maesn']
    EXPERT_TRAJ_LOCATION = variant['expertDataLoc']
    envType = variant['envType']

    tasksFile = path_to_multiworld + 'multiworld/envs/goals/' + variant[
        'tasksFile'] + '.pkl'

    all_tasks = pickle.load(open(tasksFile, 'rb'))
    assert meta_batch_size <= len(all_tasks)
    tasks = all_tasks[:meta_batch_size]

    use_images = 'conv' in policyType

    if 'Push' == envType:
        baseEnv = SawyerPushEnv(tasks=tasks,
                                image=use_images,
                                mpl=max_path_length)

    elif envType == 'sparsePush':
        baseEnv = SawyerPushEnv(tasks=tasks,
                                image=use_images,
                                mpl=max_path_length,
                                rewMode='l2Sparse')

    elif 'PickPlace' in envType:
        baseEnv = SawyerPickPlaceEnv(tasks=tasks,
                                     image=use_images,
                                     mpl=max_path_length)

    elif 'Door' in envType:
        baseEnv = SawyerDoorOpenEnv(tasks=tasks,
                                    image=use_images,
                                    mpl=max_path_length)

    elif 'Ant' in envType:
        env = TfEnv(normalize(AntEnvRandGoalRing()))

    elif 'claw' in envType:
        env = TfEnv(DClawScrewRandGoal())

    else:
        assert False, 'Unsupported envType: ' + envType

    if envType in ['Push', 'PickPlace', 'Door']:
        if use_images:
            obs_keys = ['img_observation']
        else:
            obs_keys = ['state_observation']
        env = TfEnv(
            NormalizedBoxEnv(
                FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=obs_keys),
                            reset_mode='idx')))

    algoClass = MAMLIL
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    load_policy = variant['load_policy']

    if load_policy != None:
        policy = None
        load_policy = variant['load_policy']
        # if 'conv' in load_policy:
        #     baseline = ZeroBaseline(env_spec=env.spec)

    elif 'fullAda_Bias' in policyType:

        policy = fullAda_Bias_policy(name="policy",
                                     env_spec=env.spec,
                                     grad_step_size=init_flr,
                                     hidden_nonlinearity=tf.nn.relu,
                                     hidden_sizes=(100, 100),
                                     init_flr_full=init_flr,
                                     latent_dim=ldim)

    elif 'biasAda_Bias' in policyType:

        policy = biasAda_Bias_policy(name="policy",
                                     env_spec=env.spec,
                                     grad_step_size=init_flr,
                                     hidden_nonlinearity=tf.nn.relu,
                                     hidden_sizes=(100, 100),
                                     init_flr_full=init_flr,
                                     latent_dim=ldim)

    elif 'basic' in policyType:
        policy = basic_policy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        )

    elif 'conv' in policyType:

        baseline = ZeroBaseline(env_spec=env.spec)

        policy = conv_policy(
            name="policy",
            latent_dim=ldim,
            policyType=policyType,
            env_spec=env.spec,
            init_flr=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        )

    algo = algoClass(
        env=env,
        policy=policy,
        load_policy=load_policy,
        baseline=baseline,
        batch_size=fast_batch_size,  # number of trajs for alpha grad update
        max_path_length=max_path_length,
        meta_batch_size=meta_batch_size,  # number of tasks sampled for beta grad update
        num_grad_updates=num_grad_updates,  # number of alpha grad updates
        n_itr=1,  #100
        make_video=False,
        use_maml=True,
        use_pooled_goals=True,
        use_corr_term=use_corr_term,
        test_on_training_goals=test_on_training_goals,
        metalearn_baseline=False,
        # metalearn_baseline=False,
        limit_demos_num=limit_demos_num,
        test_goals_mult=1,
        step_size=meta_step_size,
        plot=False,
        beta_steps=beta_steps,
        adam_curve=None,
        adam_steps=adam_steps,
        pre_std_modifier=pre_std_modifier,
        l2loss_std_mult=l2loss_std_mult,
        importance_sampling_modifier=MOD_FUNC[''],
        post_std_modifier=post_std_modifier,
        expert_trajs_dir=EXPERT_TRAJ_LOCATION,
        expert_trajs_suffix='',
        seed=seed,
        extra_input=extra_input,
        extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        plotDirPrefix=None,
        latent_dim=ldim,
        dagger=dagger,
        expert_policy_loc=expert_policy_loc)

    algo.train()
Example #26
num_imSteps = 50
use_maml = True

ratio = '_5_1'
expertDataLoc = '/home/russellm/mri_onPolicy/expertPolicyWeights/TRPO-push-20X20-v1/'
expertDataItr = 300

for meta_batch_size in meta_batch_sizes:
    for fast_learning_rate in fast_learning_rates:
        for fast_batch_size in fast_batch_sizes:

            stub(globals())

            baseEnv = SawyerPushEnv(tasks=None)
            env = FinnMamlEnv(
                FlatGoalEnv(baseEnv, obs_keys=['state_observation']))
            env = TfEnv(NormalizedBoxEnv(env))

            policy = MAMLGaussianMLPPolicy(
                name="policy",
                env_spec=env.spec,
                grad_step_size=fast_learning_rate,
                hidden_nonlinearity=tf.nn.relu,
                hidden_sizes=(100, 100),
            )

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            algo = MAMLTRPO(
                env=env,
                policy=policy,
Example #27
 def action_space(self):
     return FlatGoalEnv.action_space(self)
Example #28
    parser.add_argument('--use_tensorboard', action='store_true')
    parser.add_argument('--logdir', type=str, default='./logs/ddpg_test')
    parser.add_argument('--exp_name', type=str, default='evaluate')
    parser.add_argument('--env', type=str, default='SawyerReachXYEnv-v1')
    args = parser.parse_args()

    _, get_action = load_policy(args.saved_model,
                                args.itr if args.itr >= 0 else 'last',
                                args.deterministic)
    tensor_board = None
    env = SawyerReachXYZEnv(
            action_mode='position',
            position_action_scale=0.1,
            config_name='austri_config',
            reset_free=False,
            max_speed=0.05,
            fix_goal=False,
            fixed_goal=(0.53,0.0,0.15)
        )

    env = FlatGoalEnv(env, append_goal_to_obs=True)
    env.reset()
    logdir_ext = os.path.join(args.logdir + '_' + args.env + '_evaluate')
    if not os.path.exists(logdir_ext):
        os.mkdir(logdir_ext)

    if args.use_tensorboard:
        tensor_board = SummaryWriter(logdir_ext)

    logger_kwargs = setup_logger_kwargs(exp_name=args.exp_name, data_dir=logdir_ext)
    run_policy(env, get_action, args.len, args.episodes, args.render, tensor_board)