def __init__(self, *args, **kwargs):
    self.quick_init(locals())
    sawyer_env = SawyerEnv(*args, **kwargs)
    FlatGoalEnv.__init__(self, sawyer_env,
                         obs_keys=['state_observation'],
                         goal_keys=['state_desired_goal'])
def __init__(self, *args, **kwargs):
    self.quick_init(locals())
    sawyer_env = SawyerEnv(
        obj_low=(-0.0, 0.5, 0.02),
        obj_high=(0.0, 0.5, 0.02),
        goal_low=(-0.2, 0.6, 0.02),
        goal_high=(0.2, 0.8, 0.02),
        rew_mode='posPlace',
        *args, **kwargs)
    FlatGoalEnv.__init__(self, sawyer_env,
                         obs_keys=['state_observation'],
                         goal_keys=['state_desired_goal'])
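Across these examples, FlatGoalEnv wraps a multiworld goal-conditioned environment and exposes a flat Box observation built from the selected obs_keys, optionally with the desired goal appended. The following is a minimal standalone sketch of that interface, assuming multiworld's Point2DEnv (used in the pygame example further down) and inferring the reset/step behavior from how the wrapper is used in these snippets:

# Minimal usage sketch (assumptions: Point2DEnv default constructor and the
# reset/step behavior of FlatGoalEnv as used throughout these examples).
from multiworld.core.flat_goal_env import FlatGoalEnv
from multiworld.envs.pygame.point2d import Point2DEnv

base_env = Point2DEnv()
env = FlatGoalEnv(base_env,
                  obs_keys=['state_observation'],
                  goal_keys=['state_desired_goal'],
                  append_goal_to_obs=True)   # goal is concatenated onto the obs vector
obs = env.reset()                            # flat numpy array, not a dict
obs, reward, done, info = env.step(env.action_space.sample())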
def build_env(env_id):
    assert env_id != "", "Unspecified environment."
    env = gym.make(env_id)
    if env_id == "SawyerPushAndReachEnvEasy-v0":
        env = FlatGoalEnv(ImageEnv(env, transpose=True),
                          obs_keys=['image_observation'],
                          append_goal_to_obs=True)
        env._max_episode_steps = 50
    return env
def run_sac(base_expl_env, base_eval_env, variant):
    expl_env = FlatGoalEnv(base_expl_env, append_goal_to_obs=True)
    eval_env = FlatGoalEnv(base_eval_env, append_goal_to_obs=True)
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant["layer_size"]
    num_hidden = variant["num_hidden_layers"]
    qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1,
                     hidden_sizes=[M] * num_hidden)
    qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1,
                     hidden_sizes=[M] * num_hidden)
    target_qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1,
                            hidden_sizes=[M] * num_hidden)
    target_qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1,
                            hidden_sizes=[M] * num_hidden)
    policy = TanhGaussianPolicy(obs_dim=obs_dim, action_dim=action_dim,
                                hidden_sizes=[M] * num_hidden)
    eval_policy = MakeDeterministic(policy)

    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, policy)
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_env)
    trainer = SACTrainer(env=eval_env, policy=policy,
                         qf1=qf1, qf2=qf2,
                         target_qf1=target_qf1, target_qf2=target_qf2,
                         **variant["trainer_kwargs"])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"])
    algorithm.train()
def train_distilled_policy(num_tasks, policies=None, epochs_per_task=500,
                           batch_size=100, lr=1e-3):
    """
    Trains a distilled policy (using an optimal expert or a trained expert).
    Saves the policy in a .pkl file along with the env and the loss history.

    :param num_tasks: Number of tasks/policies to distill.
    :param policies: A list of length `num_tasks` containing all the individual experts.
    :param epochs_per_task: Number of training epochs per task.
    :param batch_size: Batch sample size per update step.
    :param lr: Learning rate of the optimizer.
    :return: The trained policy and the environment.
    """
    base_env = PointMassEnv(n=num_tasks)
    env = FlatGoalEnv(base_env, append_goal_to_obs=True)
    obs_dim = env.observation_space.low.size
    act_dim = env.action_space.low.size
    policy = TanhMlpPolicy(input_size=obs_dim, output_size=act_dim,
                           hidden_sizes=[64, 64]
                           # hidden_sizes=[64, 64, 64]
                           )
    loss_history = []
    criterion = nn.MSELoss()
    optim = Adam(policy.parameters(), lr=lr)

    for epoch in range(epochs_per_task * num_tasks):
        if policies:
            assert len(policies) == num_tasks, "Number of expert policies needs " \
                                               "to be equal to the number of tasks"
        obs, act_labels = get_batch(env, batch_size, policies)
        obs_var = Variable(torch.from_numpy(obs)).float()
        act_labels_var = Variable(torch.from_numpy(act_labels)).float()
        acts = policy(obs_var)

        optim.zero_grad()
        loss = criterion(acts, act_labels_var)
        loss.backward()
        optim.step()

        loss_val = loss.data.item()
        loss_history.append(loss_val)
        if epoch % 50 == 0:
            print("epoch: {0} \t loss: {1}".format(epoch, loss_val))

    print("FINAL loss: {0}".format(loss.data.item()))

    out = dict(policy=policy, env=env, loss_history=loss_history)
    appended_path = "-from_expert_policies" if policies else ""
    path = "./logs/policy-distillation/model-{0}{1}.pkl".format(num_tasks, appended_path)
    with open(path, "wb") as f:
        pickle.dump(out, f, protocol=pickle.HIGHEST_PROTOCOL)

    return policy, env
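A hypothetical usage sketch for the function above: train a distilled policy and roll it out in the returned flat-goal environment. The get_action interface on the distilled TanhMlpPolicy, the 100-step horizon, and the existence of the ./logs/policy-distillation/ output directory are assumptions for illustration, not part of the original snippet.

# Hypothetical usage sketch; policy.get_action() and the horizon are assumptions.
policy, env = train_distilled_policy(num_tasks=4, epochs_per_task=500)

obs = env.reset()
for _ in range(100):
    action, _ = policy.get_action(obs)          # deterministic tanh-MLP action
    obs, reward, done, info = env.step(action)
    if done:
        obs = env.reset()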
def flatten_multiworld_env(env):
    from multiworld.core.flat_goal_env import FlatGoalEnv
    flat_env = FlatGoalEnv(env,
                           obs_keys=['image_observation'],
                           goal_keys=['image_desired_goal'],
                           append_goal_to_obs=True)
    env = GymAdapter(env=flat_env)
    return env
def __init__(self, use_hand_cam=False):
    self.use_hand_cam = use_hand_cam
    # Transformation matrix from camera's frame -> based frame
    # self.TRANSFORMATION_MATRIX = np.array([[0.11491126, 0.88002959, -0.46080724, 1.0704251219017176],
    #                                        [0.99326509, -0.0948642, 0.06652247, 0.02981537521689703],
    #                                        [0.01482763, -0.46534793, -0.88500364, 0.6268248987975156],
    #                                        [0., 0., 0., 1.]])
    self.TRANSFORMATION_MATRIX = np.array(
        [[-0.15316623, 0.86485568, -0.47808446, 1.06231099],
         [0.97058596, 0.22259649, 0.09172615, -0.08591922],
         [0.18574981, -0.44997272, -0.87351105, 0.62519807],
         [0., 0., 0., 1.]])
    self.angle_defaul_cam = [
        -0.9347021484375, -0.066611328125, -2.09948828125, -2.4536884765625,
        -1.90233984375, -2.909759765625, -2.622689453125
    ]
    self.angle_init_for_grasp = [
        0.51219827, -0.35472363, -0.69057131, 1.43175006,
        -2.19978213, -0.83249319, -1.90052831
    ]
    self.angle_for_place_object = [
        -0.34514549, 0.24693164, -1.2170068, 1.22242475,
        1.65923345, 1.15603614, 0.06596191
    ]
    self.msg_close = True
    self.msg_open = False

    env = SawyerReachXYZEnv(
        action_mode='position',
        position_action_scale=0.1,
        config_name='austri_config',
        reset_free=False,
        max_speed=0.05,
        fix_goal=True,
    )
    self.env = FlatGoalEnv(env, append_goal_to_obs=True)

    os.system('clear')
    print('[AIM-INFO] Initializing robotic grasping...')
    for _ in range(5):
        self.move_to_angle(angle=self.angle_init_for_grasp, duration=2)
    print('[AIM-INFO] Initialize done.')
def experiment(variant):
    env = Point2DEnv(**variant['env_kwargs'])
    env = FlatGoalEnv(env)
    env = NormalizedBoxEnv(env)
    action_dim = int(np.prod(env.action_space.shape))
    obs_dim = int(np.prod(env.observation_space.shape))

    qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs'])
    qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs'])
    target_qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs'])
    target_qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs'])
    policy = TanhGaussianPolicy(obs_dim=obs_dim, action_dim=action_dim, **variant['policy_kwargs'])

    eval_env = expl_env = env
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = TwinSACTrainer(env=eval_env, policy=policy,
                             qf1=qf1, qf2=qf2,
                             target_qf1=target_qf1, target_qf2=target_qf2,
                             **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        data_buffer=replay_buffer,
        **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def create_image_48_sawyer_push_forward_v0():
    from multiworld.core.flat_goal_env import FlatGoalEnv
    from multiworld.core.image_env import ImageEnv
    from multiworld.envs.mujoco.cameras import sawyer_pusher_camera_upright_v2

    image_env = ImageEnv(
        wrapped_env=gym.make('BaseSawyerPushForwardEnv-v0'),
        imsize=48,
        init_camera=sawyer_pusher_camera_upright_v2,
        normalize=True,
    )
    return FlatGoalEnv(image_env, obs_keys=['image_observation'])
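A short usage sketch for image-env factories like the one above. It assumes the 'BaseSawyerPushForwardEnv-v0' id has already been registered with gym (multiworld performs this registration in its own module), which is outside the snippet itself:

# Usage sketch; assumes multiworld's env registration has already run so that
# gym.make('BaseSawyerPushForwardEnv-v0') resolves.
env = create_image_48_sawyer_push_forward_v0()
obs = env.reset()                      # flat 48 * 48 * 3 image vector
print(obs.shape, env.action_space)
obs, reward, done, info = env.step(env.action_space.sample())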
def create_image_48_sawyer_pick_and_place_v0():
    from multiworld.core.flat_goal_env import FlatGoalEnv
    from multiworld.core.image_env import ImageEnv
    from multiworld.envs.mujoco.cameras import sawyer_pick_and_place_camera_zoomed

    wrapped_env = gym.make('BaseSawyerPickAndPlaceEnv-v0')
    state_desired_goal = wrapped_env.fixed_goal
    goal_dim = len(state_desired_goal)
    imsize = 48
    image_env = ImageEnv(
        wrapped_env=wrapped_env,
        imsize=imsize,
        init_camera=sawyer_pick_and_place_camera_zoomed,
        normalize=True,
        presampled_goals={
            'state_desired_goal': state_desired_goal.reshape(1, goal_dim),
            'image_desired_goal': np.zeros((1, imsize * imsize * 3)),
        },
    )
    return FlatGoalEnv(image_env, obs_keys=['image_observation'])
def run_task(*_):
    env = FlatGoalEnv(SawyerPickEnv(), obs_keys=["state_observation"])
    env = TfEnv(normalize(env))
    policy = GaussianMLPPolicy(name="policy", env_spec=env.spec, hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=4000,
                max_path_length=500,
                n_itr=500,
                discount=0.99,
                step_size=0.01,
                plot=True)
    algo.train()
def run_task(*_):
    with LocalRunner() as runner:
        env = FlatGoalEnv(SawyerReachXYZEnv(), obs_keys=["state_observation"])
        env = TfEnv(normalize(env))
        policy = GaussianMLPPolicy(name="policy", env_spec=env.spec, hidden_sizes=(32, 32))
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = TRPO(env=env,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=500,
                    discount=0.99,
                    max_kl_step=0.01)
        runner.setup(algo, env)
        runner.train(n_epochs=500, batch_size=4000, plot=True)
def create_image_48_sawyer_door_pull_hook_v0():
    from multiworld.core.flat_goal_env import FlatGoalEnv
    from multiworld.core.image_env import ImageEnv
    from multiworld.envs.mujoco.cameras import sawyer_door_env_camera_v0
    import numpy as np

    wrapped_env = gym.make('BaseSawyerDoorHookEnv-v0')
    imsize = 48
    imsize_flat = imsize * imsize * 3
    image_env = ImageEnv(
        wrapped_env=wrapped_env,
        imsize=imsize,
        init_camera=sawyer_door_env_camera_v0,
        normalize=True,
        presampled_goals={
            'state_desired_goal': np.expand_dims(wrapped_env.fixed_goal, axis=0),
            'image_desired_goal': np.zeros((1, imsize_flat)),
        },
        non_presampled_goal_img_is_garbage=True,
    )
    return FlatGoalEnv(image_env, obs_keys=['image_observation'])
def __init__(self, steps_needed_to_solve, planning_horizon, task_horizon_factor=2):
    env = gym.make("PointmassUWallTrainEnvBig-v1")
    env.action_scale = self.PATH_LENGTH_TO_SOLVE / steps_needed_to_solve
    env = FlatGoalEnv(env, append_goal_to_obs=True)

    PointmassUWallConfigModule.TASK_HORIZON = int(task_horizon_factor * steps_needed_to_solve)
    PointmassUWallConfigModule.PLAN_HOR = planning_horizon
    PointmassUWallConfigModule.NROLLOUTS_PER_ITER = math.ceil(
        PointmassUWallConfigModule.NUM_STEPS_TOTAL /
        (PointmassUWallConfigModule.TASK_HORIZON * PointmassUWallConfigModule.NTRAIN_ITERS))

    print('-------------')
    print("task horizon", PointmassUWallConfigModule.TASK_HORIZON)
    print("plan horizon", PointmassUWallConfigModule.PLAN_HOR)
    print("nrolls per iter", PointmassUWallConfigModule.NROLLOUTS_PER_ITER)
    print("action_scale", env.wrapped_env.action_scale)
    print('-------------')

    self.ENV = env
    cfg = tf.ConfigProto()
    cfg.gpu_options.allow_growth = True
    self.SESS = tf.Session(config=cfg)
    self.NN_TRAIN_CFG = {"epochs": 2}
    self.OPT_CFG = {
        "Random": {
            "popsize": 10
        },
        "CEM": {
            "popsize": 5,
            "num_elites": 2,
            "max_iters": 2,
            "alpha": 0.1,
        }
    }
    self.UPDATE_FNS = []
def experiment(variant, comet_exp_key=None):
    if comet_exp_key is not None:
        from rllab.misc.comet_logger import CometContinuedLogger, CometLogger
        from comet_ml import Experiment, ExistingExperiment
        # comet_log = CometContinuedLogger(api_key="KWwx7zh6I2uw6oQMkpEo3smu0",
        #                                  previous_experiment_key=variant['comet_exp_key'])
        comet_log = ExistingExperiment(api_key="KWwx7zh6I2uw6oQMkpEo3smu0",
                                       previous_experiment=variant['comet_exp_key'])
        # comet_log = CometLogger(api_key="KWwx7zh6I2uw6oQMkpEo3smu0",
        #                         project_name="ml4l3", workspace="glenb")
        comet_log.set_name("test seq train")
        # comet_log = comet_exp_key
        print(comet_log)
    else:
        comet_log = None

    print("loading libraries")
    from sandbox.rocky.tf.algos.maml_il import MAMLIL
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from rllab.baselines.gaussian_mlp_baseline import GaussianMLPBaseline
    from rllab.baselines.maml_gaussian_mlp_baseline import MAMLGaussianMLPBaseline
    from rllab.baselines.zero_baseline import ZeroBaseline
    from rllab.envs.normalized_env import normalize
    from rllab.misc.instrument import stub, run_experiment_lite
    from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy import MAMLGaussianMLPPolicy as basic_policy
    # from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_adaptivestep import MAMLGaussianMLPPolicy as fullAda_basic_policy
    from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_adaptivestep_ppo import \
        MAMLGaussianMLPPolicy as PPO_policy
    from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_adaptivestep_biastransform import \
        MAMLGaussianMLPPolicy as fullAda_Bias_policy
    from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_biasonlyadaptivestep_biastransform import \
        MAMLGaussianMLPPolicy as biasAda_Bias_policy
    from sandbox.rocky.tf.policies.maml_minimal_conv_gauss_mlp_policy import MAMLGaussianMLPPolicy as conv_policy
    from sandbox.rocky.tf.optimizers.quad_dist_expert_optimizer import QuadDistExpertOptimizer
    from sandbox.rocky.tf.optimizers.first_order_optimizer import FirstOrderOptimizer
    from sandbox.rocky.tf.envs.base import TfEnv
    import sandbox.rocky.tf.core.layers as L
    from rllab.envs.mujoco.ant_env_rand_goal_ring import AntEnvRandGoalRing
    from multiworld.envs.mujoco.sawyer_xyz.push.sawyer_push import SawyerPushEnv
    from multiworld.envs.mujoco.sawyer_xyz.pickPlace.sawyer_pick_and_place import SawyerPickPlaceEnv
    from multiworld.envs.mujoco.sawyer_xyz.door.sawyer_door_open import SawyerDoorOpenEnv
    from multiworld.core.flat_goal_env import FlatGoalEnv
    from multiworld.core.finn_maml_env import FinnMamlEnv
    from multiworld.core.wrapper_env import NormalizedBoxEnv
    import tensorflow as tf
    import time
    from rllab.envs.gym_env import GymEnv
    from maml_examples.maml_experiment_vars import MOD_FUNC
    import numpy as np
    import random as rd
    import pickle
    print("Done loading libraries")

    seed = variant['seed']
    n_parallel = 1
    log_dir = variant['log_dir']
    setup(seed, n_parallel, log_dir)

    fast_batch_size = variant['fbs']
    meta_batch_size = variant['mbs']
    adam_steps = variant['adam_steps']
    max_path_length = variant['max_path_length']
    dagger = variant['dagger']
    expert_policy_loc = variant['expert_policy_loc']
    ldim = variant['ldim']
    init_flr = variant['init_flr']
    policyType = variant['policyType']
    use_maesn = variant['use_maesn']
    EXPERT_TRAJ_LOCATION = variant['expertDataLoc']
    envType = variant['envType']

    tasksFile = path_to_multiworld + 'multiworld/envs/goals/' + variant['tasksFile'] + '.pkl'
    all_tasks = pickle.load(open(tasksFile, 'rb'))
    assert meta_batch_size <= len(all_tasks), \
        "meta batch size wrong: " + str(meta_batch_size) + " <= " + str(len(all_tasks))
    tasks = all_tasks[:meta_batch_size]
    print("^^^^^^^^^^^^^^^^ meta_tasks: ", tasks, " ^^^^^^^^^^^^^^^^ ")

    use_images = 'conv' in policyType
    if 'Push' == envType:
        baseEnv = SawyerPushEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif envType == 'sparsePush':
        baseEnv = SawyerPushEnv(tasks=tasks, image=use_images, mpl=max_path_length, rewMode='l2Sparse')
    elif 'PickPlace' in envType:
        baseEnv = SawyerPickPlaceEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'Door' in envType:
        baseEnv = SawyerDoorOpenEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'Ant' in envType:
        env = TfEnv(normalize(AntEnvRandGoalRing()))
    elif 'claw' in envType:
        env = TfEnv(DClawScrewRandGoal())
    else:
        assert False, "Unsupported envType: " + str(envType)

    if envType in ['Push', 'PickPlace', 'Door']:
        obs_keys = ['img_observation'] if use_images else ['state_observation']
        env = TfEnv(NormalizedBoxEnv(FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=obs_keys),
                                                 reset_mode='idx')))

    algoClass = MAMLIL
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    load_policy = variant['load_policy']
    if load_policy is not None:
        policy = None
        load_policy = variant['load_policy']
        # if 'conv' in load_policy:
        #     baseline = ZeroBaseline(env_spec=env.spec)
    elif 'fullAda_PPO' in policyType:
        policy = PPO_policy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            init_flr_full=init_flr,
            latent_dim=ldim,
        )
    elif 'fullAda_Bias' in policyType:
        policy = fullAda_Bias_policy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            init_flr_full=init_flr,
            latent_dim=ldim,
        )
    elif 'biasAda_Bias' in policyType:
        policy = biasAda_Bias_policy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            init_flr_full=init_flr,
            latent_dim=ldim,
        )
    elif 'basic' in policyType:
        policy = basic_policy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        )
    elif 'conv' in policyType:
        baseline = ZeroBaseline(env_spec=env.spec)
        policy = conv_policy(
            name="policy",
            latent_dim=ldim,
            policyType=policyType,
            env_spec=env.spec,
            init_flr=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        )

    print("|||||||||||||||||||||||||||||||||||||||||||||||", variant['n_itr'])

    beta_steps = 1
    meta_step_size = 0.01
    num_grad_updates = 1
    pre_std_modifier = 1.0
    post_std_modifier = 0.00001
    limit_demos_num = None

    algo = algoClass(
        env=env,
        policy=policy,
        load_policy=load_policy,
        baseline=baseline,
        batch_size=fast_batch_size,  # number of trajs for alpha grad update
        max_path_length=max_path_length,
        meta_batch_size=meta_batch_size,  # number of tasks sampled for beta grad update
        num_grad_updates=num_grad_updates,  # number of alpha grad updates
        n_itr=variant['n_itr'],
        make_video=False,
        use_maml=True,
        use_pooled_goals=True,
        use_corr_term=use_corr_term,
        test_on_training_goals=test_on_training_goals,
        metalearn_baseline=False,
        # metalearn_baseline=False,
        limit_demos_num=limit_demos_num,
        test_goals_mult=1,
        step_size=meta_step_size,
        plot=False,
        beta_steps=beta_steps,
        adam_curve=None,
        adam_steps=adam_steps,
        pre_std_modifier=pre_std_modifier,
        l2loss_std_mult=l2loss_std_mult,
        importance_sampling_modifier=MOD_FUNC[''],
        post_std_modifier=post_std_modifier,
        expert_trajs_dir=EXPERT_TRAJ_LOCATION,
        expert_trajs_suffix='',
        seed=seed,
        extra_input=extra_input,
        extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        plotDirPrefix=None,
        latent_dim=ldim,
        dagger=dagger,
        expert_policy_loc=expert_policy_loc,
        comet_logger=comet_log,
        outerIteration=variant['outer_Iteration'],
        use_ppo=True,
    )
    algo.train()
def __init__(self):
    super().__init__()
    env = gym.make('Point2DFixedGoalEnv-v0')
    env = FlatGoalEnv(env, append_goal_to_obs=False)
    self.ENV = env
from rllab.envs.normalized_env import normalize
from rllab.misc.instrument import stub, run_experiment_lite
from multiworld.envs.mujoco.sawyer_xyz.push.sawyer_push import SawyerPushEnv
from sandbox.rocky.tf.envs.base import TfEnv
from multiworld.core.flat_goal_env import FlatGoalEnv
from multiworld.core.finn_maml_env import FinnMamlEnv
from multiworld.core.wrapper_env import NormalizedBoxEnv

stub(globals())

rate = 0.01
mode = 'local'

import tensorflow as tf

for goal in range(1, 100):
    baseEnv = FlatGoalEnv(SawyerPushEnv(tasks=None), obs_keys=['state_observation'])
    env = TfEnv(NormalizedBoxEnv(FinnMamlEnv(baseEnv, reset_mode='task')))
    # env = WheeledEnvGoal()
    env = TfEnv(env)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_nonlinearity=tf.nn.relu,
                               hidden_sizes=(100, 100))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=20000,
def encoder_wrapped_env(variant):
    representation_size = 128
    output_classes = 20

    model_class = variant.get('model_class', TimestepPredictionModel)
    model = model_class(
        representation_size,
        # decoder_output_activation=decoder_activation,
        output_classes=output_classes,
        **variant['model_kwargs'],
    )
    # model = torch.nn.DataParallel(model)

    model_path = variant.get("model_path")
    # model = load_local_or_remote_file(model_path)
    state_dict = torch.load(model_path)
    model.load_state_dict(state_dict)
    model.to(ptu.device)
    model.eval()

    traj = np.load(variant.get("desired_trajectory"), allow_pickle=True)[0]

    goal_image = traj["observations"][-1]["image_observation"]
    goal_image = goal_image.reshape(1, 3, 500, 300).transpose([0, 1, 3, 2]) / 255.0
    # goal_image = goal_image.reshape(1, 300, 500, 3).transpose([0, 3, 1, 2]) / 255.0  # BECAUSE RLBENCH DEMOS ARENT IMAGE_ENV WRAPPED
    # goal_image = goal_image[:, :, :240, 60:500]
    goal_image = goal_image[:, :, 60:, 60:500]
    goal_image_pt = ptu.from_numpy(goal_image)
    save_image(goal_image_pt.data.cpu(), 'gitignore/goal.png', nrow=1)
    goal_latent = model.encode(goal_image_pt).detach().cpu().numpy().flatten()

    initial_image = traj["observations"][0]["image_observation"]
    initial_image = initial_image.reshape(1, 3, 500, 300).transpose([0, 1, 3, 2]) / 255.0
    # initial_image = initial_image.reshape(1, 300, 500, 3).transpose([0, 3, 1, 2]) / 255.0
    # initial_image = initial_image[:, :, :240, 60:500]
    initial_image = initial_image[:, :, 60:, 60:500]
    initial_image_pt = ptu.from_numpy(initial_image)
    save_image(initial_image_pt.data.cpu(), 'gitignore/initial.png', nrow=1)
    initial_latent = model.encode(initial_image_pt).detach().cpu().numpy().flatten()

    # Move these to td3_bc and bc_v3 (or at least type for reward_params)
    reward_params = dict(
        goal_latent=goal_latent,
        initial_latent=initial_latent,
        type=variant["reward_params_type"],
    )

    config_params = variant.get("config_params")

    env = variant['env_class'](**variant['env_kwargs'])
    env = ImageEnv(env,
                   recompute_reward=False,
                   transpose=True,
                   image_length=450000,
                   reward_type="image_distance",
                   # init_camera=sawyer_pusher_camera_upright_v2,
                   )
    env = EncoderWrappedEnv(
        env,
        model,
        reward_params,
        config_params,
        **variant.get("encoder_wrapped_env_kwargs", dict())
    )
    env = FlatGoalEnv(env, obs_keys=["state_observation", ])
    return env
def _pointmass_fixed_goal_experiment(vae_latent_size, replay_buffer_size,
                                     cnn_kwargs, vae_kwargs, policy_kwargs,
                                     qf_kwargs, e2e_trainer_kwargs,
                                     sac_trainer_kwargs, algorithm_kwargs,
                                     eval_path_collector_kwargs=None,
                                     expl_path_collector_kwargs=None,
                                     **kwargs):
    if expl_path_collector_kwargs is None:
        expl_path_collector_kwargs = {}
    if eval_path_collector_kwargs is None:
        eval_path_collector_kwargs = {}

    from multiworld.core.image_env import ImageEnv
    from multiworld.envs.pygame.point2d import Point2DEnv
    from multiworld.core.flat_goal_env import FlatGoalEnv

    env = Point2DEnv(
        images_are_rgb=True,
        render_onscreen=False,
        show_goal=False,
        ball_radius=2,
        render_size=48,
        fixed_goal=(0, 0),
    )
    env = ImageEnv(env, imsize=env.render_size, transpose=True, normalize=True)
    env = FlatGoalEnv(env)  # , append_goal_to_obs=True)

    input_width, input_height = env.image_shape
    action_dim = int(np.prod(env.action_space.shape))

    vae = ConvVAE(
        representation_size=vae_latent_size,
        input_channels=3,
        imsize=input_width,
        decoder_output_activation=nn.Sigmoid(),
        # decoder_distribution='gaussian_identity_variance',
        **vae_kwargs)
    encoder = Vae2Encoder(vae)

    def make_cnn():
        return networks.CNN(input_width=input_width,
                            input_height=input_height,
                            input_channels=3,
                            output_conv_channels=True,
                            output_size=None,
                            **cnn_kwargs)

    def make_qf():
        return networks.MlpQfWithObsProcessor(
            obs_processor=nn.Sequential(encoder, networks.Flatten()),
            output_size=1,
            input_size=action_dim + vae_latent_size,
            **qf_kwargs)

    qf1 = make_qf()
    qf2 = make_qf()
    target_qf1 = make_qf()
    target_qf2 = make_qf()

    action_dim = int(np.prod(env.action_space.shape))
    policy_cnn = make_cnn()
    policy = TanhGaussianPolicyAdapter(
        nn.Sequential(policy_cnn, networks.Flatten()),
        policy_cnn.conv_output_flat_size,
        action_dim,
        **policy_kwargs)

    eval_env = expl_env = env
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(eval_env, eval_policy,
                                           **eval_path_collector_kwargs)
    replay_buffer = EnvReplayBuffer(replay_buffer_size, expl_env)
    vae_trainer = VAETrainer(vae)
    sac_trainer = SACTrainer(env=eval_env,
                             policy=policy,
                             qf1=qf1, qf2=qf2,
                             target_qf1=target_qf1, target_qf2=target_qf2,
                             **sac_trainer_kwargs)
    trainer = End2EndSACTrainer(
        sac_trainer=sac_trainer,
        vae_trainer=vae_trainer,
        **e2e_trainer_kwargs,
    )
    expl_path_collector = MdpPathCollector(expl_env, policy,
                                           **expl_path_collector_kwargs)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **algorithm_kwargs)
    algorithm.to(ptu.device)
    algorithm.train()
class ObjectGrasping:
    """
    This class provides the utilities to grasp an object, including:
        - Find the object's position based on the /ar_pose_marker service
        - Move joints to specific angles
        - Move the end-effector to a specific position using a policy learned by an RL algorithm
    """

    def __init__(self, use_hand_cam=False):
        self.use_hand_cam = use_hand_cam
        # Transformation matrix from camera's frame -> based frame
        # self.TRANSFORMATION_MATRIX = np.array([[0.11491126, 0.88002959, -0.46080724, 1.0704251219017176],
        #                                        [0.99326509, -0.0948642, 0.06652247, 0.02981537521689703],
        #                                        [0.01482763, -0.46534793, -0.88500364, 0.6268248987975156],
        #                                        [0., 0., 0., 1.]])
        self.TRANSFORMATION_MATRIX = np.array(
            [[-0.15316623, 0.86485568, -0.47808446, 1.06231099],
             [0.97058596, 0.22259649, 0.09172615, -0.08591922],
             [0.18574981, -0.44997272, -0.87351105, 0.62519807],
             [0., 0., 0., 1.]])
        self.angle_defaul_cam = [
            -0.9347021484375, -0.066611328125, -2.09948828125, -2.4536884765625,
            -1.90233984375, -2.909759765625, -2.622689453125
        ]
        self.angle_init_for_grasp = [
            0.51219827, -0.35472363, -0.69057131, 1.43175006,
            -2.19978213, -0.83249319, -1.90052831
        ]
        self.angle_for_place_object = [
            -0.34514549, 0.24693164, -1.2170068, 1.22242475,
            1.65923345, 1.15603614, 0.06596191
        ]
        self.msg_close = True
        self.msg_open = False

        env = SawyerReachXYZEnv(
            action_mode='position',
            position_action_scale=0.1,
            config_name='austri_config',
            reset_free=False,
            max_speed=0.05,
            fix_goal=True,
        )
        self.env = FlatGoalEnv(env, append_goal_to_obs=True)

        os.system('clear')
        print('[AIM-INFO] Initializing robotic grasping...')
        for _ in range(5):
            self.move_to_angle(angle=self.angle_init_for_grasp, duration=2)
        print('[AIM-INFO] Initialize done.')

    def go_to_camera_view_position(self):
        duration = 2
        self.move_to_angle(self.angle_defaul_cam, duration)

    def go_to_place_position(self):
        duration = 7
        self.move_to_angle(self.angle_for_place_object, duration)

    def move_to_angle(self, angle, duration):
        rospy.wait_for_service('angle_action')
        try:
            execute_action = rospy.ServiceProxy('angle_action', angle_action, persistent=True)
            execute_action(angle, duration)
            return None
        except rospy.ServiceException as e:
            print('[AIM-ERROR] Error when moving to angle: ', angle)

    def locate_object(self):
        service_name = "/locate_object"
        service = rospy.ServiceProxy(service_name, target)
        service.wait_for_service()
        print("[AIM-INFO] Connect to service {} successfully.".format(service_name))
        while True:
            req = targetRequest()
            req.data = 0
            resp = service.call(req)
            if resp.pose != ():
                print('[AIM-INFO] Object detected')
                break
            elif self.use_hand_cam:
                print('[AIM-INFO] Cannot detect object...')
                self.go_to_camera_view_position()
            else:
                print('[AIM-INFO] Cannot detect object...')
        return resp.pose

    def get_object_location(self):
        obj_pos_cam_frame = self.locate_object()  # w.r.t. camera frame
        print("[AIM-DEBUG] Object in camera frame: (%.4f, %.4f, %.4f)" %
              (obj_pos_cam_frame[0], obj_pos_cam_frame[1], obj_pos_cam_frame[2]))
        if self.use_hand_cam:
            obj_pos_based_frame = list(obj_pos_cam_frame)
        else:
            obj_pos_homo = np.hstack([obj_pos_cam_frame, 1])
            obj_pos_based_frame = np.matmul(self.TRANSFORMATION_MATRIX, obj_pos_homo)
        print("[AIM-DEBUG] Object in based frame: (%.4f, %.4f, %.4f)" %
              (obj_pos_based_frame[0], obj_pos_based_frame[1], obj_pos_based_frame[2]))
        obj_pos_based_frame[2] = obj_pos_based_frame[2] + 0.15
        print("[AIM-DEBUG] Object in based frame with offset: (%.4f, %.4f, %.4f)" %
              (obj_pos_based_frame[0], obj_pos_based_frame[1], obj_pos_based_frame[2]))
        return list(obj_pos_based_frame[:3])

    def request_grasp(self, data):
        rospy.wait_for_service('grasping')
        execute_action = rospy.ServiceProxy('grasping', grasping, persistent=True)
        execute_action(data)

    def move_to_pos(self, goal):
        self.env.wrapped_env._state_goal = np.array(goal)
        print('[AIM-INFO] Moving to reset position...')
        for _ in range(5):
            self.env.reset()
        print('[AIM-INFO] Starting move to target position...')
        run_policy(self.env, get_action, 15, 1, False, grasp=True)
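A hypothetical driving loop for the class above, assembled only from the methods it defines (get_object_location, move_to_pos, request_grasp, go_to_place_position) and the msg_close/msg_open flags from __init__. The surrounding ROS node setup and the exact gripper-message semantics are assumptions; this is a usage sketch, not code from the original repository.

# Hypothetical usage sketch; assumes this runs inside an initialized ROS node
# and that request_grasp accepts the msg_close/msg_open flags defined above.
if __name__ == '__main__':
    grasper = ObjectGrasping(use_hand_cam=False)

    target_pos = grasper.get_object_location()   # object position in the base frame (with z offset)
    grasper.move_to_pos(target_pos)              # RL reaching policy drives the end-effector
    grasper.request_grasp(grasper.msg_close)     # close the gripper

    grasper.go_to_place_position()               # move to the drop-off joint configuration
    grasper.request_grasp(grasper.msg_open)      # open the gripper to release the object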
def experiment(variant):
    base_expl_env = PointMassEnv(n=variant["num_tasks"], reward_type=variant["reward_type"])
    expl_env = FlatGoalEnv(base_expl_env, append_goal_to_obs=True)
    base_eval_env = PointMassEnv(n=variant["num_tasks"], reward_type=variant["reward_type"])
    eval_env = FlatGoalEnv(base_eval_env, append_goal_to_obs=True)

    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size
    print(expl_env.observation_space, expl_env.action_space)

    qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs'])
    qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs'])
    target_qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs'])
    target_qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs'])
    policy = TanhMlpPolicy(input_size=obs_dim, output_size=action_dim, **variant['policy_kwargs'])
    target_policy = TanhMlpPolicy(input_size=obs_dim, output_size=action_dim, **variant['policy_kwargs'])

    es = GaussianStrategy(
        action_space=expl_env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # Constant sigma
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    eval_path_collector = MdpPathCollector(eval_env, policy)
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = TD3Trainer(policy=policy,
                         qf1=qf1, qf2=qf2,
                         target_qf1=target_qf1, target_qf2=target_qf2,
                         target_policy=target_policy,
                         **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.train()
def experiment(variant):
    seed = variant['seed']
    n_parallel = variant['n_parallel']
    log_dir = variant['log_dir']
    setup(seed, n_parallel, log_dir)

    fast_learning_rate = variant['flr']
    fast_batch_size = variant['fbs']  # 10 works for [0.1, 0.2], 20 doesn't improve much for [0, 0.2]
    meta_batch_size = 20  # 10 also works, but much less stable; 20 is fairly stable, 40 is more stable
    max_path_length = 150
    num_grad_updates = 1
    meta_step_size = variant['mlr']

    tasksFile = '/root/code/multiworld/multiworld/envs/goals/Door_60X20X20.pkl'
    tasks = pickle.load(open(tasksFile, 'rb'))

    baseEnv = SawyerDoorOpenEnv(tasks=tasks)
    env = FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=['state_observation']))
    env = TfEnv(NormalizedBoxEnv(env))

    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        grad_step_size=fast_learning_rate,
        hidden_nonlinearity=tf.nn.relu,
        hidden_sizes=variant['hidden_sizes'],
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = MAMLTRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=fast_batch_size,  # number of trajs for grad update
        max_path_length=max_path_length,
        meta_batch_size=meta_batch_size,
        num_grad_updates=num_grad_updates,
        n_itr=1000,
        use_maml=True,
        step_size=meta_step_size,
        plot=False,
    )
    # import os
    # saveDir = variant['saveDir']
    # if os.path.isdir(saveDir) == False:
    #     os.mkdir(saveDir)
    # logger.set_snapshot_dir(saveDir)
    # # logger.set_snapshot_gap(20)
    # logger.add_tabular_output(saveDir + 'progress.csv')
    algo.train()
def experiment(variant):
    seed = variant['seed']
    n_parallel = variant['n_parallel']
    log_dir = variant['log_dir']
    setup(seed, n_parallel, log_dir)

    expertDataLoc = variant['expertDataLoc']
    expertDataItr = variant['expertDataItr']
    fast_learning_rate = variant['flr']
    fast_batch_size = variant['fbs']  # 10 works for [0.1, 0.2], 20 doesn't improve much for [0, 0.2]
    meta_batch_size = 20  # 10 also works, but much less stable; 20 is fairly stable, 40 is more stable
    max_path_length = 150
    num_grad_updates = 1
    meta_step_size = variant['mlr']

    regionSize = variant['regionSize']
    if regionSize == '20X20':
        tasksFile = '/root/code/multiworld/multiworld/envs/goals/pickPlace_20X20_v1.pkl'
    else:
        assert regionSize == '60X30'
        tasksFile = '/root/code/multiworld/multiworld/envs/goals/PickPlace_60X30.pkl'
    tasks = pickle.load(open(tasksFile, 'rb'))

    envType = variant['envType']
    if envType == 'Push':
        baseEnv = SawyerPushEnv(tasks=tasks)
    else:
        assert envType == 'PickPlace'
        baseEnv = SawyerPickPlaceEnv(tasks=tasks)
    env = FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=['state_observation']))
    env = TfEnv(NormalizedBoxEnv(env))

    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        grad_step_size=fast_learning_rate,
        hidden_nonlinearity=tf.nn.relu,
        hidden_sizes=variant['hidden_sizes'],
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = MAMLTRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=fast_batch_size,  # number of trajs for grad update
        max_path_length=max_path_length,
        meta_batch_size=meta_batch_size,
        num_grad_updates=num_grad_updates,
        n_itr=1000,
        use_maml=True,
        step_size=meta_step_size,
        plot=False,
        numExpertPolicies=20,
        expertDataInfo={
            'expert_loc': expertDataLoc,
            'expert_itr': expertDataItr
        })
    algo.train()
def experiment(variant, comet_exp_key=None):
    comet_logger = None
    if comet_exp_key is not None:
        # from rllab.misc.comet_logger import CometContinuedLogger, CometLogger
        # from comet_ml import Experiment, ExistingExperiment
        # comet_log = CometContinuedLogger(api_key="KWwx7zh6I2uw6oQMkpEo3smu0",
        #                                  previous_experiment_key=variant['comet_exp_key'])
        comet_logger = ExistingExperiment(
            api_key="KWwx7zh6I2uw6oQMkpEo3smu0",
            previous_experiment=variant['comet_exp_key'])
        # comet_log = CometLogger(api_key="KWwx7zh6I2uw6oQMkpEo3smu0",
        #                         project_name="ml4l3", workspace="glenb")
        comet_logger.set_name("test seq train")
        # comet_log = comet_exp_key
        print("RL!: ", comet_logger)
    print("%%%%%%%%%%%%%%%%%", comet_logger)

    seed = variant['seed']
    log_dir = variant['log_dir']
    n_parallel = variant['n_parallel']
    setup(seed, n_parallel, log_dir)

    init_file = variant['init_file']
    taskIndex = variant['taskIndex']
    n_itr = variant['n_itr']
    default_step = variant['default_step']
    policyType = variant['policyType']
    envType = variant['envType']

    tasksFile = path_to_multiworld + '/multiworld/envs/goals/' + variant['tasksFile'] + '.pkl'
    tasks = pickle.load(open(tasksFile, 'rb'))
    max_path_length = variant['max_path_length']

    use_images = 'conv' in policyType
    print("$$$$$$$$$$$$$$$ RL-TASK: ", str(tasks[taskIndex]), " $$$$$$$$$$$$$$$")

    if 'MultiDomain' in envType:
        baseEnv = Sawyer_MultiDomainEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'Push' in envType:
        baseEnv = SawyerPushEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'PickPlace' in envType:
        baseEnv = SawyerPickPlaceEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'Door' in envType:
        baseEnv = SawyerDoorOpenEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'Ant' in envType:
        env = TfEnv(normalize(AntEnvRandGoalRing()))
    elif 'Coffee' in envType:
        baseEnv = SawyerCoffeeEnv(mpl=max_path_length)
    else:
        raise AssertionError('Unsupported envType: ' + str(envType))

    if envType in ['Push', 'PickPlace', 'Door']:
        obs_keys = ['img_observation'] if use_images else ['state_observation']
        env = TfEnv(
            NormalizedBoxEnv(
                FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=obs_keys),
                            reset_mode='idx')))

    baseline = ZeroBaseline(env_spec=env.spec)
    # baseline = LinearFeatureBaseline(env_spec=env.spec)
    batch_size = variant['batch_size']

    if policyType == 'fullAda_Bias':
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = vpg_fullADA(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            # noise_opt=True,
            default_step=default_step,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            # reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir,
            comet_logger=comet_logger,
            outer_iteration=variant['outer_iteration'])
    elif policyType == 'biasAda_Bias':
        algo = vpg_biasADA(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            # noise_opt=True,
            default_step=default_step,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            # reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir)
    elif policyType == 'basic':
        algo = vpg_basic(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,
            max_path_length=max_path_length,
            n_itr=n_itr,
            # step_size=10.0,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            reset_arg=taskIndex,
            optimizer=None,
            optimizer_args={
                'init_learning_rate': default_step,
                'tf_optimizer_args': {
                    'learning_rate': 0.5 * default_step
                },
                'tf_optimizer_cls': tf.train.GradientDescentOptimizer
            },
            log_dir=log_dir
            # extra_input="onehot_exploration",  # added by RK 6/19
            # extra_input_dim=5,  # added by RK 6/19
        )
    elif 'conv' in policyType:
        algo = vpg_conv(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            # noise_opt=True,
            default_step=default_step,
            # reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir)
    else:
        raise AssertionError(
            'policyType must be fullAda_Bias, biasAda_Bias, basic, or conv')

    algo.train()
def experiment(variant):
    seed = variant['seed']
    n_parallel = 1
    log_dir = variant['log_dir']
    setup(seed, n_parallel, log_dir)

    fast_batch_size = variant['fbs']
    meta_batch_size = variant['mbs']
    adam_steps = variant['adam_steps']
    max_path_length = variant['max_path_length']
    dagger = variant['dagger']
    expert_policy_loc = variant['expert_policy_loc']
    ldim = variant['ldim']
    init_flr = variant['init_flr']
    policyType = variant['policyType']
    use_maesn = variant['use_maesn']
    EXPERT_TRAJ_LOCATION = variant['expertDataLoc']
    envType = variant['envType']

    tasksFile = path_to_multiworld + 'multiworld/envs/goals/' + variant['tasksFile'] + '.pkl'
    all_tasks = pickle.load(open(tasksFile, 'rb'))
    assert meta_batch_size <= len(all_tasks)
    tasks = all_tasks[:meta_batch_size]

    use_images = 'conv' in policyType
    if 'Push' == envType:
        baseEnv = SawyerPushEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif envType == 'sparsePush':
        baseEnv = SawyerPushEnv(tasks=tasks, image=use_images, mpl=max_path_length, rewMode='l2Sparse')
    elif 'PickPlace' in envType:
        baseEnv = SawyerPickPlaceEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'Door' in envType:
        baseEnv = SawyerDoorOpenEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'Ant' in envType:
        env = TfEnv(normalize(AntEnvRandGoalRing()))
    elif 'claw' in envType:
        env = TfEnv(DClawScrewRandGoal())
    else:
        assert False, "Unsupported envType: " + str(envType)

    if envType in ['Push', 'PickPlace', 'Door']:
        obs_keys = ['img_observation'] if use_images else ['state_observation']
        env = TfEnv(
            NormalizedBoxEnv(
                FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=obs_keys),
                            reset_mode='idx')))

    algoClass = MAMLIL
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    load_policy = variant['load_policy']
    if load_policy is not None:
        policy = None
        load_policy = variant['load_policy']
        # if 'conv' in load_policy:
        #     baseline = ZeroBaseline(env_spec=env.spec)
    elif 'fullAda_Bias' in policyType:
        policy = fullAda_Bias_policy(name="policy",
                                     env_spec=env.spec,
                                     grad_step_size=init_flr,
                                     hidden_nonlinearity=tf.nn.relu,
                                     hidden_sizes=(100, 100),
                                     init_flr_full=init_flr,
                                     latent_dim=ldim)
    elif 'biasAda_Bias' in policyType:
        policy = biasAda_Bias_policy(name="policy",
                                     env_spec=env.spec,
                                     grad_step_size=init_flr,
                                     hidden_nonlinearity=tf.nn.relu,
                                     hidden_sizes=(100, 100),
                                     init_flr_full=init_flr,
                                     latent_dim=ldim)
    elif 'basic' in policyType:
        policy = basic_policy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        )
    elif 'conv' in policyType:
        baseline = ZeroBaseline(env_spec=env.spec)
        policy = conv_policy(
            name="policy",
            latent_dim=ldim,
            policyType=policyType,
            env_spec=env.spec,
            init_flr=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            extra_input_dim=(0 if extra_input is "" else extra_input_dim),
        )

    algo = algoClass(
        env=env,
        policy=policy,
        load_policy=load_policy,
        baseline=baseline,
        batch_size=fast_batch_size,  # number of trajs for alpha grad update
        max_path_length=max_path_length,
        meta_batch_size=meta_batch_size,  # number of tasks sampled for beta grad update
        num_grad_updates=num_grad_updates,  # number of alpha grad updates
        n_itr=1,  # 100
        make_video=False,
        use_maml=True,
        use_pooled_goals=True,
        use_corr_term=use_corr_term,
        test_on_training_goals=test_on_training_goals,
        metalearn_baseline=False,
        # metalearn_baseline=False,
        limit_demos_num=limit_demos_num,
        test_goals_mult=1,
        step_size=meta_step_size,
        plot=False,
        beta_steps=beta_steps,
        adam_curve=None,
        adam_steps=adam_steps,
        pre_std_modifier=pre_std_modifier,
        l2loss_std_mult=l2loss_std_mult,
        importance_sampling_modifier=MOD_FUNC[''],
        post_std_modifier=post_std_modifier,
        expert_trajs_dir=EXPERT_TRAJ_LOCATION,
        expert_trajs_suffix='',
        seed=seed,
        extra_input=extra_input,
        extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        plotDirPrefix=None,
        latent_dim=ldim,
        dagger=dagger,
        expert_policy_loc=expert_policy_loc)
    algo.train()
num_imSteps = 50
use_maml = True
ratio = '_5_1'
expertDataLoc = '/home/russellm/mri_onPolicy/expertPolicyWeights/TRPO-push-20X20-v1/'
expertDataItr = 300

for meta_batch_size in meta_batch_sizes:
    for fast_learning_rate in fast_learning_rates:
        for fast_batch_size in fast_batch_sizes:
            stub(globals())

            baseEnv = SawyerPushEnv(tasks=None)
            env = FinnMamlEnv(
                FlatGoalEnv(baseEnv, obs_keys=['state_observation']))
            env = TfEnv(NormalizedBoxEnv(env))

            policy = MAMLGaussianMLPPolicy(
                name="policy",
                env_spec=env.spec,
                grad_step_size=fast_learning_rate,
                hidden_nonlinearity=tf.nn.relu,
                hidden_sizes=(100, 100),
            )
            baseline = LinearFeatureBaseline(env_spec=env.spec)
            algo = MAMLTRPO(
                env=env,
                policy=policy,
def action_space(self):
    return FlatGoalEnv.action_space(self)
parser.add_argument('--use_tensorboard', action='store_true')
parser.add_argument('--logdir', type=str, default='./logs/ddpg_test')
parser.add_argument('--exp_name', type=str, default='evaluate')
parser.add_argument('--env', type=str, default='SawyerReachXYEnv-v1')
args = parser.parse_args()

_, get_action = load_policy(args.saved_model,
                            args.itr if args.itr >= 0 else 'last',
                            args.deterministic)
tensor_board = None

env = SawyerReachXYZEnv(
    action_mode='position',
    position_action_scale=0.1,
    config_name='austri_config',
    reset_free=False,
    max_speed=0.05,
    fix_goal=False,
    fixed_goal=(0.53, 0.0, 0.15)
)
env = FlatGoalEnv(env, append_goal_to_obs=True)
env.reset()

logdir_ext = os.path.join(args.logdir + '_' + args.env + '_evaluate')
if not os.path.exists(logdir_ext):
    os.mkdir(logdir_ext)
if args.use_tensorboard:
    tensor_board = SummaryWriter(logdir_ext)
logger_kwargs = setup_logger_kwargs(exp_name=args.exp_name, data_dir=logdir_ext)

run_policy(env, get_action, args.len, args.episodes, args.render, tensor_board)