def simulate_policy(args):
    """Load a pickled policy snapshot and roll it out on SawyerReach."""
    with tf.Session() as sess:
        data = joblib.load(args.file)
        if 'algo' in data:
            policy = data['algo'].policy
            # env = data['algo'].env
        else:
            policy = data['policy']
            # env = data['env']

        render = True
        random_arm_init = [0.0, 0.0]
        reward_shaping = True
        horizon = 1000
        env = normalize(
            CRLWrapper(
                # IKWrapper(
                SawyerReach(
                    # playable params
                    random_arm_init=random_arm_init,
                    seed=True,
                    has_renderer=render,
                    reward_shaping=reward_shaping,
                    horizon=horizon,
                    # constant params
                    has_offscreen_renderer=False,
                    use_camera_obs=False,
                    use_object_obs=True,
                    control_freq=100,
                )
                # )
            )
        )

        mean_reward = []
        # while True:
        with policy.deterministic(args.deterministic):
            for _ in range(args.trials):
                path = rollout(env, policy,
                               max_path_length=horizon - 1,
                               animated=True,
                               speedup=1.5,
                               always_return_paths=True)
                mean_reward.append(np.sum(path["rewards"]))
        print("Average Return {}+/-{}".format(np.mean(mean_reward),
                                              np.std(mean_reward)))
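# A minimal entry point is sketched below. This section does not show the
# original argument parsing, so the flag names (--trials, --deterministic) and
# the positional `file` argument are assumptions inferred from how `args` is
# used inside simulate_policy above.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('file', type=str,
                        help='path to the .pkl snapshot to roll out')
    parser.add_argument('--trials', type=int, default=10,
                        help='number of evaluation rollouts to average over')
    parser.add_argument('--deterministic', action='store_true',
                        help='use the deterministic (mean) action of the policy')
    simulate_policy(parser.parse_args())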
def run_experiment(param):
    """Train a flat SAC policy (no sub-level primitives) on SawyerReach."""
    random_arm_init = [-0.1, 0.1]
    lower_goal_range = [-0.1, -0.1, -0.1]
    upper_goal_range = [0.1, 0.1, 0.1]
    render = False
    reward_shaping = True
    horizon = 250
    env = normalize(
        CRLWrapper(
            # IKWrapper(
            SawyerReach(
                # playable params
                random_arm_init=random_arm_init,
                lower_goal_range=lower_goal_range,
                upper_goal_range=upper_goal_range,
                has_renderer=render,
                reward_shaping=reward_shaping,
                horizon=horizon,
                # constant params
                has_offscreen_renderer=False,
                use_camera_obs=False,
                use_object_obs=True,
                control_freq=100,
            )
            # )
        )
    )

    replay_buffer_params = {
        'max_replay_buffer_size': 1e6,
    }
    sampler_params = {
        'max_path_length': horizon - 1,
        'min_pool_size': 1000,
        'batch_size': 256,
    }
    pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params)
    sampler = SimpleSampler(**sampler_params)

    base_kwargs = dict(
        {
            'epoch_length': 1500,
            'n_train_repeat': 1,
            'n_initial_exploration_steps': 5000,
            'eval_render': False,
            'eval_n_episodes': 1,
            'eval_deterministic': True,
            'n_epochs': 2e3,
        },
        sampler=sampler)

    M = 64
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)
    policy = GaussianPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=(64, 64),
        reparameterize=True,
        reg=1e-3,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=3e-4,
        scale_reward=20,
        discount=0.99,
        tau=0.005,
        reparameterize=True,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
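# The `param` argument is unused above, so a bare call is enough to launch
# training. A minimal sketch of an entry point, assuming this function lives in
# its own training script:
if __name__ == '__main__':
    run_experiment(None)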
def run_experiment(variant):
    """Train the compositional SAC agent (pointer policy over IK primitives) on SawyerReach."""
    sub_level_policies_paths = []
    # args = parse_args()
    args = arg()
    if args.domain == 'sawyer-reach':
        goal_size = 0
        # per-axis inverse-kinematics primitives
        sub_level_policies_paths.append("ikx")
        sub_level_policies_paths.append("iky")
        sub_level_policies_paths.append("ikz")
        random_arm_init = [-0.1, 0.1]
        render = False
        reward_shaping = True
        horizon = 250
        env = normalize(
            CRLWrapper(
                IKWrapper(
                    SawyerReach(
                        # playable params
                        random_arm_init=random_arm_init,
                        has_renderer=render,
                        reward_shaping=reward_shaping,
                        horizon=horizon,
                        # constant params
                        has_offscreen_renderer=False,
                        use_camera_obs=False,
                        use_object_obs=True,
                        control_freq=100,
                    ))))
    else:
        raise ValueError("Domain not available")

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=1e6,
        seq_len=len(sub_level_policies_paths),
    )
    sampler = SimpleSampler(
        max_path_length=horizon - 1,  # should be same as horizon
        min_pool_size=1000,
        batch_size=256)

    base_kwargs = dict(
        epoch_length=1000,
        n_epochs=2e3,
        # n_epochs=5,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
        sampler=sampler)

    M = 128
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)
    policy = GaussianPtrPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=(M, M),
        reparameterize=True,
        reg=1e-3,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        g=goal_size,
        policy=policy,
        sub_level_policies_paths=sub_level_policies_paths,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=3e-4,
        scale_reward=5,
        discount=0.99,
        tau=0.005,
        reparameterize=True,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
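# `arg()` is referenced above but not defined in this section. A plausible
# sketch, assuming it wraps argparse and exposes the attributes the code reads
# (`domain` here, and `demo` in the variant further below); the exact flag
# names and defaults are guesses.
def arg():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--domain', type=str, default='sawyer-reach',
                        help='task to train on, e.g. sawyer-reach')
    parser.add_argument('--demo', action='store_true',
                        help='use the demonstration replay buffer (DemoReplayBuffer)')
    return parser.parse_args()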
def simulate_policy(args):
    """Roll out a trained compositional policy over its sub-level primitives."""
    paths = []
    sub_level_policies = []
    sub_level_policies_paths = []
    render = True
    if args.domain == 'sawyer-reach':
        print("Composition Reach")
        goal_size = 0
        sub_level_policies_paths.append("ikx")
        sub_level_policies_paths.append("iky")
        sub_level_policies_paths.append("ikz")
        random_arm_init = [-0.1, 0.1]
        reward_shaping = True
        horizon = 250
        env = normalize(
            CRLWrapper(
                IKWrapper(
                    SawyerReach(
                        # playable params
                        random_arm_init=random_arm_init,
                        has_renderer=render,
                        reward_shaping=reward_shaping,
                        horizon=horizon,
                        # constant params
                        has_offscreen_renderer=False,
                        use_camera_obs=False,
                        use_object_obs=True,
                        control_freq=100,
                    ))))
    elif args.domain == 'sawyer-reach-pick':
        print("Composition Reach and Pick")
        goal_size = 3
        sub_level_policies_paths.append(
            "log/prim/pick/2019-08-14-18-18-17-370041-PDT/itr_2000.pkl")
        sub_level_policies_paths.append(
            "log/prim/reach/2019-08-20-15-52-39-191438-PDT/itr_2000.pkl")
        random_arm_init = [-0.0001, 0.0001]
        reward_shaping = False
        horizon = 1000
        env = normalize(
            CRLWrapper(
                SawyerReachPick(
                    # playable params
                    random_arm_init=random_arm_init,
                    has_renderer=render,
                    reward_shaping=reward_shaping,
                    horizon=horizon,
                    # constant params
                    has_offscreen_renderer=False,
                    use_camera_obs=False,
                    use_object_obs=True,
                    control_freq=100,
                )))
    elif args.domain == 'sawyer-reach-pick-simple':
        print("Composition Reach and Pick Simple")
        goal_size = 3
        sub_level_policies_paths.append(
            "log/prim/pick/2019-08-14-18-18-17-370041-PDT/itr_2000.pkl")
        sub_level_policies_paths.append(
            "log/prim/reach/2019-08-20-15-52-39-191438-PDT/itr_2000.pkl")
        render = True
        random_arm_init = [-0.0001, 0.0001]
        reward_shaping = False
        horizon = 500
        env = normalize(
            CRLWrapper(
                SawyerReachPick(
                    # playable params
                    random_arm_init=random_arm_init,
                    has_renderer=render,
                    reward_shaping=reward_shaping,
                    horizon=horizon,
                    placement_initializer=UniformRandomSampler(
                        x_range=[-0.02, 0.02],
                        y_range=[-0.02, 0.02],
                        ensure_object_boundary_in_range=False,
                        z_rotation=None,
                    ),
                    # constant params
                    has_offscreen_renderer=False,
                    use_camera_obs=False,
                    use_object_obs=True,
                    control_freq=100,
                )))
    else:
        raise ValueError("Domain not available")

    with tf.Session() as sess:
        # Load each sub-level primitive: "ik<axis>" strings become scripted
        # inverse-kinematics policies; anything else is a pickled policy snapshot.
        for p, path in enumerate(sub_level_policies_paths):
            with tf.variable_scope(str(p), reuse=False):
                if path.startswith('ik'):
                    sub_level_policies.append(IK_Policy(path[2]))  # path[2] is the axis: 'x', 'y', or 'z'
                else:
                    policy_snapshot = joblib.load(path)
                    sub_level_policies.append(policy_snapshot["policy"])

        data = joblib.load(args.file)
        if 'algo' in data:
            policy = data['algo'].policy
            # env = data['algo'].env
        else:
            policy = data['policy']
            # env = data['env']

        mean_reward = []
        with policy.deterministic(args.deterministic):
            for _ in range(args.trials):
                path = rollout(env, policy, sub_level_policies,
                               path_length=horizon - 1, g=goal_size)
                # if np.sum(path["rewards"]) >= args.reward_min:
                #     paths.append(dict(
                #         observations=env.observation_space.flatten_n(path["observations"]),
                #         actions=env.observation_space.flatten_n(path["actions"]),
                #         rewards=tensor_utils.stack_tensor_list(path["rewards"]),
                #         env_infos=path["env_infos"],
                #         agent_infos=path["agent_infos"],
                #         goal=path["goal"],
                #     ))
                mean_reward.append(np.sum(path["rewards"]))
        print("Average Return {}+/-{}".format(np.mean(mean_reward),
                                              np.std(mean_reward)))
        # fileName = osp.join(args.FilePath, 'itr.pkl')
        # joblib.dump(paths, fileName, compress=3)
    return env
def run_experiment(variant):
    """Train the compositional SAC agent over scripted IK and pre-trained pickled primitives."""
    sub_level_policies_paths = []
    args = arg()
    if args.domain == 'sawyer-reach':
        print("Composition Reach")
        goal_size = 0
        sub_level_policies_paths.append("ikx")
        sub_level_policies_paths.append("iky")
        sub_level_policies_paths.append("ikz")
        random_arm_init = [-0.1, 0.1]
        render = False
        reward_shaping = True
        horizon = 250
        env = normalize(
            CRLWrapper(
                IKWrapper(
                    SawyerReach(
                        # playable params
                        random_arm_init=random_arm_init,
                        has_renderer=render,
                        reward_shaping=reward_shaping,
                        horizon=horizon,
                        # constant params
                        has_offscreen_renderer=False,
                        use_camera_obs=False,
                        use_object_obs=True,
                        control_freq=100,
                    ))))
        ep_length = 1500
    elif args.domain == 'sawyer-reach-pick':
        print("Composition Reach and Pick")
        goal_size = 3
        sub_level_policies_paths.append(
            "log/prim/pick/2019-08-14-18-18-17-370041-PDT/itr_2000.pkl")
        sub_level_policies_paths.append(
            "log/prim/reach/2019-08-20-15-52-39-191438-PDT/itr_2000.pkl")
        render = False
        random_arm_init = [-0.0001, 0.0001]
        reward_shaping = False
        horizon = 1000
        env = normalize(
            CRLWrapper(
                SawyerReachPick(
                    # playable params
                    random_arm_init=random_arm_init,
                    has_renderer=render,
                    reward_shaping=reward_shaping,
                    horizon=horizon,
                    # constant params
                    has_offscreen_renderer=False,
                    use_camera_obs=False,
                    use_object_obs=True,
                    control_freq=100,
                )))
        ep_length = 1500
    elif args.domain == 'sawyer-reach-pick-simple':
        print("Composition Reach and Pick Simple")
        goal_size = 3
        sub_level_policies_paths.append(
            "log/prim/pick/2019-08-14-18-18-17-370041-PDT/itr_2000.pkl")
        sub_level_policies_paths.append(
            "log/prim/reach/2019-08-20-15-52-39-191438-PDT/itr_2000.pkl")
        render = False
        random_arm_init = [-0.0001, 0.0001]
        reward_shaping = False
        horizon = 500
        env = normalize(
            CRLWrapper(
                SawyerReachPick(
                    # playable params
                    random_arm_init=random_arm_init,
                    has_renderer=render,
                    reward_shaping=reward_shaping,
                    horizon=horizon,
                    placement_initializer=UniformRandomSampler(
                        x_range=[-0.01, 0.01],
                        y_range=[-0.01, 0.01],
                        ensure_object_boundary_in_range=False,
                        z_rotation=None,
                    ),
                    # constant params
                    has_offscreen_renderer=False,
                    use_camera_obs=False,
                    use_object_obs=True,
                    control_freq=100,
                )))
        ep_length = 3000
    else:
        raise ValueError("Domain not available")

    if args.demo:
        pool = DemoReplayBuffer(
            env_spec=env.spec,
            max_replay_buffer_size=1e6,
            seq_len=len(sub_level_policies_paths),
        )
    else:
        pool = SimpleReplayBuffer(
            env_spec=env.spec,
            max_replay_buffer_size=1e6,
            seq_len=len(sub_level_policies_paths),
        )
    sampler = SimpleSampler(
        max_path_length=horizon - 1,  # should be same as horizon
        min_pool_size=1000,
        batch_size=256)

    base_kwargs = dict(
        epoch_length=ep_length,
        n_epochs=5e3,
        # n_epochs=5,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
        sampler=sampler,
        use_demos=args.demo,
    )

    M = 128
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)
    policy = GaussianPtrPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=(M, M),
        reparameterize=True,
        reg=1e-3,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        g=goal_size,
        policy=policy,
        sub_level_policies_paths=sub_level_policies_paths,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=3e-4,
        scale_reward=5,
        discount=0.99,
        tau=0.005,
        reparameterize=True,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
# reset env to get initial obs = initial state
render = False
random_arm_init = [-0.1, 0.1]
reward_shaping = True
horizon = 250
env = normalize(
    CRLWrapper(
        # IKWrapper(
        SawyerReach(
            # playable params
            random_arm_init=random_arm_init,
            seed=True,
            has_renderer=render,
            reward_shaping=reward_shaping,
            horizon=horizon,
            # constant params
            has_offscreen_renderer=False,
            use_camera_obs=False,
            use_object_obs=True,
            control_freq=100,
        )
        # )
    ))

obs = env.reset()
ik = inverse_kin(env)
start = obs[0:3]  # use the first three observation entries as the start position

# Sample candidate targets on a sphere of radius 0.1 centered on the start position.
points = fibonacci_sphere(0.1, 1000)
x_actions = np.array([p[0] for p in points]) + start[0]
y_actions = np.array([p[1] for p in points]) + start[1]
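# `fibonacci_sphere` is called above but not defined in this section. A standard
# Fibonacci-lattice implementation is sketched here, assuming the signature
# fibonacci_sphere(radius, samples) -> list of (x, y, z) points spread
# quasi-uniformly over the sphere (numpy assumed imported as np, as elsewhere
# in this file).
def fibonacci_sphere(radius=1.0, samples=1000):
    points = []
    golden_angle = np.pi * (3.0 - np.sqrt(5.0))  # ~2.39996 rad
    for i in range(samples):
        # z runs from +1 to -1 so the points cover both hemispheres
        z = 1.0 - 2.0 * i / float(samples - 1)
        r = np.sqrt(max(0.0, 1.0 - z * z))  # radius of the circle at height z
        theta = golden_angle * i            # azimuth advances by the golden angle
        points.append((radius * r * np.cos(theta),
                       radius * r * np.sin(theta),
                       radius * z))
    return points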