Example 1
def example():
    # initialize the environment
    task_gen = generate_task(task_generator_id='pushing')
    env = CausalWorld(task_gen, skip_frame=10, enable_visualization=True)

    # define a custom curriculum of interventions:

    # No intervention actor is active until episode 5
    # Goal intervention actor from episode 5 to 10, applied after reset at time step 0
    # Visual intervention actor from episode 10 to 20, every two episodes, applied after reset at time step 0
    # Random intervention actor from episode 20 to 25, applied after reset at time step 0
    # Goal intervention actor from episode 25 to 30, applied at time step 50 of each episode

    env = CurriculumWrapper(env,
                            intervention_actors=[
                                GoalInterventionActorPolicy(),
                                VisualInterventionActorPolicy(),
                                RandomInterventionActorPolicy(),
                                GoalInterventionActorPolicy()
                            ],
                            actives=[(5, 10, 1, 0), (10, 20, 2, 0),
                                     (20, 25, 1, 0), (25, 30, 1, 50)])

    for reset_idx in range(30):
        obs = env.reset()
        for time in range(100):
            desired_action = env.action_space.sample()
            obs, reward, done, info = env.step(action=desired_action)
    env.close()
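Each tuple in actives pairs with the intervention actor at the same index and reads as (episode start, episode end, episode periodicity, time step of the intervention), which is what the comments above describe. A minimal sketch of that activation rule, assuming the periodicity is counted from the window start (actor_is_active is an illustrative helper, not part of the CausalWorld API):

# Illustrative helper, not part of CausalWorld: decides whether an actor whose
# activation tuple is (start, stop, period, step) should intervene now.
def actor_is_active(active, episode, time_step):
    start, stop, period, at_step = active
    return (start <= episode < stop
            and (episode - start) % period == 0
            and time_step == at_step)

# The visual actor registered with (10, 20, 2, 0) fires at reset of episodes 10, 12, ..., 18.
assert actor_is_active((10, 20, 2, 0), episode=12, time_step=0)
assert not actor_is_active((10, 20, 2, 0), episode=13, time_step=0)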
Example 2
def example():
    # This tutorial shows how to view policies of trained actors

    task = generate_task(task_generator_id='pick_and_place')
    world_params = dict()
    world_params["skip_frame"] = 3
    world_params["seed"] = 0
    stable_baselines_policy_path = "./model_100000000_steps.zip"
    model = PPO2.load(stable_baselines_policy_path)

    # define the policy function of your trained model
    def policy_fn(obs):
        return model.predict(obs, deterministic=True)[0]

    # view the trained policy interactively in the GUI
    viewer.view_policy(task=task,
                       world_params=world_params,
                       policy_fn=policy_fn,
                       max_time_steps=40 * 600,
                       number_of_resets=40,
                       env_wrappers=[CurriculumWrapper],
                       env_wrappers_args=[{
                           'intervention_actors':
                           [GoalInterventionActorPolicy()],
                           'actives': [(0, 1000000000, 1, 0)]
                       }])
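This snippet is not self-contained: it assumes a PPO2 model class from stable_baselines and a task viewer module bound to the name viewer, alongside the curriculum classes used above. A best guess at the required imports (module paths are assumptions and should be checked against the installed versions):

# Assumed imports for the snippet above; verify the paths against your installation.
from stable_baselines import PPO2
import causal_world.viewers.task_viewer as viewer
from causal_world.task_generators.task import generate_task
from causal_world.intervention_actors import GoalInterventionActorPolicy
from causal_world.wrappers.curriculum_wrappers import CurriculumWrapper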
Example 3
def train_policy(num_of_envs, log_relative_path, maximum_episode_length,
                 skip_frame, seed_num, her_config, total_time_steps,
                 validate_every_timesteps, task_name):
    task = generate_task(task_generator_id=task_name,
                         dense_reward_weights=np.array([100000, 0, 0, 0]),
                         fractional_reward_weight=0)
    env = CausalWorld(task=task,
                      skip_frame=skip_frame,
                      enable_visualization=False,
                      seed=seed_num,
                      max_episode_length=maximum_episode_length)
    env = HERGoalEnvWrapper(env)
    env = CurriculumWrapper(
        env,
        intervention_actors=[GoalInterventionActorPolicy()],
        actives=[(0, 1000000000, 1, 0)])
    set_global_seeds(seed_num)
    checkpoint_callback = CheckpointCallback(
        save_freq=int(validate_every_timesteps / num_of_envs),
        save_path=log_relative_path,
        name_prefix='model')
    model = HER(MlpPolicy,
                env,
                SAC,
                verbose=1,
                policy_kwargs=dict(layers=[256, 256, 256]),
                **her_config,
                seed=seed_num)
    model.learn(total_timesteps=total_time_steps,
                tb_log_name="her_sac",
                callback=checkpoint_callback)
    return
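her_config is forwarded unchanged to stable_baselines' HER wrapper. A hypothetical invocation, where the HER keys (n_sampled_goal, goal_selection_strategy) are standard stable_baselines arguments but every concrete value below is a placeholder:

# Hypothetical invocation of train_policy; all values are placeholders.
train_policy(num_of_envs=1,
             log_relative_path='her_sac_logs',
             maximum_episode_length=600,
             skip_frame=3,
             seed_num=0,
             her_config=dict(n_sampled_goal=4,
                             goal_selection_strategy='future'),
             total_time_steps=10000000,
             validate_every_timesteps=500000,
             # placeholder task; the dense_reward_weights hard-coded above must
             # match the number of dense reward terms of the chosen task
             task_name='reaching')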
Example 4
def simulate_policy():
    file = './her-sac-fetch-experiment/her-sac-fetch-experiment_2020_07_07_11_11_14_0000--s-0/params.pkl'
    data = torch.load(file)
    policy = data['evaluation/policy']
    policy.reset()

    def policy_func(obs):
        # new_obs = np.hstack((obs['observation'], obs['desired_goal']))
        a, agent_info = policy.get_action(obs)
        return a

    task = generate_task(task_generator_id='reaching')
    env = CausalWorld(task=task,
                      enable_visualization=True,
                      skip_frame=1,
                      seed=0,
                      max_episode_length=2500)
    env = CurriculumWrapper(env,
                            intervention_actors=[GoalInterventionActorPolicy()],
                            actives=[(0, 1000000000, 1, 0)])
    # env = HERGoalEnvWrapper(env)

    for _ in range(100):
        total_reward = 0
        o = env.reset()
        for _ in range(2500):
            o, reward, done, info = env.step(policy_func(o))
            total_reward += reward
        print("total reward is :", total_reward)
    env.close()
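The commented-out lines hint at the dict-observation case: if the environment were wrapped with HERGoalEnvWrapper, observations would arrive as a dict and need flattening before being passed to the flat-input rlkit policy. A sketch of that variant, reusing the policy object loaded above (the key names follow the Gym goal-env convention):

import numpy as np

def dict_policy_func(obs):
    # obs is expected to be a dict with 'observation' and 'desired_goal' entries,
    # as produced by HERGoalEnvWrapper; 'policy' is the rlkit policy loaded above.
    flat_obs = np.hstack((obs['observation'], obs['desired_goal']))
    action, _agent_info = policy.get_action(flat_obs)
    return action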
Example 5
def initialize_intervention_actors(actors_params):
    """

    :param actors_params:
    :return:
    """
    intervention_actors_list = []
    for actor_param in actors_params:
        if actor_param == 'random_actor':
            intervention_actors_list.append(
                RandomInterventionActorPolicy(**actors_params[actor_param]))
        elif actor_param == 'goal_actor':
            intervention_actors_list.append(
                GoalInterventionActorPolicy(**actors_params[actor_param]))
        elif actor_param == 'joints_actor':
            intervention_actors_list.append(
                JointsInterventionActorPolicy(**actors_params[actor_param]))
        elif actor_param == 'physical_properties_actor':
            intervention_actors_list.append(
                PhysicalPropertiesInterventionActorPolicy(
                    **actors_params[actor_param]))
        elif actor_param == 'rigid_pose_actor':
            intervention_actors_list.append(
                RigidPoseInterventionActorPolicy(**actors_params[actor_param]))
        elif actor_param == 'visual_actor':
            intervention_actors_list.append(
                VisualInterventionActorPolicy(**actors_params[actor_param]))
        else:
            raise Exception("The intervention actor {} can't be loaded".format(
                actor_param))
    return intervention_actors_list
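A hypothetical actors_params dict for this helper: keys select the actor type and the nested dicts are forwarded as constructor keyword arguments (empty dicts fall back to the defaults):

# Hypothetical configuration for initialize_intervention_actors.
actors_params = {
    'goal_actor': {},
    'visual_actor': {},
}
actors = initialize_intervention_actors(actors_params)
# actors now holds [GoalInterventionActorPolicy(...), VisualInterventionActorPolicy(...)]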
Example 6
def _make_env(rank):
    task = generate_task('pushing',
                         dense_reward_weights=np.array([2500, 2500, 0]),
                         variables_space='space_a',
                         fractional_reward_weight=100)
    env = CausalWorld(task=task,
                      skip_frame=3,
                      enable_visualization=False,
                      seed=0 + rank)
    env = CurriculumWrapper(
        env,
        intervention_actors=[GoalInterventionActorPolicy()],
        actives=[(0, 1e9, 2, 0)])
    env = GymEnvWrapper(env)
    return env
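_make_env offsets the seed by rank, which suggests it is meant to be fanned out over parallel workers. A sketch of that usage with stable_baselines' SubprocVecEnv (the worker count of 4 is an arbitrary choice):

# Sketch: build a vectorized environment from _make_env with 4 parallel workers.
from stable_baselines.common.vec_env import SubprocVecEnv

num_of_envs = 4
vec_env = SubprocVecEnv(
    [lambda rank=rank: _make_env(rank) for rank in range(num_of_envs)])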
Example 7
def example():
    # initialize the environment
    task = generate_task(task_generator_id='pick_and_place')
    env = CausalWorld(task, skip_frame=10, enable_visualization=True)

    # define a custom curriculum of interventions:
    # Goal intervention actor applied after reset of every episode

    env = CurriculumWrapper(
        env,
        intervention_actors=[GoalInterventionActorPolicy()],
        actives=[(0, 1000000000, 1, 0)])

    for reset_idx in range(30):
        obs = env.reset()
        for time in range(300):
            obs, reward, done, info = env.step(env.action_space.low)
    env.close()
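As in Example 1, the snippet assumes the CausalWorld imports are already in scope. A best guess at what they look like, plus a trivial entry point (module paths should be checked against the installed version):

# Assumed imports; verify the module paths against your causal_world version.
from causal_world.task_generators.task import generate_task
from causal_world.envs.causalworld import CausalWorld
from causal_world.intervention_actors import GoalInterventionActorPolicy
from causal_world.wrappers.curriculum_wrappers import CurriculumWrapper

if __name__ == '__main__':
    example()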
Example 8
def baseline_model(model_num, task):
    if task == 'pushing':
        benchmarks = utils.sweep('benchmarks', [PUSHING_BENCHMARK])
        task_configs = [{
            'task_configs': {
                'variables_space': 'space_a',
                'fractional_reward_weight': 1,
                'dense_reward_weights': [750, 250, 0]
            }
        }]
    elif task == 'picking':
        benchmarks = utils.sweep('benchmarks', [PICKING_BENCHMARK])
        task_configs = [{
            'task_configs': {
                'variables_space': 'space_a',
                'fractional_reward_weight': 1,
                'dense_reward_weights': [250, 0, 125, 0, 750, 0, 0, 0.005]
            }
        }]
    elif task == 'pick_and_place':
        benchmarks = utils.sweep('benchmarks', [PICK_AND_PLACE_BENCHMARK])
        task_configs = [{
            'task_configs': {
                'variables_space': 'space_a',
                'fractional_reward_weight': 1,
                'dense_reward_weights': [750, 50, 250, 0, 0.005]
            }
        }]
    elif task == 'stacking2':
        benchmarks = utils.sweep('benchmarks', [STACKING2_BENCHMARK])
        task_configs = [{
            'task_configs': {
                'variables_space': 'space_a',
                'fractional_reward_weight': 1,
                'dense_reward_weights': [750, 250, 250, 125, 0.005]
            }
        }]
    else:
        benchmarks = utils.sweep('benchmarks', [PUSHING_BENCHMARK])
        task_configs = [{
            'task_configs': {
                'variables_space': 'space_a',
                'fractional_reward_weight': 1,
                'dense_reward_weights': [750, 250, 0]
            }
        }]

    world_params = [{
        'world_params': {
            'skip_frame': 3,
            'enable_visualization': False,
            'observation_mode': 'structured',
            'normalize_observations': True,
            'action_mode': 'joint_positions'
        }
    }]

    net_layers = utils.sweep('NET_LAYERS', [[256, 256]])
    world_seed = utils.sweep('world_seed', [0])
    NUM_RANDOM_SEEDS = 5
    random_seeds = utils.sweep('seed', list(range(NUM_RANDOM_SEEDS)))

    ppo = {
        'num_of_envs': 20,
        'algorithm': 'PPO',
        'validate_every_timesteps': int(2000000),
        'total_time_steps': int(100000000),
        'train_configs': {
            "gamma": 0.99,
            "n_steps": int(120000 / 20),
            "ent_coef": 0.01,
            "learning_rate": 0.00025,
            "vf_coef": 0.5,
            "max_grad_norm": 0.5,
            "nminibatches": 40,
            "noptepochs": 4
        }
    }

    sac = {
        'num_of_envs': 1,
        'algorithm': 'SAC',
        'validate_every_timesteps': int(500000),
        'total_time_steps': int(10000000),
        'train_configs': {
            "gamma": 0.95,
            "tau": 1e-3,
            "ent_coef": 1e-3,
            "target_entropy": 'auto',
            "learning_rate": 1e-4,
            "buffer_size": 1000000,
            "learning_starts": 1000,
            "batch_size": 256
        }
    }

    td3 = {
        'num_of_envs': 1,
        'algorithm': 'TD3',
        'validate_every_timesteps': int(500000),
        'total_time_steps': int(10000000),
        'train_configs': {
            "gamma": 0.96,
            "tau": 0.02,
            "learning_rate": 1e-4,
            "buffer_size": 500000,
            "learning_starts": 1000,
            "batch_size": 128
        }
    }

    algorithms = [ppo, sac, td3]

    curriculum_kwargs_1 = {'intervention_actors': [], 'actives': []}
    curriculum_kwargs_2 = {
        'intervention_actors': [GoalInterventionActorPolicy()],
        'actives': [(0, 1e9, 1, 0)]
    }
    curriculum_kwargs_3 = {
        'intervention_actors': [RandomInterventionActorPolicy()],
        'actives': [(0, 1e9, 1, 0)]
    }
    curriculum_kwargs = [
        curriculum_kwargs_1, curriculum_kwargs_2, curriculum_kwargs_3
    ]

    return utils.outer_product([
        benchmarks, world_params, task_configs, algorithms, curriculum_kwargs,
        random_seeds, world_seed, net_layers
    ])[model_num]
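Assuming utils.sweep wraps each value list under its name and utils.outer_product expands the full cartesian product, the pushing grid contains 1 benchmark x 1 world_params x 1 task_config x 3 algorithms x 3 curricula x 5 seeds x 1 world_seed x 1 net_layers = 45 configurations, so model_num indexes one of 45 runs. A back-of-the-envelope check under that assumption:

# Size of the sweep under the cartesian-product assumption described above.
grid_sizes = dict(benchmarks=1, world_params=1, task_configs=1,
                  algorithms=3, curriculum_kwargs=3,
                  random_seeds=5, world_seed=1, net_layers=1)
total_models = 1
for size in grid_sizes.values():
    total_models *= size
print(total_models)  # 45 -> valid model_num values are 0..44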