# NOTE: the snippets below are stand-alone `experiment(variant)` functions from
# HER/TD3 training scripts built on rlkit, gym, and multiworld. They rely on
# imports along the following lines (a best-guess reconstruction; exact module
# paths differ between rlkit versions, and FlattenCNN, TanhCNNPolicy,
# TransformObservationWrapper, imgObsDictRelabelingBuffer, VisualRandomizationConfig,
# and the `envs` registry are project-specific helpers from the original repos):
#
#   import gym
#   from gym import wrappers                      # FlattenDictWrapper (older gym releases)
#   from torch import nn
#   import rlkit.torch.pytorch_util as ptu
#   from rlkit.envs.wrappers import NormalizedBoxEnv
#   from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
#   from rlkit.exploration_strategies.gaussian_and_epislon import GaussianAndEpislonStrategy
#   from rlkit.exploration_strategies.gaussian_strategy import GaussianStrategy
#   from rlkit.torch.networks import FlattenMlp, TanhMlpPolicy
#   from rlkit.torch.td3.td3 import TD3Trainer
#   from rlkit.torch.her.her import HERTrainer    # HerTd3 comes from the same area in older rlkit
#   from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
#   from rlkit.data_management.obs_dict_replay_buffer import ObsDictRelabelingBuffer
#   from rlkit.data_management.env_replay_buffer import EnvReplayBuffer
#   from rlkit.samplers.data_collector import GoalConditionedPathCollector, MdpPathCollector
#
# `GaussianAndEpislonStrategy` is spelled exactly as the class is named in rlkit.
def experiment(variant):
    env = gym.make('SawyerReachXYZEnv-v0')
    es = GaussianAndEpislonStrategy(
        action_space=env.action_space,
        max_sigma=.2,
        min_sigma=.2,  # constant sigma
        epsilon=.3,
    )
    obs_dim = env.observation_space.spaces['observation'].low.size
    goal_dim = env.observation_space.spaces['desired_goal'].low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + goal_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + goal_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = ObsDictRelabelingBuffer(
        env=env,
        achieved_goal_key='state_achieved_goal',
        desired_goal_key='state_desired_goal',
        **variant['replay_buffer_kwargs']
    )
    algorithm = HerTd3(
        replay_buffer=replay_buffer,
        her_kwargs=dict(
            observation_key='observation',
            desired_goal_key='desired_goal'
        ),
        td3_kwargs=dict(
            env=env,
            qf1=qf1,
            qf2=qf2,
            policy=policy,
            exploration_policy=exploration_policy
        ),
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
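
# Usage sketch (not from the original source): the function above reads only
# variant['replay_buffer_kwargs'] and variant['algo_kwargs']. The nested key
# names below are assumptions based on rlkit's ObsDictRelabelingBuffer and
# HerTd3 defaults and may differ between versions; 'SawyerReachXYZEnv-v0' must
# also be registered (e.g. via multiworld.register_all_envs()) before
# gym.make() can find it.
example_variant = dict(
    replay_buffer_kwargs=dict(
        max_size=int(1e6),                  # buffer capacity (assumed key name)
        fraction_goals_rollout_goals=0.2,   # HER relabeling fractions (assumed key names)
        fraction_goals_env_goals=0.0,
    ),
    algo_kwargs=dict(
        num_epochs=100,
        max_path_length=50,
        batch_size=128,
    ),
)
# experiment(example_variant)  # would start training
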
def experiment(variant):
    env = gym.make('replab-v0')._start_rospy(goal_oriented=True)  # real robot via ROS
    # Simulation alternative:
    # env = gym.make('replab-v0')._start_sim(goal_oriented=True, render=False)
    env = NormalizedBoxEnv(env)
    es = GaussianAndEpislonStrategy(
        action_space=env.action_space,
        max_sigma=.2,
        min_sigma=.2,  # constant sigma
        epsilon=.3,
    )
    obs_dim = env.observation_space.spaces['observation'].low.size
    goal_dim = env.observation_space.spaces['desired_goal'].low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + goal_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + goal_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = ObsDictRelabelingBuffer(env=env,
                                            **variant['replay_buffer_kwargs'])
    algorithm = HerTd3(her_kwargs=dict(observation_key='observation',
                                       desired_goal_key='desired_goal'),
                       td3_kwargs=dict(env=env,
                                       qf1=qf1,
                                       qf2=qf2,
                                       policy=policy,
                                       exploration_policy=exploration_policy),
                       replay_buffer=replay_buffer,
                       **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    import multiworld
    multiworld.register_all_envs()
    eval_env = gym.make('SawyerPushXYZEnv-v0')
    expl_env = gym.make('SawyerPushXYZEnv-v0')
    observation_key = 'state_observation'
    desired_goal_key = 'state_desired_goal'
    achieved_goal_key = desired_goal_key.replace("desired", "achieved")
    es = GaussianAndEpislonStrategy(
        action_space=expl_env.action_space,
        max_sigma=.2,
        min_sigma=.2,  # constant sigma
        epsilon=.3,
    )
    obs_dim = expl_env.observation_space.spaces['observation'].low.size
    goal_dim = expl_env.observation_space.spaces['desired_goal'].low.size
    action_dim = expl_env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + goal_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + goal_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + goal_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + goal_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    target_policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    expl_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = ObsDictRelabelingBuffer(
        env=eval_env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        **variant['replay_buffer_kwargs']
    )
    trainer = TD3Trainer(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        target_policy=target_policy,
        **variant['trainer_kwargs']
    )
    trainer = HERTrainer(trainer)
    eval_path_collector = GoalConditionedPathCollector(
        eval_env,
        policy,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
    )
    expl_path_collector = GoalConditionedPathCollector(
        expl_env,
        expl_policy,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
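
# Usage sketch (assumption, not from the original source): this script uses the
# newer rlkit API (TD3Trainer + HERTrainer + TorchBatchRLAlgorithm), so the
# variant carries separate kwargs for the networks, buffer, trainer, and
# algorithm. All values are illustrative.
example_variant = dict(
    qf_kwargs=dict(hidden_sizes=[400, 300]),
    policy_kwargs=dict(hidden_sizes=[400, 300]),
    replay_buffer_kwargs=dict(
        max_size=int(1e6),
        fraction_goals_rollout_goals=0.2,   # assumed key names, as above
        fraction_goals_env_goals=0.0,
    ),
    trainer_kwargs=dict(discount=0.99),
    algo_kwargs=dict(
        num_epochs=300,
        batch_size=256,
        max_path_length=50,
        num_eval_steps_per_epoch=1000,
        num_expl_steps_per_train_loop=1000,
        num_trains_per_train_loop=1000,
        min_num_steps_before_training=1000,
    ),
)
# experiment(example_variant)
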
def experiment(variant):
    img_size = 64
    train_top10 = VisualRandomizationConfig(
        image_directory='./experiment_textures/train/top10',
        whitelist=[
            'Floor', 'Roof', 'Wall1', 'Wall2', 'Wall3', 'Wall4',
            'diningTable_visible'
        ],
        apply_arm=False,
        apply_gripper=False,
        apply_floor=True)
    expl_env = gym.make('reach_target_easy-vision-v0',
                        sparse=False,
                        img_size=img_size,
                        force_randomly_place=True,
                        force_change_position=False,
                        blank=True)
    expl_env = wrappers.FlattenDictWrapper(expl_env, dict_keys=['observation'])
    t_fn = variant["t_fn"]
    expl_env = TransformObservationWrapper(expl_env, t_fn)
    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size
    conv_args = {
        "input_width": 64,
        "input_height": 64,
        "input_channels": 3,
        "kernel_sizes": [4, 4, 3],
        "n_channels": [32, 64, 64],
        "strides": [2, 1, 1],
        "paddings": [0, 0, 0],
        "hidden_sizes": [1024, 512],
        "batch_norm_conv": False,
        "batch_norm_fc": False,
        'init_w': 1e-4,
        "hidden_init": nn.init.orthogonal_,
        "hidden_activation": nn.ReLU(),
    }

    qf1 = FlattenCNN(output_size=1,
                     added_fc_input_size=action_dim,
                     **variant['qf_kwargs'],
                     **conv_args)
    qf2 = FlattenCNN(output_size=1,
                     added_fc_input_size=action_dim,
                     **variant['qf_kwargs'],
                     **conv_args)
    target_qf1 = FlattenCNN(output_size=1,
                            added_fc_input_size=action_dim,
                            **variant['qf_kwargs'],
                            **conv_args)
    target_qf2 = FlattenCNN(output_size=1,
                            added_fc_input_size=action_dim,
                            **variant['qf_kwargs'],
                            **conv_args)
    policy = TanhCNNPolicy(output_size=action_dim,
                           **variant['policy_kwargs'],
                           **conv_args)
    target_policy = TanhCNNPolicy(output_size=action_dim,
                                  **variant['policy_kwargs'],
                                  **conv_args)
    # es = GaussianStrategy(
    #     action_space=expl_env.action_space,
    #     max_sigma=0.3,
    #     min_sigma=0.1,  # Constant sigma
    # )

    es = GaussianAndEpislonStrategy(
        action_space=expl_env.action_space,
        epsilon=0.3,
        max_sigma=0.0,
        min_sigma=0.0,  # constant sigma (0)
        decay_period=1000000)

    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )

    expl_path_collector = MdpPathCollector(
        expl_env,
        exploration_policy,
    )
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = TD3Trainer(policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         target_policy=target_policy,
                         **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=None,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=None,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
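
# Usage sketch (assumption, not from the original source): this vision-based
# variant needs a transform function under variant['t_fn'] for the
# project-specific TransformObservationWrapper (its exact signature is not
# shown here, so an identity placeholder is used), plus network, buffer-size,
# trainer, and algorithm kwargs. Evaluation is disabled in the function above
# (evaluation_data_collector=None).
example_variant = dict(
    t_fn=lambda obs: obs,        # placeholder; the real transform prepares the 64x64x3 image
    qf_kwargs=dict(),
    policy_kwargs=dict(),
    replay_buffer_size=int(1e5),
    trainer_kwargs=dict(discount=0.99),
    algorithm_kwargs=dict(
        num_epochs=300,
        batch_size=256,
        max_path_length=50,
        num_eval_steps_per_epoch=0,
        num_expl_steps_per_train_loop=1000,
        num_trains_per_train_loop=1000,
        min_num_steps_before_training=1000,
    ),
)
# experiment(example_variant)
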
def experiment(variant):
    # `envs` is assumed to be a module-level registry mapping environment names
    # to constructors that take the domain-randomization flag variant['dr'].
    expl_env = envs[variant['env']](variant['dr'])
    expl_env = wrappers.FlattenDictWrapper(expl_env, dict_keys=['observation'])
    t_fn = variant["t_fn"]
    expl_env = TransformObservationWrapper(expl_env, t_fn)
    action_dim = expl_env.action_space.low.size
    conv_args = {
        "input_width": 16,
        "input_height": 16,
        "input_channels": 8,
        "kernel_sizes": [4],
        "n_channels": [32],
        "strides": [4],
        "paddings": [0],
        "hidden_sizes": [1024, 512],
        "batch_norm_conv": False,
        "batch_norm_fc": False,
        'init_w': 1e-4,
        "hidden_init": nn.init.orthogonal_,
        "hidden_activation": nn.ReLU(),
    }

    qf1 = FlattenCNN(output_size=1,
                     added_fc_input_size=action_dim,
                     **variant['qf_kwargs'],
                     **conv_args)
    qf2 = FlattenCNN(output_size=1,
                     added_fc_input_size=action_dim,
                     **variant['qf_kwargs'],
                     **conv_args)
    target_qf1 = FlattenCNN(output_size=1,
                            added_fc_input_size=action_dim,
                            **variant['qf_kwargs'],
                            **conv_args)
    target_qf2 = FlattenCNN(output_size=1,
                            added_fc_input_size=action_dim,
                            **variant['qf_kwargs'],
                            **conv_args)
    policy = TanhCNNPolicy(output_size=action_dim,
                           **variant['policy_kwargs'],
                           **conv_args)
    target_policy = TanhCNNPolicy(output_size=action_dim,
                                  **variant['policy_kwargs'],
                                  **conv_args)
    if variant['noise'] == "eps":
        es = GaussianAndEpislonStrategy(
            action_space=expl_env.action_space,
            epsilon=0.3,
            max_sigma=0.0,
            min_sigma=0.0,  # constant sigma (0)
            decay_period=1000000)
    elif variant['noise'] == "gaussian":
        es = GaussianStrategy(action_space=expl_env.action_space,
                              max_sigma=0.3,
                              min_sigma=0.1,
                              decay_period=1000000)
    else:
        raise ValueError(
            "unsupported value for --noise: {!r}".format(variant['noise']))
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )

    expl_path_collector = MdpPathCollector(
        expl_env,
        exploration_policy,
    )
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = TD3Trainer(policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         target_policy=target_policy,
                         **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=None,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=None,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
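
# Usage sketch (assumption): same keys as the sketch after the previous example,
# plus environment selection through the module-level `envs` registry and the
# exploration-noise switch handled above. The env name 'reach' is hypothetical;
# it must match a key of `envs`.
example_variant = dict(
    example_variant,        # reuse the keys from the previous sketch
    env='reach',
    dr=True,                # domain-randomization flag passed to the env constructor
    noise='gaussian',       # 'eps' -> epsilon-greedy with zero sigma, 'gaussian' -> decaying sigma
)
# experiment(example_variant)
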
def experiment(variant):
    expl_env = envs[variant['env']](variant['dr'])
    expl_env = TransformObservationWrapper(expl_env, variant['main_t_fn'])

    observation_key = 'observation'
    desired_goal_key = 'desired_goal'
    achieved_goal_key = "achieved_goal"
    # `rerendering_env` is not defined in the original snippet; it is assumed to
    # be a second instance of the same environment, used by the buffer to
    # re-render observations for relabeled goals.
    rerendering_env = envs[variant['env']](variant['dr'])
    replay_buffer = imgObsDictRelabelingBuffer(
        env=expl_env,
        rerendering_env=rerendering_env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        t_fn=variant['t_fn'],
        **variant['replay_buffer_kwargs'])
    # obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size
    if variant['mlf']:
        if variant['alt'] == 'both':
            conv_args = {
                "input_width": 16,
                "input_height": 16,
                "input_channels": 16,
                "kernel_sizes": [4],
                "n_channels": [32],
                "strides": [4],
                "paddings": [0],
                "hidden_sizes": [1024, 512],
                # "added_fc_input_size": action_dim,
                "batch_norm_conv": False,
                "batch_norm_fc": False,
                'init_w': 1e-4,
                "hidden_init": nn.init.orthogonal_,
                "hidden_activation": nn.ReLU(),
            }
        else:
            conv_args = {
                "input_width": 16,
                "input_height": 16,
                "input_channels": 8,
                "kernel_sizes": [4],
                "n_channels": [32],
                "strides": [4],
                "paddings": [0],
                "hidden_sizes": [1024, 512],
                "batch_norm_conv": False,
                "batch_norm_fc": False,
                'init_w': 1e-4,
                "hidden_init": nn.init.orthogonal_,
                "hidden_activation": nn.ReLU(),
            }
    else:
        if variant['alt'] == 'both':
            conv_args = {
                "input_width": 64,
                "input_height": 64,
                "input_channels": 6,
                "kernel_sizes": [4, 4, 3],
                "n_channels": [32, 64, 64],
                "strides": [2, 1, 1],
                "paddings": [0, 0, 0],
                "hidden_sizes": [1024, 512],
                "batch_norm_conv": False,
                "batch_norm_fc": False,
                'init_w': 1e-4,
                "hidden_init": nn.init.orthogonal_,
                "hidden_activation": nn.ReLU(),
            }
        else:
            conv_args = {
                "input_width": 64,
                "input_height": 64,
                "input_channels": 3,
                "kernel_sizes": [4, 4, 3],
                "n_channels": [32, 64, 64],
                "strides": [2, 1, 1],
                "paddings": [0, 0, 0],
                "hidden_sizes": [1024, 512],
                "batch_norm_conv": False,
                "batch_norm_fc": False,
                'init_w': 1e-4,
                "hidden_init": nn.init.orthogonal_,
                "hidden_activation": nn.ReLU(),
            }

    qf1 = FlattenCNN(output_size=1,
                     added_fc_input_size=action_dim,
                     **variant['qf_kwargs'],
                     **conv_args)
    qf2 = FlattenCNN(output_size=1,
                     added_fc_input_size=action_dim,
                     **variant['qf_kwargs'],
                     **conv_args)
    target_qf1 = FlattenCNN(output_size=1,
                            added_fc_input_size=action_dim,
                            **variant['qf_kwargs'],
                            **conv_args)
    target_qf2 = FlattenCNN(output_size=1,
                            added_fc_input_size=action_dim,
                            **variant['qf_kwargs'],
                            **conv_args)
    policy = TanhCNNPolicy(output_size=action_dim,
                           **variant['policy_kwargs'],
                           **conv_args)
    target_policy = TanhCNNPolicy(output_size=action_dim,
                                  **variant['policy_kwargs'],
                                  **conv_args)
    if variant['noise'] == "eps":
        es = GaussianAndEpislonStrategy(
            action_space=expl_env.action_space,
            epsilon=0.3,
            max_sigma=0.0,
            min_sigma=0.0,  # constant sigma (0)
            decay_period=1000000)
    elif variant['noise'] == "gaussian":
        es = GaussianStrategy(action_space=expl_env.action_space,
                              max_sigma=0.3,
                              min_sigma=0.1,
                              decay_period=1000000)
    else:
        raise ValueError(
            "unsupported value for --noise: {!r}".format(variant['noise']))

    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    # eval_path_collector = MdpPathCollector(
    #     eval_env,
    #     policy,
    # )
    expl_path_collector = GoalConditionedPathCollector(
        expl_env,
        exploration_policy,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
    )
    # NOTE: this eval collector is constructed but never passed to the algorithm
    # below (evaluation_data_collector=None), so it has no effect as written.
    eval_path_collector = GoalConditionedPathCollector(
        expl_env,
        exploration_policy.policy,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
    )

    trainer = TD3Trainer(policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         target_policy=target_policy,
                         **variant['trainer_kwargs'])
    # trainer = HERTrainer(trainer)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=None,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=None,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
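
# Usage sketch (assumption, not from the original source): the goal-conditioned
# image variant above additionally switches the CNN input shape via
# variant['mlf'] (True -> 16x16 multi-channel input, False -> raw 64x64 images)
# and variant['alt'] ('both' doubles the input channels, presumably observation
# and goal stacked channel-wise), and uses 'main_t_fn' for online observations
# while 't_fn' goes to the relabeling buffer. All values are illustrative
# placeholders; the imgObsDictRelabelingBuffer kwargs are assumed.
example_variant = dict(
    env='reach',                 # key of the `envs` registry (hypothetical name)
    dr=True,
    mlf=False,                   # raw 64x64 image input
    alt='both',
    noise='gaussian',
    main_t_fn=lambda obs: obs,   # placeholder transforms
    t_fn=lambda obs: obs,
    replay_buffer_kwargs=dict(max_size=int(1e5)),
    qf_kwargs=dict(),
    policy_kwargs=dict(),
    trainer_kwargs=dict(discount=0.99),
    algorithm_kwargs=dict(
        num_epochs=300,
        batch_size=256,
        max_path_length=50,
        num_eval_steps_per_epoch=0,
        num_expl_steps_per_train_loop=1000,
        num_trains_per_train_loop=1000,
        min_num_steps_before_training=1000,
    ),
)
# experiment(example_variant)
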