コード例 #1
0
    def __init__(
        self,
        env,
        qf,
        vf,
        sac_kwargs,
        tdm_kwargs,
        base_kwargs,
        policy=None,
        replay_buffer=None,
        give_terminal_reward=False,
    ):
        SoftActorCritic.__init__(self,
                                 env=env,
                                 policy=policy,
                                 qf=qf,
                                 vf=vf,
                                 replay_buffer=replay_buffer,
                                 **sac_kwargs,
                                 **base_kwargs)
        TemporalDifferenceModel.__init__(self, **tdm_kwargs)
        action_space_diff = (self.env.action_space.high -
                             self.env.action_space.low)

        # TODO(vitchyr): Maybe add this to the main SAC code.
        terminal_reward = 0
        for dim in range(action_space_diff.size):
            terminal_reward += (-np.log(1. / action_space_diff[dim]))
        self.terminal_bonus = float(terminal_reward)
        self.give_terminal_reward = give_terminal_reward
コード例 #2
0
 def __init__(self,
              *args,
              observation_key=None,
              desired_goal_key=None,
              **kwargs):
     HER.__init__(
         self,
         observation_key=observation_key,
         desired_goal_key=desired_goal_key,
     )
     SoftActorCritic.__init__(self, *args, **kwargs)
     assert isinstance(self.replay_buffer, ObsDictRelabelingBuffer)
コード例 #3
0
def experiment(variant):
    env_params = variant['env_params']
    env = SawyerXYZReachingEnv(**env_params)
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(env=env,
                                policy=policy,
                                qf=qf,
                                vf=vf,
                                **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
コード例 #4
0
ファイル: sac.py プロジェクト: Asap7772/rail-rl-franka-eval
def experiment(variant):
    env = NormalizedBoxEnv(gym.make('HalfCheetah-v2'))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(env=env,
                                policy=policy,
                                qf=qf,
                                vf=vf,
                                **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
コード例 #5
0
def experiment(variant):
    env = NormalizedBoxEnv(variant['env_class'](**variant['env_kwargs']))
    if variant['multitask']:
        env = MultitaskToFlatEnv(env)

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(env=env,
                                policy=policy,
                                qf=qf,
                                vf=vf,
                                **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()
コード例 #6
0
def experiment(variant):
    env = NormalizedBoxEnv(
        MultiGoalEnv(
            actuation_cost_coeff=10,
            distance_cost_coeff=1,
            goal_reward=10,
        ))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    qf = FlattenMlp(
        hidden_sizes=[100, 100],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[100, 100],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[100, 100],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(env=env,
                                policy=policy,
                                qf=qf,
                                vf=vf,
                                **variant['algo_params'])
    algorithm.to(ptu.device)
    with torch.autograd.profiler.profile() as prof:
        algorithm.train()
    prof.export_chrome_trace("tmp-torch-chrome-trace.prof")
コード例 #7
0
def experiment(variant):
    # env = normalize(GymEnv(
    #     'HalfCheetah-v1',
    #     force_reset=True,
    #     record_video=False,
    #     record_log=False,
    # ))
    env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(env=env,
                                policy=policy,
                                qf=qf,
                                vf=vf,
                                **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()
コード例 #8
0
def experiment(variant):
    env = NormalizedBoxEnv(variant['env_class']())
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size

    variant['algo_kwargs'] = dict(
        num_epochs=variant['num_epochs'],
        num_steps_per_epoch=variant['num_steps_per_epoch'],
        num_steps_per_eval=variant['num_steps_per_eval'],
        max_path_length=variant['max_path_length'],
        min_num_steps_before_training=variant['min_num_steps_before_training'],
        batch_size=variant['batch_size'],
        discount=variant['discount'],
        replay_buffer_size=variant['replay_buffer_size'],
        soft_target_tau=variant['soft_target_tau'],
        target_update_period=variant['target_update_period'],
        train_policy_with_reparameterization=variant[
            'train_policy_with_reparameterization'],
        policy_lr=variant['policy_lr'],
        qf_lr=variant['qf_lr'],
        vf_lr=variant['vf_lr'],
        reward_scale=variant['reward_scale'],
        use_automatic_entropy_tuning=variant.get(
            'use_automatic_entropy_tuning', False))

    M = variant['layer_size']
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
        # **variant['qf_kwargs']
    )
    vf = FlattenMlp(
        input_size=obs_dim,
        output_size=1,
        hidden_sizes=[M, M],
        # **variant['vf_kwargs']
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
        # **variant['policy_kwargs']
    )
    algorithm = SoftActorCritic(env,
                                policy=policy,
                                qf=qf,
                                vf=vf,
                                **variant['algo_kwargs'])
    if ptu.gpu_enabled():
        qf.cuda()
        vf.cuda()
        policy.cuda()
        algorithm.cuda()
    algorithm.train()
コード例 #9
0
def experiment(variant):
    imsize = variant['imsize']
    history = variant['history']

    env = Pusher2DEnv()#gym.make(variant['env_id']).env
    env = NormalizedBoxEnv(ImageMujocoEnv(env,
                                    imsize=imsize,
                                    keep_prev=history - 1,
                                    init_camera=variant['init_camera']))
#    es = GaussianStrategy(
#        action_space=env.action_space,
#    )
    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size

    qf = MergedCNN(input_width=imsize,
                   input_height=imsize,
                   output_size=1,
                   input_channels= history,
                   added_fc_input_size=action_dim,
                   **variant['cnn_params'])

    vf  = CNN(input_width=imsize,
               input_height=imsize,
               output_size=1,
               input_channels=history,
               **variant['cnn_params'])

    policy = TanhCNNGaussianPolicy(input_width=imsize,
                                   input_height=imsize,
                                   output_size=action_dim,
                                   input_channels=history,
                                   **variant['cnn_params'],
    )


    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )

    algorithm.to(ptu.device)
    algorithm.train()
コード例 #10
0
def experiment(variant):
    env = NormalizedBoxEnv(
        MultiGoalEnv(
            actuation_cost_coeff=10,
            distance_cost_coeff=1,
            goal_reward=10,
        ))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    qf = FlattenMlp(
        hidden_sizes=[100, 100],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[100, 100],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[100, 100],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    plotter = QFPolicyPlotter(qf=qf,
                              policy=policy,
                              obs_lst=np.array([[-2.5, 0.0], [0.0, 0.0],
                                                [2.5, 2.5]]),
                              default_action=[np.nan, np.nan],
                              n_samples=100)
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        # plotter=plotter,
        # render_eval_paths=True,
        **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()
コード例 #11
0
def experiment(variant):
    env = variant['env_class']()
    env = NormalizedBoxEnv(env)

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    qf = FlattenMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **variant['qf_kwargs'])
    vf = FlattenMlp(input_size=obs_dim, output_size=1, **variant['vf_kwargs'])
    policy = TanhGaussianPolicy(obs_dim=obs_dim,
                                action_dim=action_dim,
                                **variant['policy_kwargs'])
    algorithm = SoftActorCritic(env=env,
                                policy=policy,
                                qf=qf,
                                vf=vf,
                                **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
コード例 #12
0
ファイル: sac.py プロジェクト: Asap7772/rail-rl-franka-eval
def experiment(variant):
    env = SawyerXYZEnv(**variant['env_kwargs'])
    env = MultitaskToFlatEnv(env)
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **variant['qf_kwargs'])
    vf = FlattenMlp(input_size=obs_dim, output_size=1, **variant['vf_kwargs'])
    policy = TanhGaussianPolicy(obs_dim=obs_dim,
                                action_dim=action_dim,
                                **variant['policy_kwargs'])
    algorithm = SoftActorCritic(env=env,
                                policy=policy,
                                qf=qf,
                                vf=vf,
                                **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
コード例 #13
0
 def evaluate(self, epoch):
     SoftActorCritic.evaluate(self, epoch)