Example #1
    def __init__(
        self,
        n_dim_obs,
        n_dim_action,
        n_hidden_channels,
        n_hidden_layers,
        nonlinearity=F.relu,
        last_wscale=1.0,
    ):
        assert n_hidden_layers >= 1
        self.n_input_channels = n_dim_obs + n_dim_action
        self.n_hidden_layers = n_hidden_layers
        self.n_hidden_channels = n_hidden_channels
        self.nonlinearity = nonlinearity

        super().__init__()
        # No need to pass nonlinearity to obs_mlp because it has no
        # hidden layers
        self.obs_mlp = MLP(in_size=n_dim_obs,
                           out_size=n_hidden_channels,
                           hidden_sizes=[])
        self.mlp = MLP(
            in_size=n_hidden_channels + n_dim_action,
            out_size=1,
            hidden_sizes=([self.n_hidden_channels] *
                          (self.n_hidden_layers - 1)),
            nonlinearity=nonlinearity,
            last_wscale=last_wscale,
        )

        self.output = self.mlp.output
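The obs_mlp / mlp split above is a "late action" architecture: the observation is embedded first, and the action is concatenated only in front of the remaining layers. A plausible forward pass for this layout (a sketch, not necessarily the library's exact code):

    def forward(self, obs, action):
        # Embed the observation, then append the action before the final MLP.
        h = self.nonlinearity(self.obs_mlp(obs))
        h = torch.cat((h, action), dim=1)
        return self.mlp(h)
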
Example #2

    def __init__(self,
                 n_actions,
                 n_input_channels=4,
                 activation=F.relu,
                 bias=0.1,
                 reward_boundaries=None,
                 reward_channel_scale=1.):
        self.n_actions = n_actions
        self.n_input_channels = n_input_channels
        self.activation = activation
        self.boundaries = torch.from_numpy(
            np.array(reward_boundaries)) * reward_channel_scale - 1e-8

        super().__init__()
        self.conv_layers = nn.ModuleList([
            nn.Conv2d(n_input_channels, 32, 8, stride=4),
            nn.Conv2d(32, 64, 4, stride=2),
            nn.Conv2d(64, 64, 3, stride=1),
        ])

        # Modified from 3136 -> 1024
        self.a_streams = nn.ModuleList([
            MLP(1024, n_actions, [512])
            for _ in range(len(self.boundaries) + 1)
        ])
        self.v_streams = nn.ModuleList(
            [MLP(1024, 1, [512]) for _ in range(len(self.boundaries) + 1)])

        self.conv_layers.apply(init_chainer_default)  # MLP already applies this initialization internally
        self.conv_layers.apply(constant_bias_initializer(bias=bias))
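The number of streams above is driven by reward_boundaries: the module builds len(self.boundaries) + 1 advantage/value stream pairs, one per reward channel. A small numeric illustration (the boundary values are hypothetical, not from the source):

import numpy as np
import torch

reward_boundaries = [0.5, 5.0]      # hypothetical boundaries between reward channels
reward_channel_scale = 1.0
boundaries = torch.from_numpy(np.array(reward_boundaries)) * reward_channel_scale - 1e-8
n_channels = len(boundaries) + 1    # 3 -> three a_streams and three v_streams above
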
Example #3
    def __init__(
        self,
        n_dim_obs,
        n_dim_action,
        n_hidden_channels,
        n_hidden_layers,
        nonlinearity=F.relu,
        last_wscale=1.0,
    ):
        raise NotImplementedError()  # not implemented yet; the code below this raise never executes
        self.n_input_channels = n_dim_obs + n_dim_action
        self.n_hidden_layers = n_hidden_layers
        self.n_hidden_channels = n_hidden_channels
        self.nonlinearity = nonlinearity
        super().__init__()
        self.fc = MLP(
            self.n_input_channels,
            n_hidden_channels,
            [self.n_hidden_channels] * self.n_hidden_layers,
            nonlinearity=nonlinearity,
        )
        self.lstm = nn.LSTM(num_layers=1,
                            input_size=n_hidden_channels,
                            hidden_size=n_hidden_channels)
        self.out = nn.Linear(n_hidden_channels, 1)
        for (n, p) in self.lstm.named_parameters():
            if "weight" in n:
                init_lecun_normal(p)
            else:
                nn.init.zeros_(p)

        init_lecun_normal(self.out.weight, scale=last_wscale)
        nn.init.zeros_(self.out.bias)
Example #4
    def __init__(
        self,
        ndim_obs,
        n_actions,
        n_atoms,
        v_min,
        v_max,
        n_hidden_channels,
        n_hidden_layers,
        nonlinearity=F.relu,
        last_wscale=1.0,
    ):
        assert n_atoms >= 2
        assert v_min < v_max
        z_values = np.linspace(v_min, v_max, num=n_atoms, dtype=np.float32)
        model = nn.Sequential(
            MLP(
                in_size=ndim_obs,
                out_size=n_actions * n_atoms,
                hidden_sizes=[n_hidden_channels] * n_hidden_layers,
                nonlinearity=nonlinearity,
                last_wscale=last_wscale,
            ),
            Lambda(lambda x: torch.reshape(x, (-1, n_actions, n_atoms))),
            nn.Softmax(dim=2),
        )
        super().__init__(model=model, z_values=z_values)
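The z_values computed above form the fixed support of a categorical (C51-style) return distribution: the MLP emits n_actions * n_atoms values, which the Lambda reshapes to (batch, n_actions, n_atoms) and the Softmax turns into per-action probability vectors over the atoms. A quick check with hypothetical values:

import numpy as np

v_min, v_max, n_atoms = -10.0, 10.0, 51   # hypothetical values
z = np.linspace(v_min, v_max, num=n_atoms, dtype=np.float32)
# z[0] == -10.0, z[-1] == 10.0, atom spacing == (v_max - v_min) / (n_atoms - 1) == 0.4
# The expected Q-value of an action is the probability-weighted sum over z.
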
Example #5
    def __init__(self, n_actions, n_input_channels=4, activation=F.relu, bias=0.1):
        self.n_actions = n_actions
        self.n_input_channels = n_input_channels
        self.activation = activation

        super().__init__()
        self.conv_layers = nn.ModuleList(
            [
                nn.Conv2d(n_input_channels, 32, 8, stride=4),
                nn.Conv2d(32, 64, 4, stride=2),
                nn.Conv2d(64, 64, 3, stride=1),
            ]
        )

        self.a_stream = MLP(3136, n_actions, [512])
        self.v_stream = MLP(3136, 1, [512])

        self.conv_layers.apply(init_chainer_default)  # MLP already applies this initialization internally
        self.conv_layers.apply(constant_bias_initializer(bias=bias))
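The 3136 input size of both streams is the flattened conv output, assuming the standard 84x84 Atari frames: the first conv gives (84 - 8) / 4 + 1 = 20, the second (20 - 4) / 2 + 1 = 9, the third 9 - 3 + 1 = 7, so the feature map is 64 * 7 * 7 = 3136.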
Example #6
    def __init__(
        self,
        ndim_obs,
        n_actions,
        n_hidden_channels,
        n_hidden_layers,
        nonlinearity=F.relu,
        last_wscale=1.0,
    ):
        super().__init__(model=MLP(
            in_size=ndim_obs,
            out_size=n_actions,
            hidden_sizes=[n_hidden_channels] * n_hidden_layers,
            nonlinearity=nonlinearity,
            last_wscale=last_wscale,
        ))
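For concreteness, with ndim_obs=4, n_actions=2, n_hidden_channels=64 and n_hidden_layers=2 (hypothetical CartPole-like values), the wrapped MLP is 4 -> 64 -> 64 -> 2, i.e. one Q-value per discrete action.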
Example #7
def _objective_core(
    # optuna parameters
    trial,
    # training parameters
    env_id,
    outdir,
    seed,
    monitor,
    gpu,
    steps,
    train_max_episode_len,
    eval_n_episodes,
    eval_interval,
    batch_size,
    # hyperparameters
    hyperparams,
):
    # Set a random seed used in PFRL
    utils.set_random_seed(seed)

    # Set different random seeds for train and test envs.
    train_seed = seed
    test_seed = 2**31 - 1 - seed

    def make_env(test=False):
        env = gym.make(env_id)

        if not isinstance(env.observation_space, gym.spaces.Box):
            raise ValueError(
                "Supported only Box observation environments, but given: {}".format(
                    env.observation_space
                )
            )
        if len(env.observation_space.shape) != 1:
            raise ValueError(
                "Supported only observation spaces with ndim==1, but given: {}".format(
                    env.observation_space.shape
                )
            )
        if not isinstance(env.action_space, gym.spaces.Discrete):
            raise ValueError(
                "Supported only discrete action environments, but given: {}".format(
                    env.action_space
                )
            )

        env_seed = test_seed if test else train_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = pfrl.wrappers.CastObservationToFloat32(env)
        if monitor:
            env = pfrl.wrappers.Monitor(env, outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = pfrl.wrappers.ScaleReward(env, hyperparams["reward_scale_factor"])
        return env

    env = make_env(test=False)
    obs_space = env.observation_space
    obs_size = obs_space.low.size
    action_space = env.action_space
    n_actions = action_space.n

    # create model & q_function
    model = MLP(
        in_size=obs_size, out_size=n_actions, hidden_sizes=hyperparams["hidden_sizes"]
    )
    q_func = q_functions.SingleModelStateQFunctionWithDiscreteAction(model=model)

    # Use epsilon-greedy for exploration
    start_epsilon = 1
    explorer = explorers.LinearDecayEpsilonGreedy(
        start_epsilon=start_epsilon,
        end_epsilon=hyperparams["end_epsilon"],
        decay_steps=hyperparams["decay_steps"],
        random_action_func=action_space.sample,
    )

    opt = optim.Adam(
        q_func.parameters(), lr=hyperparams["lr"], eps=hyperparams["adam_eps"]
    )

    rbuf_capacity = steps
    rbuf = replay_buffers.ReplayBuffer(rbuf_capacity)

    agent = DQN(
        q_func,
        opt,
        rbuf,
        gpu=gpu,
        gamma=hyperparams["gamma"],
        explorer=explorer,
        replay_start_size=hyperparams["replay_start_size"],
        target_update_interval=hyperparams["target_update_interval"],
        update_interval=hyperparams["update_interval"],
        minibatch_size=batch_size,
    )

    eval_env = make_env(test=True)

    evaluation_hooks = [OptunaPrunerHook(trial=trial)]
    _, eval_stats_history = experiments.train_agent_with_evaluation(
        agent=agent,
        env=env,
        steps=steps,
        eval_n_steps=None,
        eval_n_episodes=eval_n_episodes,
        eval_interval=eval_interval,
        outdir=outdir,
        eval_env=eval_env,
        train_max_episode_len=train_max_episode_len,
        evaluation_hooks=evaluation_hooks,
    )

    score = _get_score_from_eval_stats_history(eval_stats_history)

    return score
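
An objective like this is normally wrapped so that Optuna only sees the trial argument and everything else is fixed. A minimal sketch of that wiring (the search space, environment id, and all concrete values below are hypothetical, not taken from the source):

import optuna

def objective(trial):
    # Hypothetical search space; _objective_core only requires that these keys exist.
    hyperparams = {
        "reward_scale_factor": trial.suggest_float("reward_scale_factor", 1e-5, 10.0, log=True),
        "hidden_sizes": [trial.suggest_int("n_units", 32, 256)],
        "end_epsilon": trial.suggest_float("end_epsilon", 0.0, 0.3),
        "decay_steps": trial.suggest_int("decay_steps", 1000, 100000),
        "lr": trial.suggest_float("lr", 1e-5, 1e-2, log=True),
        "adam_eps": trial.suggest_float("adam_eps", 1e-8, 1e-3, log=True),
        "gamma": trial.suggest_float("gamma", 0.9, 0.999),
        "replay_start_size": trial.suggest_int("replay_start_size", 500, 5000),
        "target_update_interval": trial.suggest_int("target_update_interval", 100, 10000),
        "update_interval": trial.suggest_int("update_interval", 1, 8),
    }
    return _objective_core(
        trial=trial,
        env_id="CartPole-v1",
        outdir="results",
        seed=0,
        monitor=False,
        gpu=-1,                      # -1 = CPU
        steps=50_000,
        train_max_episode_len=None,
        eval_n_episodes=10,
        eval_interval=5_000,
        batch_size=64,
        hyperparams=hyperparams,
    )

study = optuna.create_study(direction="maximize")  # the returned eval score is maximized
study.optimize(objective, n_trials=100)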