Example #1
    def make_model(self, env):
        n_hidden_channels = 20
        obs_size = env.observation_space.low.size

        if self.recurrent:
            v = StatelessRecurrentSequential(
                L.NStepLSTM(1, obs_size, n_hidden_channels, 0),
                L.Linear(
                    None, 1, initialW=chainer.initializers.LeCunNormal(1e-1)),
            )
            if self.discrete:
                n_actions = env.action_space.n
                pi = StatelessRecurrentSequential(
                    L.NStepLSTM(1, obs_size, n_hidden_channels, 0),
                    policies.FCSoftmaxPolicy(
                        n_hidden_channels, n_actions,
                        n_hidden_layers=0,
                        nonlinearity=F.tanh,
                        last_wscale=1e-1,
                    )
                )
            else:
                action_size = env.action_space.low.size
                pi = StatelessRecurrentSequential(
                    L.NStepLSTM(1, obs_size, n_hidden_channels, 0),
                    policies.FCGaussianPolicy(
                        n_hidden_channels, action_size,
                        n_hidden_layers=0,
                        nonlinearity=F.tanh,
                        mean_wscale=1e-1,
                    )
                )
            return StatelessRecurrentBranched(pi, v)
        else:
            v = chainer.Sequential(
                L.Linear(None, n_hidden_channels),
                F.tanh,
                L.Linear(
                    None, 1, initialW=chainer.initializers.LeCunNormal(1e-1)),
            )
            if self.discrete:
                n_actions = env.action_space.n
                pi = policies.FCSoftmaxPolicy(
                    obs_size, n_actions,
                    n_hidden_layers=1,
                    n_hidden_channels=n_hidden_channels,
                    nonlinearity=F.tanh,
                    last_wscale=1e-1,
                )
            else:
                action_size = env.action_space.low.size
                pi = policies.FCGaussianPolicy(
                    obs_size, action_size,
                    n_hidden_layers=1,
                    n_hidden_channels=n_hidden_channels,
                    nonlinearity=F.tanh,
                    mean_wscale=1e-1,
                )
            return A3CSeparateModel(pi=pi, v=v)
Example #2
    def make_model(self, env):
        n_hidden_channels = 50

        n_dim_obs = env.observation_space.low.size
        v = v_functions.FCVFunction(
            n_dim_obs,
            n_hidden_layers=2,
            n_hidden_channels=n_hidden_channels)

        if self.discrete:
            n_actions = env.action_space.n

            pi = policies.FCSoftmaxPolicy(
                n_dim_obs, n_actions,
                n_hidden_layers=2,
                n_hidden_channels=n_hidden_channels)
        else:
            n_dim_actions = env.action_space.low.size

            pi = policies.FCGaussianPolicy(
                n_dim_obs, n_dim_actions,
                n_hidden_layers=2,
                n_hidden_channels=n_hidden_channels)

        return A3CSeparateModel(pi=pi, v=v)
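For context, a minimal, hypothetical sketch of how a separate pi/v model like the one built above is handed to an A3C agent, mirroring the agent construction in the later test examples; the toy sizes and hyperparameters below are placeholders, not values from the original code:

# Illustrative sketch only; sizes and hyperparameters are placeholders.
import chainer
import numpy as np
from chainerrl import policies, v_functions
from chainerrl.agents import a3c

ndim_obs, n_actions = 4, 2  # assumed toy observation/action sizes
model = a3c.A3CSeparateModel(
    pi=policies.FCSoftmaxPolicy(ndim_obs, n_actions,
                                n_hidden_layers=2, n_hidden_channels=50),
    v=v_functions.FCVFunction(ndim_obs,
                              n_hidden_layers=2, n_hidden_channels=50))
opt = chainer.optimizers.Adam()
opt.setup(model)
agent = a3c.A3C(model, opt, t_max=5, gamma=0.99, beta=1e-2,
                phi=lambda x: np.asarray(x, dtype=np.float32))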
Example #3
    def __init__(self,
                 env,
                 feature_transformer,
                 gamma=0.99,
                 optimizer='adam',
                 max_memory=10000):
        BaseAgent.__init__(self,
                           env=env,
                           feature_transformer=feature_transformer,
                           gamma=gamma,
                           optimizer=optimizer)

        self.model = policies.FCSoftmaxPolicy(self.n_dims,
                                              self.n_actions,
                                              n_hidden_layers=2,
                                              n_hidden_channels=100,
                                              nonlinearity=F.relu)

        self.optimizer.setup(self.model)
        #self.optimizer.add_hook(chainer.optimizer.GradientClipping(40))

        self.replay_buffer = PrioritizedEpisodicReplayBuffer(
            capacity=max_memory,
            uniform_ratio=0.1,
            default_priority_func=exp_return_of_episode,
            wait_priority_after_sampling=False,
            return_sample_weights=False)
        self.agent = reinforce.REINFORCE(model=self.model,
                                         optimizer=self.optimizer,
                                         phi=phi,
                                         batchsize=1,
                                         act_deterministically=False)
Example #4
 def __init__(self, trial, width, height, action_size, lstm_size=128):
     obs_size = width * height
     self.head = MyHead(trial, width=width, height=height)
     self.lstm = L.LSTM(self.head.n_output_channels, lstm_size)
     self.pi = policies.FCSoftmaxPolicy(lstm_size, action_size)
     self.v = v_function.FCVFunction(lstm_size)
     super().__init__(self.head, self.lstm, self.pi, self.v)
Example #5
def create_stochastic_policy_for_env(env):
    assert isinstance(env.observation_space, gym.spaces.Box)
    ndim_obs = env.observation_space.low.size
    if isinstance(env.action_space, gym.spaces.Discrete):
        return policies.FCSoftmaxPolicy(ndim_obs, env.action_space.n)
    elif isinstance(env.action_space, gym.spaces.Box):
        return policies.FCGaussianPolicy(ndim_obs,
                                         env.action_space.low.size,
                                         bound_mean=False)
    else:
        raise NotImplementedError()
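A hypothetical usage sketch for the factory above; it assumes gym and chainerrl are installed, and CartPole-v0 is used purely as a placeholder environment with a discrete action space:

# Illustrative use only; the environment name is a placeholder.
import gym
import numpy as np

env = gym.make('CartPole-v0')              # discrete action space -> FCSoftmaxPolicy
pi = create_stochastic_policy_for_env(env)

obs = np.asarray(env.reset(), dtype=np.float32)
action_distrib = pi(obs[None])             # chainerrl policies return an action distribution
action = int(action_distrib.sample().data[0])
print('sampled action:', action)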
Example #6
    def __init__(self,
                 env,
                 feature_transformer,
                 gamma=0.99,
                 optimizer='adam',
                 max_memory=10000):
        BaseAgent.__init__(self,
                           env=env,
                           feature_transformer=feature_transformer,
                           gamma=gamma,
                           optimizer=optimizer)

        self.model = agents.pcl.PCLSeparateModel(
            pi=policies.FCSoftmaxPolicy(self.n_dims,
                                        self.n_actions,
                                        n_hidden_channels=100,
                                        n_hidden_layers=2),
            v=v_functions.FCVFunction(
                self.n_dims,
                n_hidden_channels=100,
                n_hidden_layers=2,
            ),
        )

        self.optimizer.setup(self.model)
        #self.optimizer.add_hook(chainer.optimizer.GradientClipping(40))

        self.replay_buffer = \
            chainerrl.replay_buffer.PrioritizedEpisodicReplayBuffer(
                capacity=max_memory,
                uniform_ratio=0.1,
                default_priority_func=exp_return_of_episode,
                wait_priority_after_sampling=False,
                return_sample_weights=False)

        self.agent = agents.pcl.PCL(model=self.model,
                                    optimizer=self.optimizer,
                                    replay_buffer=self.replay_buffer,
                                    t_max=1,
                                    gamma=self.gamma,
                                    tau=1e-2,
                                    phi=phi,
                                    rollout_len=10,
                                    batchsize=1,
                                    disable_online_update=False,
                                    n_times_replay=1,
                                    replay_start_size=1000,
                                    normalize_loss_by_steps=True,
                                    act_deterministically=False,
                                    backprop_future_values=False,
                                    train_async=True)
Example #7
    def make_model(self, env):
        n_hidden_channels = 20

        n_dim_obs = env.observation_space.low.size
        v = v_functions.FCVFunction(
            n_dim_obs,
            n_hidden_layers=1,
            n_hidden_channels=n_hidden_channels,
            nonlinearity=F.tanh,
            last_wscale=0.01,
        )

        if self.discrete:
            n_actions = env.action_space.n

            pi = policies.FCSoftmaxPolicy(
                n_dim_obs,
                n_actions,
                n_hidden_layers=1,
                n_hidden_channels=n_hidden_channels,
                nonlinearity=F.tanh,
                last_wscale=0.01,
            )
        else:
            n_dim_actions = env.action_space.low.size

            pi = policies.FCGaussianPolicyWithStateIndependentCovariance(
                n_dim_obs,
                n_dim_actions,
                n_hidden_layers=1,
                n_hidden_channels=n_hidden_channels,
                nonlinearity=F.tanh,
                mean_wscale=0.01,
                var_type='diagonal',
            )

        # Check if KL div supports double-backprop
        fake_obs = np.zeros_like(env.observation_space.low, dtype=np.float32)
        action_distrib = pi(fake_obs[None])
        kl = action_distrib.kl(action_distrib)
        old_style_funcs = trpo._find_old_style_function([kl])
        if old_style_funcs:
            self.skipTest("\
Chainer v{} does not support double backprop of these functions: {}.".format(
                chainer.__version__, old_style_funcs))

        return pi, v
Example #8
    def __init__(self, n_dims, n_actions):
        self.head = links.Sequence(
            L.ConvolutionND(ndim=1,
                            in_channels=n_dims,
                            out_channels=100,
                            ksize=3,
                            stride=1,
                            pad=1,
                            cover_all=True), F.relu)
        self.pi = policies.FCSoftmaxPolicy(n_input_channels=100,
                                           n_actions=n_actions,
                                           n_hidden_layers=2,
                                           n_hidden_channels=100)
        self.v = v_functions.FCVFunction(n_input_channels=100,
                                         n_hidden_layers=2,
                                         n_hidden_channels=100)

        super(A3CFF, self).__init__(self.head, self.pi, self.v)
Example #9
 def __init__(self,
              ndim_obs,
              n_discrete_entries,
              n_hidden_layers=2,
              n_hidden_channels=400,
              beta=1.0):
     self.pi = policies.FCSoftmaxPolicy(ndim_obs,
                                        n_discrete_entries,
                                        n_hidden_layers,
                                        n_hidden_channels,
                                        beta=beta)
     self.v = chainerrl.v_functions.FCVFunction(
         ndim_obs,
         n_hidden_channels=n_hidden_channels,
         n_hidden_layers=n_hidden_layers,
         last_wscale=0.01,
     )
     super().__init__(self.pi, self.v)
Example #10
    def _test_abc(self,
                  t_max,
                  use_lstm,
                  discrete=True,
                  episodic=True,
                  steps=100000,
                  require_success=True):

        nproc = 8

        def make_env(process_idx, test):
            size = 2
            return ABC(size=size,
                       discrete=discrete,
                       episodic=episodic or test,
                       partially_observable=self.use_lstm,
                       deterministic=test)

        sample_env = make_env(0, False)
        action_space = sample_env.action_space
        obs_space = sample_env.observation_space

        def phi(x):
            return x

        n_hidden_channels = 20
        n_hidden_layers = 2
        nonlinearity = F.relu
        if use_lstm:
            if discrete:
                model = a3c.A3CSharedModel(
                    shared=L.LSTM(obs_space.low.size, n_hidden_channels),
                    pi=policies.FCSoftmaxPolicy(
                        n_hidden_channels,
                        action_space.n,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=n_hidden_layers,
                        nonlinearity=nonlinearity,
                        last_wscale=1e-2,
                    ),
                    v=v_function.FCVFunction(
                        n_hidden_channels,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=n_hidden_layers,
                        nonlinearity=nonlinearity,
                        last_wscale=1e-2,
                    ),
                )
            else:
                model = a3c.A3CSharedModel(
                    shared=L.LSTM(obs_space.low.size, n_hidden_channels),
                    pi=policies.FCGaussianPolicy(
                        n_hidden_channels,
                        action_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=n_hidden_layers,
                        nonlinearity=nonlinearity,
                        var_wscale=1e-2,
                        var_bias=1,
                        bound_mean=True,
                        min_action=action_space.low,
                        max_action=action_space.high,
                        min_var=1e-1,
                    ),
                    v=v_function.FCVFunction(
                        n_hidden_channels,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=n_hidden_layers,
                        nonlinearity=nonlinearity,
                        last_wscale=1e-2,
                    ),
                )
        else:
            if discrete:
                model = a3c.A3CSeparateModel(
                    pi=policies.FCSoftmaxPolicy(
                        obs_space.low.size,
                        action_space.n,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=n_hidden_layers,
                        nonlinearity=nonlinearity,
                        last_wscale=1e-2,
                    ),
                    v=v_function.FCVFunction(
                        obs_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=n_hidden_layers,
                        nonlinearity=nonlinearity,
                        last_wscale=1e-2,
                    ),
                )
            else:
                model = a3c.A3CSeparateModel(
                    pi=policies.FCGaussianPolicy(
                        obs_space.low.size,
                        action_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=n_hidden_layers,
                        nonlinearity=nonlinearity,
                        var_wscale=1e-2,
                        var_bias=1,
                        bound_mean=True,
                        min_action=action_space.low,
                        max_action=action_space.high,
                        min_var=1e-1,
                    ),
                    v=v_function.FCVFunction(
                        obs_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=n_hidden_layers,
                        nonlinearity=nonlinearity,
                        last_wscale=1e-2,
                    ),
                )
        eps = 1e-8 if self.backprop_future_values else 1e-1
        opt = rmsprop_async.RMSpropAsync(lr=5e-4, eps=eps, alpha=0.99)
        opt.setup(model)
        gamma = 0.5
        tau = 1e-2
        replay_buffer = chainerrl.replay_buffer.EpisodicReplayBuffer(10**5)
        agent = pcl.PCL(model,
                        opt,
                        replay_buffer=replay_buffer,
                        t_max=t_max,
                        gamma=gamma,
                        tau=tau,
                        phi=phi,
                        n_times_replay=1,
                        batchsize=self.batchsize,
                        train_async=self.train_async,
                        backprop_future_values=self.backprop_future_values,
                        act_deterministically=True)

        if self.train_async:
            with warnings.catch_warnings(record=True) as warns:
                chainerrl.experiments.train_agent_async(outdir=self.outdir,
                                                        processes=nproc,
                                                        make_env=make_env,
                                                        agent=agent,
                                                        steps=steps,
                                                        max_episode_len=2,
                                                        eval_interval=200,
                                                        eval_n_runs=5,
                                                        successful_score=1)
                assert len(warns) == 0, warns[0]
            # The agent returned by train_agent_async is not guaranteed to be
            # successful because parameters could be modified by other
            # processes after success. Thus here the successful model is loaded
            # explicitly.
            if require_success:
                agent.load(os.path.join(self.outdir, 'successful'))
        else:
            agent.process_idx = 0
            chainerrl.experiments.train_agent_with_evaluation(
                agent=agent,
                env=make_env(0, False),
                eval_env=make_env(0, True),
                outdir=self.outdir,
                steps=steps,
                max_episode_len=2,
                eval_interval=200,
                eval_n_runs=5,
                successful_score=1)

        agent.stop_episode()

        # Test
        env = make_env(0, True)
        n_test_runs = 5

        for _ in range(n_test_runs):
            total_r = 0
            obs = env.reset()
            done = False
            reward = 0.0

            while not done:
                action = agent.act(obs)
                print('state:', obs, 'action:', action)
                obs, reward, done, _ = env.step(action)
                total_r += reward
            if require_success:
                self.assertAlmostEqual(total_r, 1)
            agent.stop_episode()
Example #11
    def _test_abc(self,
                  t_max,
                  use_lstm,
                  discrete=True,
                  episodic=True,
                  steps=100000,
                  require_success=True):

        nproc = 8

        def make_env(process_idx, test):
            size = 2
            return ABC(size=size,
                       discrete=discrete,
                       episodic=episodic or test,
                       partially_observable=self.use_lstm,
                       deterministic=test)

        sample_env = make_env(0, False)
        action_space = sample_env.action_space
        obs_space = sample_env.observation_space

        def phi(x):
            return x

        n_hidden_channels = 20
        n_hidden_layers = 1
        nonlinearity = F.leaky_relu
        replay_buffer = EpisodicReplayBuffer(10**4)
        if use_lstm:
            if discrete:
                model = acer.ACERSharedModel(
                    shared=L.LSTM(obs_space.low.size, n_hidden_channels),
                    pi=policies.FCSoftmaxPolicy(
                        n_hidden_channels,
                        action_space.n,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=n_hidden_layers,
                        nonlinearity=nonlinearity,
                        min_prob=1e-1),
                    q=q_function.FCStateQFunctionWithDiscreteAction(
                        n_hidden_channels,
                        action_space.n,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=n_hidden_layers,
                        nonlinearity=nonlinearity),
                )
            else:
                model = acer.ACERSDNSharedModel(
                    shared=L.LSTM(obs_space.low.size, n_hidden_channels),
                    pi=policies.FCGaussianPolicy(
                        n_hidden_channels,
                        action_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=n_hidden_layers,
                        bound_mean=True,
                        min_action=action_space.low,
                        max_action=action_space.high,
                        nonlinearity=nonlinearity,
                        min_var=1e-1),
                    v=v_function.FCVFunction(
                        n_hidden_channels,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=n_hidden_layers,
                        nonlinearity=nonlinearity),
                    adv=q_function.FCSAQFunction(
                        n_hidden_channels,
                        action_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=n_hidden_layers,
                        nonlinearity=nonlinearity),
                )
        else:
            if discrete:
                model = acer.ACERSeparateModel(
                    pi=policies.FCSoftmaxPolicy(
                        obs_space.low.size,
                        action_space.n,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=n_hidden_layers,
                        nonlinearity=nonlinearity,
                        min_prob=1e-1),
                    q=q_function.FCStateQFunctionWithDiscreteAction(
                        obs_space.low.size,
                        action_space.n,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=n_hidden_layers,
                        nonlinearity=nonlinearity),
                )
            else:
                model = acer.ACERSDNSeparateModel(
                    pi=policies.FCGaussianPolicy(
                        obs_space.low.size,
                        action_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=n_hidden_layers,
                        bound_mean=True,
                        min_action=action_space.low,
                        max_action=action_space.high,
                        nonlinearity=nonlinearity,
                        min_var=1e-1),
                    v=v_function.FCVFunction(
                        obs_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=n_hidden_layers,
                        nonlinearity=nonlinearity),
                    adv=q_function.FCSAQFunction(
                        obs_space.low.size,
                        action_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=n_hidden_layers,
                        nonlinearity=nonlinearity),
                )
        eps = 1e-8
        opt = rmsprop_async.RMSpropAsync(lr=1e-3, eps=eps, alpha=0.99)
        opt.setup(model)
        gamma = 0.5
        beta = 1e-5
        if self.n_times_replay == 0 and self.disable_online_update:
            # At least one of them must be enabled
            return
        agent = acer.ACER(model,
                          opt,
                          replay_buffer=replay_buffer,
                          t_max=t_max,
                          gamma=gamma,
                          beta=beta,
                          phi=phi,
                          n_times_replay=self.n_times_replay,
                          act_deterministically=True,
                          disable_online_update=self.disable_online_update,
                          replay_start_size=100,
                          use_trust_region=self.use_trust_region)

        max_episode_len = None if episodic else 2

        with warnings.catch_warnings(record=True) as warns:
            train_agent_async(outdir=self.outdir,
                              processes=nproc,
                              make_env=make_env,
                              agent=agent,
                              steps=steps,
                              max_episode_len=max_episode_len,
                              eval_interval=500,
                              eval_n_steps=None,
                              eval_n_episodes=5,
                              successful_score=1)
            assert len(warns) == 0, warns[0]

        # The agent returned by train_agent_async is not guaranteed to be
        # successful because parameters could be modified by other processes
        # after success. Thus here the successful model is loaded explicitly.
        if require_success:
            agent.load(os.path.join(self.outdir, 'successful'))
        agent.stop_episode()

        # Test
        env = make_env(0, True)
        n_test_runs = 5

        for _ in range(n_test_runs):
            total_r = 0
            obs = env.reset()
            done = False
            reward = 0.0

            while not done:
                action = agent.act(obs)
                print('state:', obs, 'action:', action)
                obs, reward, done, _ = env.step(action)
                total_r += reward
            if require_success:
                self.assertAlmostEqual(total_r, 1)
            agent.stop_episode()
Example #12
    def _test_abc(self,
                  t_max,
                  use_lstm,
                  discrete=True,
                  episodic=True,
                  steps=100000,
                  require_success=True):

        nproc = 8

        def make_env(process_idx, test):
            size = 2
            return ABC(size=size,
                       discrete=discrete,
                       episodic=episodic or test,
                       partially_observable=self.use_lstm,
                       deterministic=test)

        sample_env = make_env(0, False)
        action_space = sample_env.action_space
        obs_space = sample_env.observation_space

        def phi(x):
            return x

        n_hidden_channels = 20
        if use_lstm:
            if discrete:
                model = a3c.A3CSharedModel(
                    shared=L.LSTM(obs_space.low.size, n_hidden_channels),
                    pi=policies.FCSoftmaxPolicy(
                        n_hidden_channels,
                        action_space.n,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2,
                        nonlinearity=F.tanh,
                        last_wscale=1e-1,
                    ),
                    v=v_function.FCVFunction(
                        n_hidden_channels,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2,
                        nonlinearity=F.tanh,
                        last_wscale=1e-1,
                    ),
                )
            else:
                model = a3c.A3CSharedModel(
                    shared=L.LSTM(obs_space.low.size, n_hidden_channels),
                    pi=policies.FCGaussianPolicy(
                        n_hidden_channels,
                        action_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2,
                        nonlinearity=F.tanh,
                        mean_wscale=1e-1,
                    ),
                    v=v_function.FCVFunction(
                        n_hidden_channels,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2,
                        nonlinearity=F.tanh,
                        last_wscale=1e-1,
                    ),
                )
        else:
            if discrete:
                model = a3c.A3CSeparateModel(
                    pi=policies.FCSoftmaxPolicy(
                        obs_space.low.size,
                        action_space.n,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2,
                        nonlinearity=F.tanh,
                        last_wscale=1e-1,
                    ),
                    v=v_function.FCVFunction(
                        obs_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2,
                        nonlinearity=F.tanh,
                        last_wscale=1e-1,
                    ),
                )
            else:
                model = a3c.A3CSeparateModel(
                    pi=policies.FCGaussianPolicy(
                        obs_space.low.size,
                        action_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2,
                        nonlinearity=F.tanh,
                        mean_wscale=1e-1,
                    ),
                    v=v_function.FCVFunction(
                        obs_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2,
                        nonlinearity=F.tanh,
                        last_wscale=1e-1,
                    ),
                )
        opt = chainer.optimizers.Adam()
        opt.setup(model)
        opt.add_hook(chainer.optimizer.GradientClipping(1))
        gamma = 0.8
        beta = 1e-2
        agent = a3c.A3C(model,
                        opt,
                        t_max=t_max,
                        gamma=gamma,
                        beta=beta,
                        phi=phi,
                        act_deterministically=True)

        max_episode_len = None if episodic else 2

        with warnings.catch_warnings(record=True) as warns:
            train_agent_async(outdir=self.outdir,
                              processes=nproc,
                              make_env=make_env,
                              agent=agent,
                              steps=steps,
                              max_episode_len=max_episode_len,
                              eval_interval=500,
                              eval_n_steps=None,
                              eval_n_episodes=5,
                              successful_score=1)
            assert len(warns) == 0, warns[0]

        # The agent returned by train_agent_async is not guaranteed to be
        # successful because parameters could be modified by other processes
        # after success. Thus here the successful model is loaded explicitly.
        if require_success:
            agent.load(os.path.join(self.outdir, 'successful'))
        agent.stop_episode()

        # Test
        env = make_env(0, True)
        n_test_runs = 5

        for _ in range(n_test_runs):
            total_r = 0
            obs = env.reset()
            done = False
            reward = 0.0

            while not done:
                action = agent.act(obs)
                print('state:', obs, 'action:', action)
                obs, reward, done, _ = env.step(action)
                total_r += reward
            if require_success:
                self.assertAlmostEqual(total_r, 1)
            agent.stop_episode()
Example #13
 def __init__(self, n_actions):
     self.head = links.NIPSDQNHead()
     self.pi = policies.FCSoftmaxPolicy(self.head.n_output_channels,
                                        n_actions)
     self.v = v_functions.FCVFunction(self.head.n_output_channels)
     super().__init__(self.head, self.pi, self.v)
Example #14
    def _test_abc(self,
                  use_lstm,
                  discrete=True,
                  steps=1000000,
                  require_success=True,
                  gpu=-1):
        def make_env(process_idx, test):
            size = 2
            return ABC(size=size,
                       discrete=discrete,
                       episodic=True,
                       partially_observable=self.use_lstm,
                       deterministic=test)

        sample_env = make_env(0, False)
        action_space = sample_env.action_space
        obs_space = sample_env.observation_space

        def phi(x):
            return x

        n_hidden_channels = 20
        n_hidden_layers = 1
        nonlinearity = F.leaky_relu
        if use_lstm:
            if discrete:
                model = chainerrl.links.Sequence(
                    L.LSTM(obs_space.low.size,
                           n_hidden_channels,
                           forget_bias_init=1),
                    policies.FCSoftmaxPolicy(
                        n_hidden_channels,
                        action_space.n,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=n_hidden_layers,
                        nonlinearity=nonlinearity),
                )
            else:
                model = chainerrl.links.Sequence(
                    L.LSTM(obs_space.low.size,
                           n_hidden_channels,
                           forget_bias_init=1),
                    policies.FCGaussianPolicy(
                        n_hidden_channels,
                        action_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=n_hidden_layers,
                        bound_mean=True,
                        min_action=action_space.low,
                        max_action=action_space.high,
                        nonlinearity=nonlinearity,
                    ))
        else:
            if discrete:
                model = policies.FCSoftmaxPolicy(
                    obs_space.low.size,
                    action_space.n,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity)
            else:
                model = policies.FCGaussianPolicy(
                    obs_space.low.size,
                    action_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    bound_mean=True,
                    min_action=action_space.low,
                    max_action=action_space.high,
                    nonlinearity=nonlinearity,
                )

        if gpu >= 0:
            chainer.cuda.get_device_from_id(gpu).use()
            model.to_gpu()

        opt = optimizers.Adam()
        opt.setup(model)
        beta = 1e-2
        agent = chainerrl.agents.REINFORCE(
            model,
            opt,
            beta=beta,
            phi=phi,
            batchsize=self.batchsize,
            backward_separately=self.backward_separately,
            act_deterministically=True,
        )

        chainerrl.experiments.train_agent_with_evaluation(
            agent=agent,
            env=make_env(0, False),
            eval_env=make_env(0, True),
            outdir=self.outdir,
            steps=steps,
            train_max_episode_len=2,
            eval_interval=500,
            eval_n_steps=None,
            eval_n_episodes=5,
            successful_score=1)

        # Test
        env = make_env(0, True)
        n_test_runs = 5

        for _ in range(n_test_runs):
            total_r = 0
            obs = env.reset()
            done = False
            reward = 0.0

            while not done:
                action = agent.act(obs)
                print('state:', obs, 'action:', action)
                obs, reward, done, _ = env.step(action)
                total_r += reward
            if require_success:
                self.assertAlmostEqual(total_r, 1)
            agent.stop_episode()
Example #15
    def _test_abc(self,
                  t_max,
                  use_lstm,
                  discrete=True,
                  episodic=True,
                  steps=1000000):

        nproc = 8

        def make_env(process_idx, test):
            size = 2
            return ABC(size=size,
                       discrete=discrete,
                       episodic=episodic or test,
                       partially_observable=self.use_lstm,
                       deterministic=test)

        sample_env = make_env(0, False)
        action_space = sample_env.action_space
        obs_space = sample_env.observation_space

        def phi(x):
            return x

        n_hidden_channels = 20
        if use_lstm:
            if discrete:
                model = a3c.A3CSharedModel(
                    shared=L.LSTM(obs_space.low.size, n_hidden_channels),
                    pi=policies.FCSoftmaxPolicy(
                        n_hidden_channels,
                        action_space.n,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2),
                    v=v_function.FCVFunction(
                        n_hidden_channels,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2),
                )
            else:
                model = a3c.A3CSharedModel(
                    shared=L.LSTM(obs_space.low.size, n_hidden_channels),
                    pi=policies.FCGaussianPolicy(
                        n_hidden_channels,
                        action_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2,
                        bound_mean=True,
                        min_action=action_space.low,
                        max_action=action_space.high),
                    v=v_function.FCVFunction(
                        n_hidden_channels,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2),
                )
        else:
            if discrete:
                model = a3c.A3CSeparateModel(
                    pi=policies.FCSoftmaxPolicy(
                        obs_space.low.size,
                        action_space.n,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2),
                    v=v_function.FCVFunction(
                        obs_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2),
                )
            else:
                model = a3c.A3CSeparateModel(
                    pi=policies.FCGaussianPolicy(
                        obs_space.low.size,
                        action_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2,
                        bound_mean=True,
                        min_action=action_space.low,
                        max_action=action_space.high),
                    v=v_function.FCVFunction(
                        obs_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2),
                )
        eps = 1e-1 if discrete else 1e-2
        opt = rmsprop_async.RMSpropAsync(lr=5e-4, eps=eps, alpha=0.99)
        opt.setup(model)
        gamma = 0.9
        beta = 1e-2
        agent = a3c.A3C(model,
                        opt,
                        t_max=t_max,
                        gamma=gamma,
                        beta=beta,
                        phi=phi,
                        act_deterministically=True)

        max_episode_len = None if episodic else 2

        train_agent_async(outdir=self.outdir,
                          processes=nproc,
                          make_env=make_env,
                          agent=agent,
                          steps=steps,
                          max_episode_len=max_episode_len,
                          eval_interval=500,
                          eval_n_runs=5,
                          successful_score=1)

        # The agent returned by train_agent_async is not guaranteed to be
        # successful because parameters could be modified by other processes
        # after success. Thus here the successful model is loaded explicitly.
        agent.load(os.path.join(self.outdir, 'successful'))
        agent.stop_episode()

        # Test
        env = make_env(0, True)
        n_test_runs = 5

        for _ in range(n_test_runs):
            total_r = 0
            obs = env.reset()
            done = False
            reward = 0.0

            while not done:
                action = agent.act(obs)
                print('state:', obs, 'action:', action)
                obs, reward, done, _ = env.step(action)
                total_r += reward
            self.assertAlmostEqual(total_r, 1)
            agent.stop_episode()