Example #1
0
File: example.py  Project: shakedhi/ASPIRE
def create_agent(train=True):
    """Create an ACER agent for the module-level ``env``.

    Args:
        train: When False, the returned agent acts deterministically
            instead of sampling from its policy.

    Returns:
        A configured ``ACER`` agent.
    """
    obs_size = env.observation_space.shape[0]
    n_actions = env.action_space.n

    def _head(output_layer):
        # Two-layer head: observation -> 20 hidden units -> one value per
        # action, followed by the given output transformation.  The final
        # layer uses a small weight scale (1e-3).
        return Sequence(
            L.Linear(obs_size, 20),
            F.relu,
            L.Linear(20, n_actions, initialW=LeCunNormal(1e-3)),
            output_layer,
        )

    model = ACERSeparateModel(
        pi=_head(SoftmaxDistribution),  # policy head
        q=_head(DiscreteActionValue),   # action-value head
    )

    opt = RMSpropAsync(lr=7e-4, eps=1e-2, alpha=0.99)
    opt.setup(model)
    opt.add_hook(GradientClipping(40))

    agent = ACER(
        model=model,                               # Model to train
        optimizer=opt,                             # The optimizer
        gamma=0.99,                                # Reward discount factor
        t_max=50,                                  # The model is updated after this many local steps
        replay_buffer=EpisodicReplayBuffer(5000),  # The replay buffer
        replay_start_size=100,                     # Replay buffer won't be used until it has at least this many episodes
        beta=1,                                    # Entropy regularization parameter
    )

    if not train:
        agent.act_deterministically = True

    return agent
Example #2
0
    def __init__(self,
                 in_size,
                 out_size,
                 hidden_sizes,
                 nonlinearity=F.relu,
                 last_wscale=1):
        """Plain multi-layer perceptron.

        Args:
            in_size: Input vector size.
            out_size: Output vector size.
            hidden_sizes: Sequence of hidden-layer widths; when empty the
                model degenerates to a single linear layer.
            nonlinearity: Activation stored for use in the forward pass
                (defined elsewhere in the class).
            last_wscale: Weight scale of the output layer.
        """
        self.in_size = in_size
        self.out_size = out_size
        self.hidden_sizes = hidden_sizes
        self.nonlinearity = nonlinearity

        super().__init__()
        with self.init_scope():
            if hidden_sizes:
                # Pair consecutive widths to build the hidden stack:
                # in_size -> hidden_sizes[0] -> ... -> hidden_sizes[-1].
                widths = [in_size] + list(hidden_sizes)
                self.hidden_layers = chainer.ChainList(
                    *[L.Linear(a, b) for a, b in zip(widths, widths[1:])])
                final_in = hidden_sizes[-1]
            else:
                final_in = in_size
            self.output = L.Linear(final_in,
                                   out_size,
                                   initialW=LeCunNormal(last_wscale))
Example #3
0
    def __init__(self, in_size, out_size, hidden_sizes, normalize_input=True,
                 normalize_output=False, nonlinearity=F.relu, last_wscale=1):
        """MLP with batch-normalized hidden layers and optional input/output
        batch normalization.

        Args:
            in_size: Input vector size.
            out_size: Output vector size.
            hidden_sizes: Sequence of hidden-layer widths; when empty the
                model degenerates to a single linear layer.
            normalize_input: Add a BatchNormalization layer on the input.
            normalize_output: Add a BatchNormalization layer on the output.
            nonlinearity: Activation stored for the forward pass.
            last_wscale: Weight scale of the output layer.
        """
        self.in_size = in_size
        self.out_size = out_size
        self.hidden_sizes = hidden_sizes
        self.normalize_input = normalize_input
        self.normalize_output = normalize_output
        self.nonlinearity = nonlinearity

        super().__init__()
        with self.init_scope():
            if normalize_input:
                self.input_bn = L.BatchNormalization(in_size)
                # Start the running variance at one.
                self.input_bn.avg_var[:] = 1

            if hidden_sizes:
                # Pair consecutive widths to build the hidden stack of
                # linear+batch-norm blocks.
                widths = [in_size] + list(hidden_sizes)
                self.hidden_layers = chainer.ChainList(
                    *[LinearBN(a, b) for a, b in zip(widths, widths[1:])])
                final_in = hidden_sizes[-1]
            else:
                final_in = in_size
            self.output = L.Linear(final_in, out_size,
                                   initialW=LeCunNormal(last_wscale))

            if normalize_output:
                self.output_bn = L.BatchNormalization(out_size)
                # Start the running variance at one.
                self.output_bn.avg_var[:] = 1
Example #4
0
    def __init__(self,
                 n_input_channels,
                 action_size,
                 var,
                 n_hidden_layers=0,
                 n_hidden_channels=None,
                 min_action=None,
                 max_action=None,
                 bound_mean=False,
                 nonlinearity=F.relu,
                 mean_wscale=1):
        """Gaussian policy head whose variance is fixed, not learned.

        Args:
            n_input_channels: Size of the input feature vector.
            action_size: Dimensionality of the action space.
            var: Variance of the Gaussian; a scalar is broadcast to every
                action dimension.
            n_hidden_layers: Number of hidden layers (0 = linear mean).
            n_hidden_channels: Hidden width; required when
                ``n_hidden_layers > 0``.
            min_action: Lower action bound, used when ``bound_mean``.
            max_action: Upper action bound, used when ``bound_mean``.
            bound_mean: If True, squash the mean into the action bounds.
            nonlinearity: Activation between hidden layers.
            mean_wscale: Weight scale of the final (mean) layer.
        """

        self.n_input_channels = n_input_channels
        self.action_size = action_size
        self.n_hidden_layers = n_hidden_layers
        self.n_hidden_channels = n_hidden_channels
        self.min_action = min_action
        self.max_action = max_action
        self.bound_mean = bound_mean
        self.nonlinearity = nonlinearity
        if np.isscalar(var):
            # Broadcast a scalar variance to every action dimension.
            self.var = np.full(action_size, var, dtype=np.float32)
        else:
            self.var = var
        layers = []
        if n_hidden_layers > 0:
            # Input to hidden
            layers.append(L.Linear(n_input_channels, n_hidden_channels))
            layers.append(self.nonlinearity)
            for _ in range(n_hidden_layers - 1):
                # Hidden to hidden
                layers.append(L.Linear(n_hidden_channels, n_hidden_channels))
                layers.append(self.nonlinearity)
            # The last layer is used to compute the mean
            layers.append(
                L.Linear(n_hidden_channels,
                         action_size,
                         initialW=LeCunNormal(mean_wscale)))
        else:
            # There's only one layer for computing the mean
            layers.append(
                L.Linear(n_input_channels,
                         action_size,
                         initialW=LeCunNormal(mean_wscale)))

        if self.bound_mean:
            # Squash the mean into [min_action, max_action] with tanh.
            layers.append(
                lambda x: bound_by_tanh(x, self.min_action, self.max_action))

        def get_var_array(shape):
            # Convert self.var to this link's array module (NumPy/CuPy) on
            # first use, then broadcast it to the requested shape.
            self.var = self.xp.asarray(self.var)
            return self.xp.broadcast_to(self.var, shape)

        # Final step wraps the computed mean and the fixed variance into a
        # Gaussian distribution object.
        layers.append(lambda x: distribution.GaussianDistribution(
            x, get_var_array(x.shape)))
        super().__init__(*layers)
Example #5
0
    def __init__(self,
                 n_input_channels,
                 action_size,
                 n_hidden_layers=0,
                 n_hidden_channels=None,
                 min_action=None,
                 max_action=None,
                 bound_mean=False,
                 var_type='spherical',
                 nonlinearity=F.relu,
                 mean_wscale=1,
                 var_wscale=1,
                 var_bias=0,
                 min_var=0):
        """Gaussian policy whose mean and variance are both network outputs.

        ``var_type`` selects one shared variance ('spherical') or one per
        action dimension ('diagonal'); any other value raises KeyError.
        """
        self.n_input_channels = n_input_channels
        self.action_size = action_size
        self.n_hidden_layers = n_hidden_layers
        self.n_hidden_channels = n_hidden_channels
        self.min_action = min_action
        self.max_action = max_action
        self.bound_mean = bound_mean
        self.nonlinearity = nonlinearity
        self.min_var = min_var
        var_size = {'spherical': 1, 'diagonal': action_size}[var_type]

        # Shared trunk feeding both the mean head and the variance head.
        self.hidden_layers = []
        if n_hidden_layers > 0:
            self.hidden_layers.append(
                L.Linear(n_input_channels, n_hidden_channels))
            self.hidden_layers.extend(
                L.Linear(n_hidden_channels, n_hidden_channels)
                for _ in range(n_hidden_layers - 1))
            head_in = n_hidden_channels
        else:
            # No trunk: both heads read the raw input.
            head_in = n_input_channels
        self.mean_layer = L.Linear(head_in,
                                   action_size,
                                   initialW=LeCunNormal(mean_wscale))
        self.var_layer = L.Linear(head_in,
                                  var_size,
                                  initialW=LeCunNormal(var_wscale),
                                  initial_bias=var_bias)

        super().__init__(self.mean_layer, self.var_layer, *self.hidden_layers)
Example #6
0
    def __init__(
        self,
        n_input_channels,
        action_size,
        n_hidden_layers=0,
        n_hidden_channels=None,
        min_action=None,
        max_action=None,
        bound_mean=False,
        var_type='spherical',
        nonlinearity=F.relu,
        mean_wscale=1,
        var_func=F.softplus,
        var_param_init=0,
    ):
        """Gaussian policy with a learned, state-independent variance.

        The mean is produced by an MLP; the variance comes from the free
        parameter ``var_param`` (presumably mapped through ``var_func`` in
        the forward pass defined elsewhere — confirm against the class).
        """

        self.n_input_channels = n_input_channels
        self.action_size = action_size
        self.n_hidden_layers = n_hidden_layers
        self.n_hidden_channels = n_hidden_channels
        self.min_action = min_action
        self.max_action = max_action
        self.bound_mean = bound_mean
        self.nonlinearity = nonlinearity
        self.var_func = var_func
        # 'spherical' -> one shared variance; 'diagonal' -> one per action
        # dimension.  Any other var_type raises KeyError here.
        var_size = {'spherical': 1, 'diagonal': action_size}[var_type]

        layers = []
        # NOTE(review): this first hidden layer is created even when
        # n_hidden_layers == 0, so the defaults (n_hidden_layers=0,
        # n_hidden_channels=None) do not yield a purely linear mean.
        # Callers appear expected to pass n_hidden_layers >= 1 — confirm.
        layers.append(L.Linear(n_input_channels, n_hidden_channels))
        for _ in range(n_hidden_layers - 1):
            layers.append(self.nonlinearity)
            layers.append(L.Linear(n_hidden_channels, n_hidden_channels))
        layers.append(self.nonlinearity)
        # The last layer is used to compute the mean
        layers.append(
            L.Linear(n_hidden_channels,
                     action_size,
                     initialW=LeCunNormal(mean_wscale)))

        if self.bound_mean:
            # Squash the mean into [min_action, max_action] with tanh.
            layers.append(
                lambda x: bound_by_tanh(x, self.min_action, self.max_action))

        super().__init__()
        with self.init_scope():
            self.hidden_layers = links.Sequence(*layers)
            # Learned variance parameter of shape (var_size,).
            self.var_param = chainer.Parameter(initializer=var_param_init,
                                               shape=(var_size, ))
Example #7
0
    def __init__(self,
                 n_input_channels,
                 n_hidden_layers,
                 n_hidden_channels,
                 action_size,
                 min_action=None,
                 max_action=None,
                 bound_action=True,
                 nonlinearity=F.relu,
                 last_wscale=1.):
        """Recurrent deterministic policy: MLP -> LSTM -> linear head.

        When ``bound_action`` is set, the network's raw output is squashed
        with tanh into [min_action, max_action].
        """
        self.n_input_channels = n_input_channels
        self.n_hidden_layers = n_hidden_layers
        self.n_hidden_channels = n_hidden_channels
        self.action_size = action_size
        self.min_action = min_action
        self.max_action = max_action
        self.bound_action = bound_action

        # Optional post-processing applied to the network output.
        action_filter = (
            (lambda x: bound_by_tanh(x, self.min_action, self.max_action))
            if self.bound_action else None)

        net = chainer.Chain(
            fc=MLP(
                self.n_input_channels,
                n_hidden_channels,
                (self.n_hidden_channels, ) * self.n_hidden_layers,
                nonlinearity=nonlinearity,
            ),
            lstm=L.LSTM(n_hidden_channels, n_hidden_channels),
            out=L.Linear(n_hidden_channels,
                         action_size,
                         initialW=LeCunNormal(last_wscale)),
        )

        def forward(model, x):
            # MLP features -> activation -> LSTM state update -> action.
            h = nonlinearity(model.fc(x))
            h = model.lstm(h)
            return model.out(h)

        super().__init__(model=net,
                         model_call=forward,
                         action_filter=action_filter)
Example #8
0
def get_weight_initializer(name: Optional[str]):
    """Look up a weight initializer by class name.

    Args:
        name: Initializer class name (e.g. ``"HeNormal"``), or ``None``.

    Returns:
        A fresh initializer instance, or ``None`` when ``name`` is ``None``.

    Raises:
        ValueError: If ``name`` does not match a known initializer.
    """
    if name is None:
        return None
    if name == "PossibleOrthogonal":
        # Project-local initializer, not part of chainer.
        return PossibleOrthogonal()

    # chainer module that declares each supported initializer class.
    modules = {
        "GlorotNormal": "chainer.initializers.normal",
        "HeNormal": "chainer.initializers.normal",
        "LeCunNormal": "chainer.initializers.normal",
        "Normal": "chainer.initializers.normal",
        "Orthogonal": "chainer.initializers.orthogonal",
        "GlorotUniform": "chainer.initializers.uniform",
        "HeUniform": "chainer.initializers.uniform",
        "LeCunUniform": "chainer.initializers.uniform",
        "Uniform": "chainer.initializers.uniform",
    }
    if name not in modules:
        raise ValueError(name)
    # Import lazily, as the original elif chain did, so chainer is only
    # loaded when an initializer is actually requested.
    from importlib import import_module
    return getattr(import_module(modules[name]), name)()
Example #9
0
 def __init__(self,
              n_dim_obs,
              n_dim_action,
              n_hidden_channels,
              n_hidden_layers,
              nonlinearity=F.relu,
              last_wscale=1.):
     """Recurrent Q-head: MLP -> LSTM -> linear layer emitting one scalar.

     The input size is n_dim_obs + n_dim_action — observation and action
     are presumably concatenated in the forward pass (defined elsewhere).
     """
     self.n_input_channels = n_dim_obs + n_dim_action
     self.n_hidden_channels = n_hidden_channels
     self.n_hidden_layers = n_hidden_layers
     self.nonlinearity = nonlinearity
     super().__init__()
     with self.init_scope():
         self.fc = MLP(
             self.n_input_channels,
             n_hidden_channels,
             [self.n_hidden_channels] * self.n_hidden_layers,
             nonlinearity=nonlinearity,
         )
         self.lstm = L.LSTM(n_hidden_channels, n_hidden_channels)
         self.out = L.Linear(n_hidden_channels,
                             1,
                             initialW=LeCunNormal(last_wscale))
Example #10
0
def main():
    """CLI entry point: train or evaluate an ACER agent on a gym env.

    ``--demo`` evaluates a (possibly ``--load``-ed) agent; otherwise the
    agent is trained asynchronously across ``processes`` workers.
    """
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('--env', type=str, default='CartPole-v0')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--t-max', type=int, default=50)
    parser.add_argument('--n-times-replay', type=int, default=4)
    parser.add_argument('--n-hidden-channels', type=int, default=100)
    parser.add_argument('--n-hidden-layers', type=int, default=2)
    parser.add_argument('--replay-capacity', type=int, default=5000)
    parser.add_argument('--replay-start-size', type=int, default=10**3)
    parser.add_argument('--disable-online-update', action='store_true')
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--rmsprop-epsilon', type=float, default=1e-2)
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--truncation-threshold', type=float, default=5)
    parser.add_argument('--trust-region-delta', type=float, default=0.1)
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    # Set a random seed used in ChainerRL.
    # If you use more than one processes, the results will be no longer
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        # Build one wrapped env instance per worker process.
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor and process_idx == 0:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            env = chainerrl.wrappers.Render(env)
        return env

    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    # Continuous (Box) action spaces get the SDN model with a Gaussian
    # policy; discrete spaces get MLP softmax-policy and Q heads.
    if isinstance(action_space, spaces.Box):
        model = acer.ACERSDNSeparateModel(
            pi=policies.FCGaussianPolicy(
                obs_space.low.size,
                action_space.low.size,
                n_hidden_channels=args.n_hidden_channels,
                n_hidden_layers=args.n_hidden_layers,
                bound_mean=True,
                min_action=action_space.low,
                max_action=action_space.high),
            v=v_functions.FCVFunction(obs_space.low.size,
                                      n_hidden_channels=args.n_hidden_channels,
                                      n_hidden_layers=args.n_hidden_layers),
            adv=q_functions.FCSAQFunction(
                obs_space.low.size,
                action_space.low.size,
                n_hidden_channels=args.n_hidden_channels // 4,
                n_hidden_layers=args.n_hidden_layers),
        )
    else:
        model = acer.ACERSeparateModel(
            pi=links.Sequence(
                L.Linear(obs_space.low.size, args.n_hidden_channels), F.relu,
                L.Linear(args.n_hidden_channels,
                         action_space.n,
                         initialW=LeCunNormal(1e-3)), SoftmaxDistribution),
            q=links.Sequence(
                L.Linear(obs_space.low.size, args.n_hidden_channels), F.relu,
                L.Linear(args.n_hidden_channels,
                         action_space.n,
                         initialW=LeCunNormal(1e-3)), DiscreteActionValue),
        )

    opt = rmsprop_async.RMSpropAsync(lr=args.lr,
                                     eps=args.rmsprop_epsilon,
                                     alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))

    replay_buffer = EpisodicReplayBuffer(args.replay_capacity)
    agent = acer.ACER(model,
                      opt,
                      t_max=args.t_max,
                      gamma=0.99,
                      replay_buffer=replay_buffer,
                      n_times_replay=args.n_times_replay,
                      replay_start_size=args.replay_start_size,
                      disable_online_update=args.disable_online_update,
                      use_trust_region=True,
                      trust_region_delta=args.trust_region_delta,
                      truncation_threshold=args.truncation_threshold,
                      beta=args.beta)
    if args.load:
        agent.load(args.load)

    if args.demo:
        # Evaluation only: run eval episodes and print summary statistics.
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        # Asynchronous training across args.processes worker processes.
        experiments.train_agent_async(agent=agent,
                                      outdir=args.outdir,
                                      processes=args.processes,
                                      make_env=make_env,
                                      profile=args.profile,
                                      steps=args.steps,
                                      eval_n_steps=None,
                                      eval_n_episodes=args.eval_n_runs,
                                      eval_interval=args.eval_interval,
                                      max_episode_len=timestep_limit)
Example #11
0
    def __init__(self):
        """Instantiate both candidate weight initializers.

        Which of the two is actually applied is decided elsewhere in the
        class; this constructor only creates them.
        """
        super().__init__()

        self.orthogonal = Orthogonal()
        self.lecun = LeCunNormal()
Example #12
0
def main(args):
    """Train, or visually check, an ACER agent on a gym environment.

    ``args`` may be an argv-style list (converted via ``make_args``) or an
    already-parsed namespace.  ``args.mode`` selects 'train' (asynchronous
    training) or 'check' (render a few episodes and save an animation).
    """
    import logging
    logging.basicConfig(level=logging.INFO, filename='log')

    if (type(args) is list):
        args = make_args(args)

    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # Set a random seed used in ChainerRL.
    # If you use more than one processes, the results will be no longer
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**32

    def make_env(process_idx, test):
        # Build one wrapped env instance per worker process.
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor and process_idx == 0:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            env = chainerrl.wrappers.Render(env)
        return env

    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    # Continuous (Box) action spaces get the SDN model with a Gaussian
    # policy; discrete spaces get MLP softmax-policy and Q heads.
    if isinstance(action_space, spaces.Box):
        model = acer.ACERSDNSeparateModel(
            pi=policies.FCGaussianPolicy(
                obs_space.low.size,
                action_space.low.size,
                n_hidden_channels=args.n_hidden_channels,
                n_hidden_layers=args.n_hidden_layers,
                bound_mean=True,
                min_action=action_space.low,
                max_action=action_space.high),
            v=v_functions.FCVFunction(obs_space.low.size,
                                      n_hidden_channels=args.n_hidden_channels,
                                      n_hidden_layers=args.n_hidden_layers),
            adv=q_functions.FCSAQFunction(
                obs_space.low.size,
                action_space.low.size,
                n_hidden_channels=args.n_hidden_channels // 4,
                n_hidden_layers=args.n_hidden_layers),
        )
    else:
        model = acer.ACERSeparateModel(
            pi=links.Sequence(
                L.Linear(obs_space.low.size, args.n_hidden_channels), F.relu,
                L.Linear(args.n_hidden_channels,
                         action_space.n,
                         initialW=LeCunNormal(1e-3)), SoftmaxDistribution),
            q=links.Sequence(
                L.Linear(obs_space.low.size, args.n_hidden_channels), F.relu,
                L.Linear(args.n_hidden_channels,
                         action_space.n,
                         initialW=LeCunNormal(1e-3)), DiscreteActionValue),
        )

    opt = rmsprop_async.RMSpropAsync(lr=args.lr,
                                     eps=args.rmsprop_epsilon,
                                     alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))

    replay_buffer = EpisodicReplayBuffer(args.replay_capacity)
    agent = acer.ACER(model,
                      opt,
                      t_max=args.t_max,
                      gamma=0.99,
                      replay_buffer=replay_buffer,
                      n_times_replay=args.n_times_replay,
                      replay_start_size=args.replay_start_size,
                      disable_online_update=args.disable_online_update,
                      use_trust_region=True,
                      trust_region_delta=args.trust_region_delta,
                      truncation_threshold=args.truncation_threshold,
                      beta=args.beta)

    if args.load_agent:
        agent.load(args.load_agent)

    if (args.mode == 'train'):
        # Asynchronous training across args.processes worker processes.
        experiments.train_agent_async(agent=agent,
                                      outdir=args.outdir,
                                      processes=args.processes,
                                      make_env=make_env,
                                      profile=args.profile,
                                      steps=args.steps,
                                      step_offset=args.step_offset,
                                      checkpoint_freq=args.checkpoint_freq,
                                      log_type=args.log_type,
                                      eval_n_steps=None,
                                      eval_n_episodes=args.eval_n_runs,
                                      eval_interval=args.eval_interval,
                                      max_episode_len=timestep_limit)

    elif (args.mode == 'check'):
        # Roll out a few test episodes, collecting rendered frames, then
        # save them as an animation.
        from matplotlib import animation
        import matplotlib.pyplot as plt
        env = make_env(0, True)

        frames = []
        for i in range(3):
            obs = env.reset()
            done = False
            R = 0
            t = 0
            # Cap each episode at 200 steps.
            while not done and t < 200:
                frames.append(env.render(mode='rgb_array'))
                action = agent.act(obs)
                obs, r, done, _ = env.step(action)
                R += r
                t += 1
            print('test episode:', i, 'R:', R)
            agent.stop_episode()
        env.close()

        from IPython.display import HTML
        # Size the figure to the frame's pixel dimensions (72 dpi).
        plt.figure(figsize=(frames[0].shape[1] / 72.0,
                            frames[0].shape[0] / 72.0),
                   dpi=72)
        patch = plt.imshow(frames[0])
        plt.axis('off')

        def animate(i):
            patch.set_data(frames[i])

        anim = animation.FuncAnimation(plt.gcf(),
                                       animate,
                                       frames=len(frames),
                                       interval=50)
        anim.save(args.save_mp4)
        return anim