Code example #1
def parse_arch(arch, n_actions):
    if arch == 'nature':
        return links.Sequence(links.NatureDQNHead(n_input_channels=3),
                              L.Linear(512, n_actions), DiscreteActionValue)

    elif arch == 'doubledqn':

        class SingleSharedBias(chainer.Chain):
            """Single shared bias used in the Double DQN paper.
            You can add this link after a Linear layer with nobias=True to implement a
            Linear layer with a single shared bias parameter.
            See http://arxiv.org/abs/1509.06461.
            """
            def __init__(self):
                super().__init__()
                with self.init_scope():
                    self.bias = chainer.Parameter(0, shape=1)

            def __call__(self, x):
                return x + F.broadcast_to(self.bias, x.shape)

        return links.Sequence(links.NatureDQNHead(n_input_channels=3),
                              L.Linear(512, n_actions, nobias=True),
                              SingleSharedBias(), DiscreteActionValue)

    elif arch == 'nips':
        return links.Sequence(links.NIPSDQNHead(n_input_channels=3),
                              L.Linear(256, n_actions), DiscreteActionValue)

    elif arch == 'dueling':
        return DuelingDQN(n_actions, n_input_channels=3)
    else:
        raise RuntimeError('Not supported architecture: {}'.format(arch))
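These parse_arch helpers all follow the same pattern: links.Sequence chains a convolutional head, a linear output layer, and DiscreteActionValue into a callable Q-function. A minimal usage sketch, assuming the numpy import used throughout these snippets and a single 3-channel 84x84 observation to match n_input_channels=3 above:

q_func = parse_arch('nature', n_actions=4)
obs_batch = np.zeros((1, 3, 84, 84), dtype=np.float32)  # dummy batch of one observation
action_value = q_func(obs_batch)             # a DiscreteActionValue
best_actions = action_value.greedy_actions   # highest-valued action per observation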
Code example #2
    def _test_load_iqn(self, gpu):
        q_func = agents.iqn.ImplicitQuantileQFunction(
            psi=links.Sequence(
                L.Convolution2D(None, 32, 8, stride=4),
                F.relu,
                L.Convolution2D(None, 64, 4, stride=2),
                F.relu,
                L.Convolution2D(None, 64, 3, stride=1),
                F.relu,
                functools.partial(F.reshape, shape=(-1, 3136)),
            ),
            phi=links.Sequence(
                agents.iqn.CosineBasisLinear(64, 3136),
                F.relu,
            ),
            f=links.Sequence(
                L.Linear(None, 512),
                F.relu,
                L.Linear(None, 4),
            ),
        )

        opt = chainer.optimizers.Adam(5e-5, eps=1e-2)
        opt.setup(q_func)

        rbuf = replay_buffer.ReplayBuffer(100)

        explorer = explorers.LinearDecayEpsilonGreedy(
            start_epsilon=1.0,
            end_epsilon=0.1,
            decay_steps=10**6,
            random_action_func=lambda: np.random.randint(4))

        agent = agents.IQN(
            q_func,
            opt,
            rbuf,
            gpu=gpu,
            gamma=0.99,
            explorer=explorer,
            replay_start_size=50,
            target_update_interval=10**4,
            update_interval=4,
            batch_accumulator='mean',
            phi=lambda x: x,
            quantile_thresholds_N=64,
            quantile_thresholds_N_prime=64,
            quantile_thresholds_K=32,
        )

        model, exists = download_model("IQN",
                                       "BreakoutNoFrameskip-v4",
                                       model_type=self.pretrained_type)
        agent.load(model)
        if os.environ.get('CHAINERRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED'):
            assert exists
Code example #3
File: train_dqn_ale.py Project: uidilr/chainerrl
def parse_arch(arch, n_actions, activation):
    if arch == 'nature':
        return links.Sequence(links.NatureDQNHead(activation=activation),
                              L.Linear(512, n_actions), DiscreteActionValue)
    elif arch == 'nips':
        return links.Sequence(links.NIPSDQNHead(activation=activation),
                              L.Linear(256, n_actions), DiscreteActionValue)
    elif arch == 'dueling':
        return DuelingDQN(n_actions)
    else:
        raise RuntimeError('Not supported architecture: {}'.format(arch))
Code example #4
File: acer.py Project: leferrad/powrl3
    def __init__(self,
                 env,
                 feature_transformer,
                 gamma=0.99,
                 optimizer='adam',
                 max_memory=10000):
        BaseAgent.__init__(self,
                           env=env,
                           feature_transformer=feature_transformer,
                           gamma=gamma,
                           optimizer=optimizer)

        self.model = acer.ACERSharedModel(
            shared=links.Sequence(
                L.ConvolutionND(ndim=1,
                                in_channels=self.n_dims,
                                out_channels=100,
                                ksize=3,
                                stride=1,
                                pad=1,
                                cover_all=True), L.Linear(100, 100), F.relu),
            pi=links.Sequence(L.Linear(100, self.n_actions), F.relu,
                              SoftmaxDistribution),
            q=links.Sequence(L.Linear(100, self.n_actions), F.relu,
                             DiscreteActionValue))

        self.optimizer.setup(self.model)
        #self.optimizer.add_hook(chainer.optimizer.GradientClipping(40))

        self.replay_buffer = PrioritizedEpisodicReplayBuffer(
            capacity=max_memory,
            uniform_ratio=0.1,
            default_priority_func=exp_return_of_episode,
            wait_priority_after_sampling=False,
            return_sample_weights=False)

        self.agent = acer.ACER(model=self.model,
                               optimizer=self.optimizer,
                               gamma=self.gamma,
                               replay_buffer=self.replay_buffer,
                               phi=self.phi,
                               n_times_replay=2,
                               t_max=200,
                               replay_start_size=50,
                               disable_online_update=False,
                               use_trust_region=False,
                               use_Q_opc=True,
                               trust_region_delta=0.1,
                               truncation_threshold=None,
                               beta=1e-2)
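The wrapped acer.ACER instance follows ChainerRL's common agent interface, so a caller can drive it with the usual act_and_train loop. A rough sketch, assuming `agent` refers to the self.agent built above and `env` follows the gym reset/step API (both names are stand-ins, not part of the constructor):

obs = env.reset()
reward, done = 0.0, False
while not done:
    action = agent.act_and_train(obs, reward)
    obs, reward, done, _ = env.step(action)
agent.stop_episode_and_train(obs, reward, done)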
Code example #5
def parse_arch(arch, n_actions):
    if arch == 'nature':
        return links.Sequence(links.NatureDQNHead(), L.Linear(512, n_actions),
                              DiscreteActionValue)
    elif arch == 'doubledqn':
        return links.Sequence(links.NatureDQNHead(),
                              L.Linear(512, n_actions, nobias=True),
                              SingleSharedBias(), DiscreteActionValue)
    elif arch == 'nips':
        return links.Sequence(links.NIPSDQNHead(), L.Linear(256, n_actions),
                              DiscreteActionValue)
    elif arch == 'dueling':
        return DuelingDQN(n_actions)
    else:
        raise RuntimeError('Not supported architecture: {}'.format(arch))
Code example #6
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model',
                        type=str,
                        required=True,
                        help='Model directory path.')
    parser.add_argument('--out',
                        type=str,
                        required=True,
                        help='ONNX file output path.')
    parser.add_argument('--gpu', type=int, default=0, help='GPU id.')
    args = parser.parse_args()

    # Predefined parameters.
    n_actions = 4  # env.action_space.n
    replay_start_size = 5 * 10**4

    # Load the model.
    q_func = links.Sequence(links.NatureDQNHead(), L.Linear(512, n_actions),
                            DiscreteActionValue)
    opt = chainer.optimizers.RMSpropGraves(lr=2.5e-4,
                                           alpha=0.95,
                                           momentum=0.0,
                                           eps=1e-2)
    opt.setup(q_func)
    rbuf = replay_buffer.ReplayBuffer(10**6)
    explorer = explorers.LinearDecayEpsilonGreedy(
        start_epsilon=1.0,
        end_epsilon=0.1,
        decay_steps=10**6,
        random_action_func=lambda: np.random.randint(n_actions))

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    Agent = agents.DQN
    agent = Agent(q_func,
                  opt,
                  rbuf,
                  gpu=args.gpu,
                  gamma=0.99,
                  explorer=explorer,
                  replay_start_size=replay_start_size,
                  target_update_interval=10**4,
                  clip_delta=True,
                  update_interval=4,
                  batch_accumulator='sum',
                  phi=phi)
    agent.load(args.model)

    # Extract the core links from the model and export them in ONNX format.
    onnx_compat_model = convert_to_compatible_model(agent)
    x = cp.array(np.zeros((1, 4, 84, 84), dtype=np.float32))
    onnx_chainer.export(onnx_compat_model,
                        x,
                        input_names='input',
                        output_names='action',
                        return_named_inout=True,
                        filename=args.out)
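Once exported, the ONNX file can be exercised without Chainer. A minimal sketch using onnxruntime, assuming the exported graph keeps the 'input'/'action' names passed to onnx_chainer.export above:

import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession('model.onnx')  # the path given via --out
x = np.zeros((1, 4, 84, 84), dtype=np.float32)
(q_values,) = sess.run(['action'], {'input': x})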
Code example #7
    def __init__(self, env, feature_transformer, gamma=0.99, optimizer='adam', max_memory=10000):
        BaseAgent.__init__(self, env=env, feature_transformer=feature_transformer, gamma=gamma,
                           optimizer=optimizer)

        self.model = links.Sequence(L.ConvolutionND(ndim=1, in_channels=self.n_dims,
                                                    out_channels=100, ksize=3,
                                                    stride=1, pad=1, cover_all=True),
                                    FCStateQFunctionWithDiscreteAction(ndim_obs=100, n_actions=self.n_actions,
                                                                       n_hidden_channels=100, n_hidden_layers=2)
                                    )

        self.optimizer.setup(self.model)
        #self.optimizer.add_hook(chainer.optimizer.GradientClipping(40))

        self.replay_buffer = \
            chainerrl.replay_buffer.PrioritizedEpisodicReplayBuffer(
                capacity=max_memory,
                uniform_ratio=0.1,
                default_priority_func=exp_return_of_episode,
                wait_priority_after_sampling=False,
                return_sample_weights=False)

        self.agent = DoubleDQN(q_function=self.model, optimizer=self.optimizer, replay_buffer=self.replay_buffer,
                               explorer=self.explorer, gamma=self.gamma, phi=phi,
                               update_interval=500, minibatch_size=50)
Code example #8
def create_acer_agent(env):
    # Dimension of the malware observation space
    obs_dim = env.observation_space.shape[0]
    # Number of actions that can be performed on the malware
    n_actions = env.action_space.n
    # The ACER network consists of pi (the policy) and q (the Q-function)
    model = acer.ACERSeparateModel(
        pi=links.Sequence(L.Linear(obs_dim, 1024,
                                   initialW=LeCunNormal(1e-3)), F.relu,
                          L.Linear(1024, 512,
                                   initialW=LeCunNormal(1e-3)), F.relu,
                          L.Linear(512, n_actions, initialW=LeCunNormal(1e-3)),
                          SoftmaxDistribution),
        q=links.Sequence(L.Linear(obs_dim, 1024, initialW=LeCunNormal(1e-3)),
                         F.relu, L.Linear(1024,
                                          512,
                                          initialW=LeCunNormal(1e-3)), F.relu,
                         L.Linear(512, n_actions, initialW=LeCunNormal(1e-3)),
                         DiscreteActionValue),
    )
    # Optimizer for the ACER model
    opt = rmsprop_async.RMSpropAsync(lr=7e-4, eps=1e-2, alpha=0.99)
    opt.setup(model)
    # Clip gradients to stabilize training
    opt.add_hook(chainer.optimizer.GradientClipping(40))

    replay_buffer = EpisodicReplayBuffer(128)
    # The ACER agent itself; parameters follow the original file
    agent = acer.ACER(
        model,
        opt,
        gamma=0.95,  # reward discount factor
        t_max=32,  # update the model after this many local steps
        replay_buffer=replay_buffer,
        n_times_replay=4,  # number of times experience replay is repeated for each update
        replay_start_size=64,  # don't start replay unless we have this many experiences in the buffer
        disable_online_update=True,  # rely only on experience buffer
        use_trust_region=True,  # enable trust region policy optimization
        trust_region_delta=0.1,  # a parameter for TRPO
        truncation_threshold=5.0,  # truncate large importance weights
        beta=1e-2,  # entropy regularization parameter
        phi=lambda obs: obs.astype(np.float32, copy=False))

    return agent
Code example #9
    def __init__(
        self,
        n_input_channels,
        action_size,
        n_hidden_layers=0,
        n_hidden_channels=None,
        min_action=None,
        max_action=None,
        bound_mean=False,
        var_type='spherical',
        nonlinearity=F.relu,
        mean_wscale=1,
        var_func=F.softplus,
        var_param_init=0,
    ):

        self.n_input_channels = n_input_channels
        self.action_size = action_size
        self.n_hidden_layers = n_hidden_layers
        self.n_hidden_channels = n_hidden_channels
        self.min_action = min_action
        self.max_action = max_action
        self.bound_mean = bound_mean
        self.nonlinearity = nonlinearity
        self.var_func = var_func
        var_size = {'spherical': 1, 'diagonal': action_size}[var_type]

        layers = []
        layers.append(L.Linear(n_input_channels, n_hidden_channels))
        for _ in range(n_hidden_layers - 1):
            layers.append(self.nonlinearity)
            layers.append(L.Linear(n_hidden_channels, n_hidden_channels))
        layers.append(self.nonlinearity)
        # The last layer is used to compute the mean
        layers.append(
            L.Linear(n_hidden_channels,
                     action_size,
                     initialW=LeCunNormal(mean_wscale)))

        if self.bound_mean:
            layers.append(
                lambda x: bound_by_tanh(x, self.min_action, self.max_action))

        super().__init__()
        with self.init_scope():
            self.hidden_layers = links.Sequence(*layers)
            self.var_param = chainer.Parameter(initializer=var_param_init,
                                               shape=(var_size, ))
Code example #10
File: ppo.py Project: leferrad/powrl3
    def __init__(self, n_dims, n_actions):
        self.head = links.Sequence(
            L.ConvolutionND(ndim=1,
                            in_channels=n_dims,
                            out_channels=100,
                            ksize=3,
                            stride=1,
                            pad=1,
                            cover_all=True), F.relu)
        self.pi = policies.FCSoftmaxPolicy(n_input_channels=100,
                                           n_actions=n_actions,
                                           n_hidden_layers=2,
                                           n_hidden_channels=100)
        self.v = v_functions.FCVFunction(n_input_channels=100,
                                         n_hidden_layers=2,
                                         n_hidden_channels=100)

        super(A3CFF, self).__init__(self.head, self.pi, self.v)
Code example #11
    def __init__(self, modelpath, n_actions=4, n_stack_frames=4):
        # Predefined parameters.
        replay_start_size = 5 * 10**4

        # Load the model.
        q_func = links.Sequence(links.NatureDQNHead(),
                                L.Linear(512, n_actions), DiscreteActionValue)
        opt = chainer.optimizers.RMSpropGraves(lr=2.5e-4,
                                               alpha=0.95,
                                               momentum=0.0,
                                               eps=1e-2)
        opt.setup(q_func)
        rbuf = replay_buffer.ReplayBuffer(10**6)
        explorer = explorers.LinearDecayEpsilonGreedy(
            start_epsilon=1.0,
            end_epsilon=0.1,
            decay_steps=10**6,
            random_action_func=lambda: np.random.randint(n_actions))

        def phi(x):
            # Feature extractor
            return np.asarray(x, dtype=np.float32) / 255

        Agent = agents.DQN
        self._agent = Agent(q_func,
                            opt,
                            rbuf,
                            gpu=-1,
                            gamma=0.99,
                            explorer=explorer,
                            replay_start_size=replay_start_size,
                            target_update_interval=10**4,
                            clip_delta=True,
                            update_interval=4,
                            batch_accumulator='sum',
                            phi=phi)
        self._agent.load(modelpath)

        self._state = deque([], maxlen=n_stack_frames)
        self._action = 0
Code example #12
    def _test_load_dqn(self, gpu):
        q_func = links.Sequence(links.NatureDQNHead(), L.Linear(512, 4),
                                DiscreteActionValue)

        opt = optimizers.RMSpropGraves(lr=2.5e-4,
                                       alpha=0.95,
                                       momentum=0.0,
                                       eps=1e-2)
        opt.setup(q_func)

        rbuf = replay_buffer.ReplayBuffer(100)

        explorer = explorers.LinearDecayEpsilonGreedy(
            start_epsilon=1.0,
            end_epsilon=0.1,
            decay_steps=10**6,
            random_action_func=lambda: np.random.randint(4))

        agent = agents.DQN(q_func,
                           opt,
                           rbuf,
                           gpu=gpu,
                           gamma=0.99,
                           explorer=explorer,
                           replay_start_size=50,
                           target_update_interval=10**4,
                           clip_delta=True,
                           update_interval=4,
                           batch_accumulator='sum',
                           phi=lambda x: x)

        model, exists = download_model("DQN",
                                       "BreakoutNoFrameskip-v4",
                                       model_type=self.pretrained_type)
        agent.load(model)
        if os.environ.get('CHAINERRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED'):
            assert exists
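After agent.load(model), the pretrained weights are in place and the agent can act greedily on preprocessed frames. A minimal sketch; the observation here is a placeholder, and since phi is the identity above, real inputs must already be float32 arrays of shape (4, 84, 84):

obs = np.zeros((4, 84, 84), dtype=np.float32)  # placeholder observation
action = agent.act(obs)  # greedy action from the loaded Q-function
agent.stop_episode()     # reset per-episode state before the next episode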
Code example #13
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env',
                        type=str,
                        default='BreakoutNoFrameskip-v4',
                        help='OpenAI Atari domain to perform algorithm on.')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--logging-level',
                        type=int,
                        default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render',
                        action='store_true',
                        default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor',
                        action='store_true',
                        default=False,
                        help='Monitor env. Videos and additional information'
                        ' are saved as output files.')
    parser.add_argument('--steps',
                        type=int,
                        default=5 * 10**7,
                        help='Total number of timesteps to train the agent.')
    parser.add_argument('--replay-start-size',
                        type=int,
                        default=5 * 10**4,
                        help='Minimum replay buffer size before ' +
                        'performing gradient updates.')
    parser.add_argument('--eval-n-steps', type=int, default=125000)
    parser.add_argument('--eval-interval', type=int, default=250000)
    parser.add_argument('--n-best-episodes', type=int, default=30)
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(
            args.env, max_frames=None),
                                           episode_life=not test,
                                           clip_rewards=not test)
        env.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = chainerrl.wrappers.RandomizeAction(env, 0.05)
        if args.monitor:
            env = chainerrl.wrappers.Monitor(
                env, args.outdir, mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)

    n_actions = env.action_space.n
    q_func = links.Sequence(links.NatureDQNHead(), L.Linear(512, n_actions),
                            DiscreteActionValue)

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros((4, 84, 84), dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    # Use the same hyperparameters as the Nature paper
    opt = optimizers.RMSpropGraves(lr=2.5e-4,
                                   alpha=0.95,
                                   momentum=0.0,
                                   eps=1e-2)

    opt.setup(q_func)

    rbuf = replay_buffer.ReplayBuffer(10**6)

    explorer = explorers.LinearDecayEpsilonGreedy(
        start_epsilon=1.0,
        end_epsilon=0.1,
        decay_steps=10**6,
        random_action_func=lambda: np.random.randint(n_actions))

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    Agent = agents.DQN
    agent = Agent(q_func,
                  opt,
                  rbuf,
                  gpu=args.gpu,
                  gamma=0.99,
                  explorer=explorer,
                  replay_start_size=args.replay_start_size,
                  target_update_interval=10**4,
                  clip_delta=True,
                  update_interval=4,
                  batch_accumulator='sum',
                  phi=phi)

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(env=eval_env,
                                                  agent=agent,
                                                  n_steps=args.eval_n_steps,
                                                  n_episodes=None)
        print('n_episodes: {} mean: {} median: {} stdev {}'.format(
            eval_stats['episodes'], eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_steps=args.eval_n_steps,
            eval_n_episodes=None,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=True,
            eval_env=eval_env,
        )

        dir_of_best_network = os.path.join(args.outdir, "best")
        agent.load(dir_of_best_network)

        # run 30 evaluation episodes, each capped at 5 mins of play
        stats = experiments.evaluator.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.n_best_episodes,
            max_episode_len=4500,
            logger=None)
        with open(os.path.join(args.outdir, 'bestscores.json'), 'w') as f:
            json.dump(stats, f)
        print("The results of the best scoring network:")
        for stat in stats:
            print(str(stat) + ":" + str(stats[stat]))
Code example #14
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env',
                        type=str,
                        default='BreakoutNoFrameskip-v4',
                        help='OpenAI Atari domain to perform algorithm on.')
    parser.add_argument('--out_dir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--final-exploration-frames',
                        type=int,
                        default=10**5,
                        help='Timesteps after which we stop ' +
                        'annealing exploration rate')
    parser.add_argument('--final-epsilon',
                        type=float,
                        default=0.1,
                        help='Final value of epsilon during training.')
    parser.add_argument('--eval-epsilon',
                        type=float,
                        default=0.05,
                        help='Exploration epsilon used during eval episodes.')
    parser.add_argument('--arch',
                        type=str,
                        default='doubledqn',
                        choices=['nature', 'nips', 'dueling', 'doubledqn'],
                        help='Network architecture to use.')
    parser.add_argument('--steps',
                        type=int,
                        default=10**6,
                        help='Total number of timesteps to train the agent.')
    parser.add_argument(
        '--max-episode-len',
        type=int,
        default=30 * 60 * 60 // 4,  # 30 minutes with 60/4 fps
        help='Maximum number of timesteps for each episode.')
    parser.add_argument('--replay-start-size',
                        type=int,
                        default=1000,
                        help='Minimum replay buffer size before ' +
                        'performing gradient updates.')
    parser.add_argument('--target-update-interval',
                        type=int,
                        default=1 * 10**4,
                        help='Frequency (in timesteps) at which ' +
                        'the target network is updated.')
    parser.add_argument('--eval-interval',
                        type=int,
                        default=10**5,
                        help='Frequency (in timesteps) of evaluation phase.')
    parser.add_argument('--update-interval',
                        type=int,
                        default=4,
                        help='Frequency (in timesteps) of network updates.')
    parser.add_argument('--eval-n-runs', type=int, default=100)
    parser.add_argument('--logging-level',
                        type=int,
                        default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render',
                        action='store_true',
                        default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--lr',
                        type=float,
                        default=2.5e-4,
                        help='Learning rate.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)

    experiments.set_log_base_dir(args.out_dir)
    print('Output files are saved in {}'.format(args.out_dir))

    def make_env(render=False, env_seed=0):
        join_tokens = marlo.make("MarLo-FindTheGoal-v0",
                                 params=dict(
                                     allowContinuousMovement=["move", "turn"],
                                     videoResolution=[84, 84],
                                     kill_clients_after_num_rounds=500))
        env = marlo.init(join_tokens[0])

        obs = env.reset()
        if render:
            env.render(mode="rgb_array")
        action = env.action_space.sample()
        obs, r, done, info = env.step(action)
        env.seed(int(env_seed))
        return env

    env = make_env(render=args.render, env_seed=args.seed)

    n_actions = env.action_space.n
    q_func = links.Sequence(links.NatureDQNHead(n_input_channels=3),
                            L.Linear(512, n_actions), DiscreteActionValue)

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros((3, 84, 84), dtype=np.float32)[None])],
        os.path.join(args.out_dir, 'model'))

    # Use the same hyper parameters as the Nature paper's
    opt = optimizers.RMSpropGraves(lr=args.lr,
                                   alpha=0.95,
                                   momentum=0.0,
                                   eps=1e-2)

    opt.setup(q_func)

    rbuf = replay_buffer.ReplayBuffer(10**6)

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, args.final_epsilon, args.final_exploration_frames,
        lambda: np.random.randint(n_actions))

    def phi(x):
        # Feature extractor
        x = x.transpose(2, 0, 1)
        return np.asarray(x, dtype=np.float32) / 255

    agent = agents.DQN(q_func,
                       opt,
                       rbuf,
                       gpu=args.gpu,
                       gamma=0.99,
                       explorer=explorer,
                       replay_start_size=args.replay_start_size,
                       target_update_interval=args.target_update_interval,
                       update_interval=args.update_interval,
                       batch_accumulator='sum',
                       phi=phi)

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(env=env,
                                                  agent=agent,
                                                  n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.out_dir,
            save_best_so_far_agent=False,
            max_episode_len=args.max_episode_len,
            eval_env=env,
        )
Code example #15
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('--env', type=str, default='CartPole-v0')
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--outdir', type=str, default=None)
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--n-times-replay', type=int, default=8)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--eval-frequency', type=int, default=10**5)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--rmsprop-epsilon', type=float, default=1e-1)
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')
    args = parser.parse_args()

    logging.getLogger().setLevel(args.logger_level)

    if args.seed is not None:
        misc.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        env = gym.make(args.env)
        if args.monitor and process_idx == 0:
            env = gym.wrappers.Monitor(env, args.outdir)
        # Scale rewards observed by agents
        if not test:
            misc.env_modifiers.make_reward_filtered(
                env, lambda x: x * args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            misc.env_modifiers.make_rendered(env)
        return env

    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    n_hidden_channels = 200

    model = acer.ACERSeparateModel(
        pi=links.Sequence(
            L.Linear(obs_space.low.size, n_hidden_channels), F.relu,
            L.Linear(n_hidden_channels, action_space.n, wscale=1e-3),
            SoftmaxDistribution),
        q=links.Sequence(
            L.Linear(obs_space.low.size, n_hidden_channels), F.relu,
            L.Linear(n_hidden_channels, action_space.n, wscale=1e-3),
            DiscreteActionValue),
    )

    opt = rmsprop_async.RMSpropAsync(lr=args.lr,
                                     eps=args.rmsprop_epsilon,
                                     alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    replay_buffer = EpisodicReplayBuffer(10**5 // args.processes)
    agent = acer.DiscreteACER(model,
                              opt,
                              t_max=args.t_max,
                              gamma=0.99,
                              replay_buffer=replay_buffer,
                              n_times_replay=args.n_times_replay,
                              beta=args.beta,
                              phi=phi)
    if args.load:
        agent.load(args.load)

    if args.demo:
        env = make_env(0, True)
        mean, median, stdev = experiments.eval_performance(
            env=env,
            agent=agent,
            n_runs=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, mean, median, stdev))
    else:
        experiments.train_agent_async(agent=agent,
                                      outdir=args.outdir,
                                      processes=args.processes,
                                      make_env=make_env,
                                      profile=args.profile,
                                      steps=args.steps,
                                      eval_n_runs=args.eval_n_runs,
                                      eval_frequency=args.eval_frequency,
                                      max_episode_len=timestep_limit)
Code example #16
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('--env', type=str, default='CartPole-v0')
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--outdir', type=str, default=None)
    parser.add_argument('--t-max', type=int, default=50)
    parser.add_argument('--n-times-replay', type=int, default=4)
    parser.add_argument('--n-hidden-channels', type=int, default=100)
    parser.add_argument('--n-hidden-layers', type=int, default=2)
    parser.add_argument('--replay-capacity', type=int, default=5000)
    parser.add_argument('--replay-start-size', type=int, default=10**3)
    parser.add_argument('--disable-online-update', action='store_true')
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--rmsprop-epsilon', type=float, default=1e-2)
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--truncation-threshold', type=float, default=5)
    parser.add_argument('--trust-region-delta', type=float, default=0.1)
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    if args.seed is not None:
        misc.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        env = gym.make(args.env)
        if args.monitor and process_idx == 0:
            env = gym.wrappers.Monitor(env, args.outdir)
        # Scale rewards observed by agents
        if not test:
            misc.env_modifiers.make_reward_filtered(
                env, lambda x: x * args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            misc.env_modifiers.make_rendered(env)
        return env

    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    if isinstance(action_space, spaces.Box):
        model = acer.ACERSDNSeparateModel(
            pi=policies.FCGaussianPolicy(
                obs_space.low.size,
                action_space.low.size,
                n_hidden_channels=args.n_hidden_channels,
                n_hidden_layers=args.n_hidden_layers,
                bound_mean=True,
                min_action=action_space.low,
                max_action=action_space.high),
            v=v_functions.FCVFunction(obs_space.low.size,
                                      n_hidden_channels=args.n_hidden_channels,
                                      n_hidden_layers=args.n_hidden_layers),
            adv=q_functions.FCSAQFunction(
                obs_space.low.size,
                action_space.low.size,
                n_hidden_channels=args.n_hidden_channels // 4,
                n_hidden_layers=args.n_hidden_layers),
        )
    else:
        model = acer.ACERSeparateModel(
            pi=links.Sequence(
                L.Linear(obs_space.low.size, args.n_hidden_channels), F.relu,
                L.Linear(args.n_hidden_channels,
                         action_space.n,
                         initialW=LeCunNormal(1e-3)), SoftmaxDistribution),
            q=links.Sequence(
                L.Linear(obs_space.low.size, args.n_hidden_channels), F.relu,
                L.Linear(args.n_hidden_channels,
                         action_space.n,
                         initialW=LeCunNormal(1e-3)), DiscreteActionValue),
        )

    opt = rmsprop_async.RMSpropAsync(lr=args.lr,
                                     eps=args.rmsprop_epsilon,
                                     alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))

    replay_buffer = EpisodicReplayBuffer(args.replay_capacity)
    agent = acer.ACER(model,
                      opt,
                      t_max=args.t_max,
                      gamma=0.99,
                      replay_buffer=replay_buffer,
                      n_times_replay=args.n_times_replay,
                      replay_start_size=args.replay_start_size,
                      disable_online_update=args.disable_online_update,
                      use_trust_region=True,
                      trust_region_delta=args.trust_region_delta,
                      truncation_threshold=args.truncation_threshold,
                      beta=args.beta,
                      phi=phi)
    if args.load:
        agent.load(args.load)

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_runs=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_async(agent=agent,
                                      outdir=args.outdir,
                                      processes=args.processes,
                                      make_env=make_env,
                                      profile=args.profile,
                                      steps=args.steps,
                                      eval_n_runs=args.eval_n_runs,
                                      eval_interval=args.eval_interval,
                                      max_episode_len=timestep_limit)
Code example #17
def main(args):
    import logging
    logging.basicConfig(level=logging.INFO, filename='log')

    if (type(args) is list):
        args = make_args(args)
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**31

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(
            args.env, max_frames=args.max_frames),
                                           episode_life=not test,
                                           clip_rewards=not test)
        env.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = chainerrl.wrappers.RandomizeAction(env, 0.05)
        if args.monitor:
            env = chainerrl.wrappers.Monitor(
                env, args.outdir, mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    sample_env = make_env(0, test=False)
    action_space = sample_env.action_space
    assert isinstance(action_space, spaces.Discrete)

    # Define a model and its optimizer
    q_func = links.Sequence(links.NIPSDQNHead(), L.Linear(256, action_space.n),
                            DiscreteActionValue)
    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=1e-1, alpha=0.99)
    opt.setup(q_func)

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    # Make process-specific agents to diversify exploration
    def make_agent(process_idx):
        # Random epsilon assignment described in the original paper
        rand = random.random()
        if rand < 0.4:
            epsilon_target = 0.1
        elif rand < 0.7:
            epsilon_target = 0.01
        else:
            epsilon_target = 0.5
        explorer = explorers.LinearDecayEpsilonGreedy(
            1, epsilon_target, args.final_exploration_frames,
            action_space.sample)
        # Suppress the explorer logger
        explorer.logger.setLevel(logging.INFO)
        return nsq.NSQ(q_func,
                       opt,
                       t_max=5,
                       gamma=0.99,
                       i_target=40000,
                       explorer=explorer,
                       phi=phi)

    if args.demo:
        env = make_env(0, True)
        agent = make_agent(0)
        eval_stats = experiments.eval_performance(env=env,
                                                  agent=agent,
                                                  n_steps=None,
                                                  n_episodes=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            make_agent=make_agent,
            profile=args.profile,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            global_step_hooks=[lr_decay_hook],
            save_best_so_far_agent=False,
        )
Code example #18
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env',
                        type=str,
                        default='BreakoutNoFrameskip-v4',
                        help='OpenAI Atari domain to perform algorithm on.')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--load', type=str, default=None, required=True)
    parser.add_argument('--logging-level',
                        type=int,
                        default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render',
                        action='store_true',
                        default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor',
                        action='store_true',
                        default=False,
                        help='Monitor env. Videos and additional information'
                        ' are saved as output files.')
    parser.add_argument('--steps',
                        type=int,
                        default=5 * 10**7,
                        help='Total number of demo timesteps to collect')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env():
        env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(
            args.env, max_frames=None),
                                           episode_life=False,
                                           clip_rewards=False)
        env.seed(int(args.seed))
        # Randomize actions like epsilon-greedy
        env = chainerrl.wrappers.RandomizeAction(env, 0.01)
        if args.monitor:
            env = chainerrl.wrappers.Monitor(env,
                                             args.outdir,
                                             mode='evaluation')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env()

    n_actions = env.action_space.n
    q_func = links.Sequence(links.NatureDQNHead(), L.Linear(512, n_actions),
                            DiscreteActionValue)

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros((4, 84, 84), dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    # The optimizer and replay buffer are dummy objects required by the agent
    opt = optimizers.RMSpropGraves()
    opt.setup(q_func)
    rbuf = replay_buffer.ReplayBuffer(1)

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    Agent = agents.DQN
    agent = Agent(q_func,
                  opt,
                  rbuf,
                  gpu=args.gpu,
                  gamma=0.99,
                  explorer=None,
                  replay_start_size=1,
                  minibatch_size=1,
                  target_update_interval=None,
                  clip_delta=True,
                  update_interval=4,
                  phi=phi)

    agent.load(args.load)

    # saves demos to outdir/demos.pickle
    experiments.collect_demonstrations(agent=agent,
                                       env=env,
                                       steps=args.steps,
                                       episodes=None,
                                       outdir=args.outdir,
                                       max_episode_len=None)
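experiments.collect_demonstrations writes the gathered demonstrations to outdir/demos.pickle, as noted in the comment above. A rough sketch for inspecting that file afterwards; it only assumes the output is an ordinary pickle, not any particular record layout:

import os
import pickle

with open(os.path.join(args.outdir, 'demos.pickle'), 'rb') as f:
    demos = pickle.load(f)
print(type(demos))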
Code example #19
File: train_dqn.py Project: zhjmcjk/chainerrl
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env',
                        type=str,
                        default='BreakoutNoFrameskip-v4',
                        help='OpenAI Atari domain to perform algorithm on.')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--final-exploration-frames',
                        type=int,
                        default=10**6,
                        help='Timesteps after which we stop ' +
                        'annealing exploration rate')
    parser.add_argument('--final-epsilon',
                        type=float,
                        default=0.1,
                        help='Final value of epsilon during training.')
    parser.add_argument('--eval-epsilon',
                        type=float,
                        default=0.05,
                        help='Exploration epsilon used during eval episodes.')
    parser.add_argument('--noisy-net-sigma', type=float, default=None)
    parser.add_argument('--arch',
                        type=str,
                        default='doubledqn',
                        choices=['nature', 'nips', 'dueling', 'doubledqn'],
                        help='Network architecture to use.')
    parser.add_argument('--steps',
                        type=int,
                        default=5 * 10**7,
                        help='Total number of timesteps to train the agent.')
    parser.add_argument(
        '--max-frames',
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help='Maximum number of frames for each episode.')
    parser.add_argument('--replay-start-size',
                        type=int,
                        default=5 * 10**4,
                        help='Minimum replay buffer size before ' +
                        'performing gradient updates.')
    parser.add_argument('--target-update-interval',
                        type=int,
                        default=1 * 10**4,
                        help='Frequency (in timesteps) at which ' +
                        'the target network is updated.')
    parser.add_argument('--eval-interval',
                        type=int,
                        default=10**5,
                        help='Frequency (in timesteps) of evaluation phase.')
    parser.add_argument('--update-interval',
                        type=int,
                        default=4,
                        help='Frequency (in timesteps) of network updates.')
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--no-clip-delta',
                        dest='clip_delta',
                        action='store_false')
    parser.set_defaults(clip_delta=True)

    parser.add_argument('--logging-level',
                        type=int,
                        default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render',
                        action='store_true',
                        default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor',
                        action='store_true',
                        default=False,
                        help='Monitor env. Videos and additional information'
                        ' are saved as output files.')
    parser.add_argument('--lr',
                        type=float,
                        default=2.5e-4,
                        help='Learning rate.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(
            args.env, max_frames=args.max_frames),
                                           episode_life=not test,
                                           clip_rewards=not test)
        env.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = chainerrl.wrappers.RandomizeAction(env, args.eval_epsilon)
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir, mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)

    n_actions = env.action_space.n
    q_func = links.Sequence(links.NatureDQNHead(), L.Linear(512, n_actions),
                            DiscreteActionValue)
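    # Illustrative sketch (not part of the original script): the q_func above
    # maps a float32 observation batch to a DiscreteActionValue, e.g.
    #
    #     sample_obs = np.zeros((4, 84, 84), dtype=np.float32)[None]
    #     action_value = q_func(sample_obs)
    #     action_value.q_values.array        # shape (1, n_actions)
    #     action_value.greedy_actions.array  # shape (1,)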

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, args.final_epsilon, args.final_exploration_frames,
        lambda: np.random.randint(n_actions))

    if args.noisy_net_sigma is not None:
        links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
        # Turn off the explorer: NoisyNet layers provide exploration instead
        explorer = explorers.Greedy()

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros((4, 84, 84), dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    # Use the same hyperparameters as in the Nature paper
    opt = optimizers.RMSpropGraves(lr=args.lr,
                                   alpha=0.95,
                                   momentum=0.0,
                                   eps=1e-2)

    opt.setup(q_func)

    rbuf = replay_buffer.ReplayBuffer(10**6)

    def phi(x):
        # Feature extractor
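        # Scale uint8 pixel values in [0, 255] to float32 values in [0, 1]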
        return np.asarray(x, dtype=np.float32) / 255

    Agent = agents.DQN
    agent = Agent(q_func,
                  opt,
                  rbuf,
                  gpu=args.gpu,
                  gamma=0.99,
                  explorer=explorer,
                  replay_start_size=args.replay_start_size,
                  target_update_interval=args.target_update_interval,
                  clip_delta=args.clip_delta,
                  update_interval=args.update_interval,
                  batch_accumulator='sum',
                  phi=phi)

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(env=eval_env,
                                                  agent=agent,
                                                  n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=False,
            eval_env=eval_env,
        )
コード例 #20
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--out_dir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu', type=int, default=0,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--final-exploration-frames',
                        type=int, default=10 ** 5,
                        help='Timesteps after which we stop ' +
                        'annealing the exploration rate.')
    parser.add_argument('--final-epsilon', type=float, default=0.1,
                        help='Final value of epsilon during training.')
    parser.add_argument('--eval-epsilon', type=float, default=0.05,
                        help='Exploration epsilon used during eval episodes.')
    parser.add_argument('--steps', type=int, default=10 ** 6,
                        help='Total number of timesteps to train the agent.')
    parser.add_argument('--max-episode-len', type=int,
                        default=30 * 60 * 60 // 4,  # 30 minutes with 60/4 fps
                        help='Maximum number of timesteps for each episode.')
    parser.add_argument('--replay-start-size', type=int, default=1000,
                        help='Minimum replay buffer size before ' +
                        'performing gradient updates.')
    parser.add_argument('--target-update-interval',
                        type=int, default=1 * 10 ** 4,
                        help='Frequency (in timesteps) at which ' +
                        'the target network is updated.')
    parser.add_argument('--eval-interval', type=int, default=10 ** 5,
                        help='Frequency (in timesteps) of evaluation phase.')
    parser.add_argument('--update-interval', type=int, default=4,
                        help='Frequency (in timesteps) of network updates.')
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--logging-level', type=int, default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--lr', type=float, default=2.5e-4,
                        help='Learning rate.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)

    experiments.set_log_base_dir(args.out_dir)
    print('Output files are saved in {}'.format(args.out_dir))

    env = make_env(env_seed=args.seed)

    n_actions = env.action_space.n
    
    q_func = links.Sequence(
        links.NatureDQNHead(n_input_channels=3),
        L.Linear(512, n_actions),
        DiscreteActionValue
    )

    # Use the same hyperparameters as in the Nature paper
    opt = optimizers.RMSpropGraves(
        lr=args.lr, alpha=0.95, momentum=0.0, eps=1e-2)

    opt.setup(q_func)

    rbuf = replay_buffer.ReplayBuffer(10 ** 6)

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, args.final_epsilon,
        args.final_exploration_frames,
        lambda: np.random.randint(n_actions))

    def phi(x):
        # Feature extractor
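        # Convert HWC observations to CHW and scale pixel values to [0, 1]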
        x = x.transpose(2, 0, 1)
        return np.asarray(x, dtype=np.float32) / 255

    agent = agents.DQN(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=0.99,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        batch_accumulator='sum',
        phi=phi
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.out_dir,
            save_best_so_far_agent=False,
            max_episode_len=args.max_episode_len,
            eval_env=env,
        )
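

# Illustrative sketch (not part of the original script): roughly how a single
# evaluation episode is rolled out by experiments.eval_performance, assuming
# the gym-style env and the non-recurrent agent built above.
def _rollout_one_episode(env, agent):
    obs = env.reset()
    done = False
    episode_reward = 0.0
    while not done:
        # The agent acts greedily (no training updates) during evaluation
        action = agent.act(obs)
        obs, reward, done, _ = env.step(action)
        episode_reward += reward
    # Clear any per-episode internal state of the agent
    agent.stop_episode()
    return episode_reward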
コード例 #21
0
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('--env', type=str, default='CartPole-v0')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--t-max', type=int, default=50)
    parser.add_argument('--n-times-replay', type=int, default=4)
    parser.add_argument('--n-hidden-channels', type=int, default=100)
    parser.add_argument('--n-hidden-layers', type=int, default=2)
    parser.add_argument('--replay-capacity', type=int, default=5000)
    parser.add_argument('--replay-start-size', type=int, default=10**3)
    parser.add_argument('--disable-online-update', action='store_true')
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--rmsprop-epsilon', type=float, default=1e-2)
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--truncation-threshold', type=float, default=5)
    parser.add_argument('--trust-region-delta', type=float, default=0.1)
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor and process_idx == 0:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            env = chainerrl.wrappers.Render(env)
        return env

    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

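    # A continuous (Box) action space uses ACER with a stochastic dueling
    # network (SDN); a discrete action space uses a separate softmax policy
    # and Q-function.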
    if isinstance(action_space, spaces.Box):
        model = acer.ACERSDNSeparateModel(
            pi=policies.FCGaussianPolicy(
                obs_space.low.size,
                action_space.low.size,
                n_hidden_channels=args.n_hidden_channels,
                n_hidden_layers=args.n_hidden_layers,
                bound_mean=True,
                min_action=action_space.low,
                max_action=action_space.high),
            v=v_functions.FCVFunction(obs_space.low.size,
                                      n_hidden_channels=args.n_hidden_channels,
                                      n_hidden_layers=args.n_hidden_layers),
            adv=q_functions.FCSAQFunction(
                obs_space.low.size,
                action_space.low.size,
                n_hidden_channels=args.n_hidden_channels // 4,
                n_hidden_layers=args.n_hidden_layers),
        )
    else:
        model = acer.ACERSeparateModel(
            pi=links.Sequence(
                L.Linear(obs_space.low.size, args.n_hidden_channels), F.relu,
                L.Linear(args.n_hidden_channels,
                         action_space.n,
                         initialW=LeCunNormal(1e-3)), SoftmaxDistribution),
            q=links.Sequence(
                L.Linear(obs_space.low.size, args.n_hidden_channels), F.relu,
                L.Linear(args.n_hidden_channels,
                         action_space.n,
                         initialW=LeCunNormal(1e-3)), DiscreteActionValue),
        )

    opt = rmsprop_async.RMSpropAsync(lr=args.lr,
                                     eps=args.rmsprop_epsilon,
                                     alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))

    replay_buffer = EpisodicReplayBuffer(args.replay_capacity)
    agent = acer.ACER(model,
                      opt,
                      t_max=args.t_max,
                      gamma=0.99,
                      replay_buffer=replay_buffer,
                      n_times_replay=args.n_times_replay,
                      replay_start_size=args.replay_start_size,
                      disable_online_update=args.disable_online_update,
                      use_trust_region=True,
                      trust_region_delta=args.trust_region_delta,
                      truncation_threshold=args.truncation_threshold,
                      beta=args.beta)
    if args.load:
        agent.load(args.load)

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_async(agent=agent,
                                      outdir=args.outdir,
                                      processes=args.processes,
                                      make_env=make_env,
                                      profile=args.profile,
                                      steps=args.steps,
                                      eval_n_steps=None,
                                      eval_n_episodes=args.eval_n_runs,
                                      eval_interval=args.eval_interval,
                                      max_episode_len=timestep_limit)
コード例 #22
0
def convert_to_compatible_model(agent):
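    # Re-wrap the child links of the agent's model in a chainerrl
    # links.Sequence.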
    return links.Sequence(*list(agent.model.children()))
コード例 #23
0
    def __init__(self, alg, env, model_path):
        self.alg = alg
        seed = 0
        n_actions = gym.make(env).action_space.n
        gpus = [-1]
        gpu = None
        misc.set_random_seed(seed, gpus=gpus)
        if alg == "DQN-C":
            model = links.Sequence(
                links.NatureDQNHead(),
                L.Linear(512, n_actions),
                DiscreteActionValue)
        if alg == "PPO":
            winit_last = chainer.initializers.LeCunNormal(1e-2)
            model = chainer.Sequential(
                L.Convolution2D(None, 32, 8, stride=4),
                F.relu,
                L.Convolution2D(None, 64, 4, stride=2),
                F.relu,
                L.Convolution2D(None, 64, 3, stride=1),
                F.relu,
                L.Linear(None, 512),
                F.relu,
                links.Branched(
                    chainer.Sequential(
                        L.Linear(None, n_actions, initialW=winit_last),
                        SoftmaxDistribution,
                    ),
                    L.Linear(None, 1),
                )
            )
        if alg == "C51":
            n_atoms = 51
            v_max = 10
            v_min = -10
            model = links.Sequence(
                links.NatureDQNHead(),
                DistributionalFCStateQFunctionWithDiscreteAction(
                    None, n_actions, n_atoms, v_min, v_max,
                    n_hidden_channels=0, n_hidden_layers=0),
            )
        if alg == "ACER":
            model = agents.acer.ACERSharedModel(
                shared=links.Sequence(
                    links.NIPSDQNHead(),
                    L.LSTM(256, 256)),
                pi=links.Sequence(
                    L.Linear(256, n_actions),
                    SoftmaxDistribution),
                q=links.Sequence(
                    L.Linear(256, n_actions),
                    DiscreteActionValue),
            )
        if alg == "A3C":
            model = A3CFF(n_actions)
        if alg == "Rainbow":
            n_atoms = 51
            v_max = 10
            v_min = -10
            model = DistributionalDuelingDQN(n_actions, n_atoms, v_min, v_max)
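            # Replace the fully connected layers with factorized NoisyNet
            # layers so that exploration comes from parameter noise.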
            links.to_factorized_noisy(model, sigma_scale=0.5)
        if alg == "IQN":
            model = agents.iqn.ImplicitQuantileQFunction(
                psi=chainerrl.links.Sequence(
                    L.Convolution2D(None, 32, 8, stride=4),
                    F.relu,
                    L.Convolution2D(None, 64, 4, stride=2),
                    F.relu,
                    L.Convolution2D(None, 64, 3, stride=1),
                    F.relu,
                    functools.partial(F.reshape, shape=(-1, 3136)),
                ),
                phi=chainerrl.links.Sequence(
                    chainerrl.agents.iqn.CosineBasisLinear(64, 3136),
                    F.relu,
                ),
                f=chainerrl.links.Sequence(
                    L.Linear(None, 512),
                    F.relu,
                    L.Linear(None, n_actions),
                ),
            )
        if alg in ["A3C"]:
            fake_obs = chainer.Variable(
                np.zeros((4, 84, 84), dtype=np.float32)[None],
                name='observation')
            with chainerrl.recurrent.state_reset(model):
                # The state of the model is reset again after drawing the graph
                variables = misc.collect_variables([model(fake_obs)])
                chainer.computational_graph.build_computational_graph(variables)
        elif alg in ["Rainbow", "DQN-C", "C51", "ACER", "PPO"]:
            variables = misc.collect_variables(
                [model(np.zeros((4, 84, 84), dtype=np.float32)[None])])
            chainer.computational_graph.build_computational_graph(variables)
        else:
            fake_obs = np.zeros((4, 84, 84), dtype=np.float32)[None]
            fake_taus = np.zeros(32, dtype=np.float32)[None]
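            # ImplicitQuantileQFunction(obs) returns a callable that maps
            # sampled quantile thresholds (taus) to quantile values.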
            variables = misc.collect_variables([model(fake_obs)(fake_taus)])

        def phi(x):
            # Feature extractor
            return np.asarray(x, dtype=np.float32) / 255

        opt = optimizers.RMSpropGraves()
        opt.setup(model)
        rbuf = replay_buffer.ReplayBuffer(1)
        if alg == "IQN":
            self.agent = agents.IQN(model, opt, rbuf, gpu=gpu, gamma=0.99, act_deterministically=True, explorer=None,
                                    replay_start_size=1, minibatch_size=1, target_update_interval=None, clip_delta=True,
                                    update_interval=4, phi=phi)
        if alg == "A3C":
            self.agent = a3c.A3C(model, opt, t_max=5, gamma=0.99, phi=phi, act_deterministically=True)
        if alg == "Rainbow":
            self.agent = agents.CategoricalDoubleDQN(model, opt, rbuf, gpu=gpu, gamma=0.99, explorer=None,
                                                     replay_start_size=1, minibatch_size=1, target_update_interval=None,
                                                     clip_delta=True, update_interval=4, phi=phi)
        if alg == "DQN-C":
            self.agent = agents.DQN(model, opt, rbuf, gpu=gpu, gamma=0.99, explorer=None, replay_start_size=1,
                                    minibatch_size=1, target_update_interval=None, clip_delta=True, update_interval=4,
                                    phi=phi)
        if alg == "C51":
            self.agent = agents.CategoricalDQN(
                model, opt, rbuf, gpu=gpu, gamma=0.99,
                explorer=None, replay_start_size=1,
                minibatch_size=1,
                target_update_interval=None,
                clip_delta=True,
                update_interval=4,
                phi=phi,
            )
        if alg == "ACER":
            self.agent = agents.acer.ACER(model, opt, t_max=5, gamma=0.99,
                                          replay_buffer=rbuf,
                                          n_times_replay=4,
                                          replay_start_size=1,
                                          act_deterministically=True,
                                          phi=phi
                                          )
        if alg == "PPO":
            self.agent = agents.PPO(model, opt, gpu=gpu, phi=phi, update_interval=4, minibatch_size=1, clip_eps=0.1,
                                    recurrent=False, act_deterministically=True)
        self.agent.load(os.path.join(model_path, 'chainer', alg, env.replace("NoFrameskip-v4", ""), 'final'))
コード例 #24
0
ファイル: train_nsq_ale.py プロジェクト: Aserun/chainerrl
def main():

    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('rom', type=str)
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--use-sdl', action='store_true', default=False)
    parser.add_argument('--final-exploration-frames',
                        type=int,
                        default=4 * 10**6)
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--eval-interval', type=int, default=10**6)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    args = parser.parse_args()

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**31

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = ale.ALE(args.rom,
                      use_sdl=args.use_sdl,
                      treat_life_lost_as_terminal=not test,
                      seed=env_seed)
        if not test:
            misc.env_modifiers.make_reward_clipped(env, -1, 1)
        return env

    sample_env = make_env(0, test=False)
    action_space = sample_env.action_space
    assert isinstance(action_space, spaces.Discrete)

    # Define a model and its optimizer
    q_func = links.Sequence(links.NIPSDQNHead(), L.Linear(256, action_space.n),
                            DiscreteActionValue)
    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=1e-1, alpha=0.99)
    opt.setup(q_func)

    # Make process-specific agents to diversify exploration
    def make_agent(process_idx):
        # Random epsilon assignment described in the original paper
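        # Final epsilon is sampled per process: 0.1 with probability 0.4,
        # 0.01 with probability 0.3, and 0.5 with probability 0.3.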
        rand = random.random()
        if rand < 0.4:
            epsilon_target = 0.1
        elif rand < 0.7:
            epsilon_target = 0.01
        else:
            epsilon_target = 0.5
        explorer = explorers.LinearDecayEpsilonGreedy(
            1, epsilon_target, args.final_exploration_frames,
            action_space.sample)
        # Suppress the explorer logger
        explorer.logger.setLevel(logging.INFO)
        return nsq.NSQ(q_func,
                       opt,
                       t_max=5,
                       gamma=0.99,
                       i_target=40000,
                       explorer=explorer,
                       phi=dqn_phi)

    if args.demo:
        env = make_env(0, True)
        agent = make_agent(0)
        eval_stats = experiments.eval_performance(env=env,
                                                  agent=agent,
                                                  n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        explorer = explorers.ConstantEpsilonGreedy(0.05, action_space.sample)

        # Linearly decay the learning rate to zero
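        # LinearInterpolationHook calls lr_setter at every global step with a
        # value interpolated linearly from args.lr (at step 0) to 0
        # (at args.steps).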
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(outdir=args.outdir,
                                      processes=args.processes,
                                      make_env=make_env,
                                      make_agent=make_agent,
                                      profile=args.profile,
                                      steps=args.steps,
                                      eval_n_runs=args.eval_n_runs,
                                      eval_interval=args.eval_interval,
                                      eval_explorer=explorer,
                                      global_step_hooks=[lr_decay_hook])
コード例 #25
0
ファイル: train_acer_ale.py プロジェクト: ypxie/chainerrl
def main():

    # Prevent numpy from using multiple threads
    os.environ['OMP_NUM_THREADS'] = '1'

    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('rom', type=str)
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--outdir', type=str, default=None)
    parser.add_argument('--use-sdl', action='store_true')
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--replay-start-size', type=int, default=10000)
    parser.add_argument('--n-times-replay', type=int, default=4)
    parser.add_argument('--max-episode-len', type=int, default=10000)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--eval-frequency', type=int, default=10**6)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--use-lstm', action='store_true')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.set_defaults(use_sdl=False)
    parser.set_defaults(use_lstm=False)
    args = parser.parse_args()

    if args.seed is not None:
        misc.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    print('Output files are saved in {}'.format(args.outdir))

    n_actions = ale.ALE(args.rom).number_of_actions

    if args.use_lstm:
        model = acer.ACERSharedModel(
            shared=links.Sequence(links.NIPSDQNHead(), L.LSTM(256, 256)),
            pi=links.Sequence(L.Linear(256, n_actions), SoftmaxDistribution),
            q=links.Sequence(L.Linear(256, n_actions), DiscreteActionValue),
        )
    else:
        model = acer.ACERSharedModel(
            shared=links.NIPSDQNHead(),
            pi=links.Sequence(L.Linear(256, n_actions), SoftmaxDistribution),
            q=links.Sequence(L.Linear(256, n_actions), DiscreteActionValue),
        )
    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=4e-3, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))
    replay_buffer = EpisodicReplayBuffer(10**6 // args.processes)
    agent = acer.ACER(model,
                      opt,
                      t_max=args.t_max,
                      gamma=0.99,
                      replay_buffer=replay_buffer,
                      n_times_replay=args.n_times_replay,
                      replay_start_size=args.replay_start_size,
                      beta=args.beta,
                      phi=dqn_phi)

    if args.load:
        agent.load(args.load)

    def make_env(process_idx, test):
        env = ale.ALE(args.rom,
                      use_sdl=args.use_sdl,
                      treat_life_lost_as_terminal=not test)
        if not test:
            misc.env_modifiers.make_reward_clipped(env, -1, 1)

        return env

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env, agent=agent, n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_async(agent=agent,
                                      outdir=args.outdir,
                                      processes=args.processes,
                                      make_env=make_env,
                                      profile=args.profile,
                                      steps=args.steps,
                                      eval_n_runs=args.eval_n_runs,
                                      eval_frequency=args.eval_frequency,
                                      max_episode_len=args.max_episode_len)
コード例 #26
0
ファイル: train_nsq_ale.py プロジェクト: zhexiaozhe/chainerrl
def main():

    # This prevents numpy from using multiple threads
    os.environ['OMP_NUM_THREADS'] = '1'

    import logging
    # logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('rom', type=str)
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--use-sdl', action='store_true', default=False)
    parser.add_argument('--final-exploration-frames',
                        type=int,
                        default=4 * 10**6)
    parser.add_argument('--outdir', type=str, default='nsq_output')
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--eval-interval', type=int, default=10**6)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    args = parser.parse_args()

    if args.seed is not None:
        misc.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    print('Output files are saved in {}'.format(args.outdir))

    def make_env(process_idx, test):
        env = ale.ALE(args.rom,
                      use_sdl=args.use_sdl,
                      treat_life_lost_as_terminal=not test)
        if not test:
            misc.env_modifiers.make_reward_clipped(env, -1, 1)
        return env

    sample_env = make_env(0, test=False)
    action_space = sample_env.action_space
    assert isinstance(action_space, spaces.Discrete)

    # Define a model and its optimizer
    q_func = links.Sequence(links.NIPSDQNHead(), L.Linear(256, action_space.n),
                            DiscreteActionValue)
    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=1e-1, alpha=0.99)
    opt.setup(q_func)

    # Make process-specific agents to diversify exploration
    def make_agent(process_idx):
        # Random epsilon assignment described in the original paper
        rand = random.random()
        if rand < 0.4:
            epsilon_target = 0.1
        elif rand < 0.7:
            epsilon_target = 0.01
        else:
            epsilon_target = 0.5
        explorer = explorers.LinearDecayEpsilonGreedy(
            1, epsilon_target, args.final_exploration_frames,
            action_space.sample)
        # Suppress the explorer logger
        explorer.logger.setLevel(logging.INFO)
        return nsq.NSQ(q_func,
                       opt,
                       t_max=5,
                       gamma=0.99,
                       i_target=40000,
                       explorer=explorer,
                       phi=dqn_phi)

    if args.demo:
        env = make_env(0, True)
        agent = make_agent(0)
        eval_stats = experiments.eval_performance(env=env,
                                                  agent=agent,
                                                  n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        explorer = explorers.ConstantEpsilonGreedy(0.05, action_space.sample)

        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(outdir=args.outdir,
                                      processes=args.processes,
                                      make_env=make_env,
                                      make_agent=make_agent,
                                      profile=args.profile,
                                      steps=args.steps,
                                      eval_n_runs=args.eval_n_runs,
                                      eval_interval=args.eval_interval,
                                      eval_explorer=explorer,
                                      global_step_hooks=[lr_decay_hook])
コード例 #27
0
def main():

    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('rom', type=str)
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--use-sdl', action='store_true')
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--replay-start-size', type=int, default=10000)
    parser.add_argument('--n-times-replay', type=int, default=4)
    parser.add_argument('--max-episode-len', type=int, default=10000)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--eval-interval', type=int, default=10**6)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--use-lstm', action='store_true')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.set_defaults(use_sdl=False)
    parser.set_defaults(use_lstm=False)
    args = parser.parse_args()

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**31

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    n_actions = ale.ALE(args.rom).number_of_actions

    if args.use_lstm:
        model = acer.ACERSharedModel(
            shared=links.Sequence(links.NIPSDQNHead(), L.LSTM(256, 256)),
            pi=links.Sequence(L.Linear(256, n_actions), SoftmaxDistribution),
            q=links.Sequence(L.Linear(256, n_actions), DiscreteActionValue),
        )
    else:
        model = acer.ACERSharedModel(
            shared=links.NIPSDQNHead(),
            pi=links.Sequence(L.Linear(256, n_actions), SoftmaxDistribution),
            q=links.Sequence(L.Linear(256, n_actions), DiscreteActionValue),
        )
    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=4e-3, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))
    replay_buffer = EpisodicReplayBuffer(10**6 // args.processes)
    agent = acer.ACER(model,
                      opt,
                      t_max=args.t_max,
                      gamma=0.99,
                      replay_buffer=replay_buffer,
                      n_times_replay=args.n_times_replay,
                      replay_start_size=args.replay_start_size,
                      beta=args.beta,
                      phi=dqn_phi)

    if args.load:
        agent.load(args.load)

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = ale.ALE(args.rom,
                      use_sdl=args.use_sdl,
                      treat_life_lost_as_terminal=not test,
                      seed=env_seed)
        if not test:
            misc.env_modifiers.make_reward_clipped(env, -1, 1)
        return env

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(env=env,
                                                  agent=agent,
                                                  n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:

        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(agent=agent,
                                      outdir=args.outdir,
                                      processes=args.processes,
                                      make_env=make_env,
                                      profile=args.profile,
                                      steps=args.steps,
                                      eval_n_runs=args.eval_n_runs,
                                      eval_interval=args.eval_interval,
                                      max_episode_len=args.max_episode_len,
                                      global_step_hooks=[lr_decay_hook])
コード例 #28
0
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument(
        '--max-episode-len',
        type=int,
        default=5 * 60 * 60 // 4,  # 5 minutes with 60/4 fps
        help='Maximum number of steps for each episode.')
    parser.add_argument('--final-exploration-frames',
                        type=int,
                        default=4 * 10**6)
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--eval-interval', type=int, default=10**6)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--logging-level',
                        type=int,
                        default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render',
                        action='store_true',
                        default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor',
                        action='store_true',
                        default=False,
                        help='Monitor env. Videos and additional information'
                        ' are saved as output files.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**31

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(args.env),
                                           episode_life=not test,
                                           clip_rewards=not test)
        env.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = chainerrl.wrappers.RandomizeAction(env, 0.05)
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir, mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    sample_env = make_env(0, test=False)
    action_space = sample_env.action_space
    assert isinstance(action_space, spaces.Discrete)

    # Define a model and its optimizer
    q_func = links.Sequence(links.NIPSDQNHead(), L.Linear(256, action_space.n),
                            DiscreteActionValue)
    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=1e-1, alpha=0.99)
    opt.setup(q_func)

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    # Make process-specific agents to diversify exploration
    def make_agent(process_idx):
        # Random epsilon assignment described in the original paper
        rand = random.random()
        if rand < 0.4:
            epsilon_target = 0.1
        elif rand < 0.7:
            epsilon_target = 0.01
        else:
            epsilon_target = 0.5
        explorer = explorers.LinearDecayEpsilonGreedy(
            1, epsilon_target, args.final_exploration_frames,
            action_space.sample)
        # Suppress the explorer logger
        explorer.logger.setLevel(logging.INFO)
        return nsq.NSQ(q_func,
                       opt,
                       t_max=5,
                       gamma=0.99,
                       i_target=40000,
                       explorer=explorer,
                       phi=phi)

    if args.demo:
        env = make_env(0, True)
        agent = make_agent(0)
        eval_stats = experiments.eval_performance(env=env,
                                                  agent=agent,
                                                  n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            make_agent=make_agent,
            profile=args.profile,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            max_episode_len=args.max_episode_len,
            global_step_hooks=[lr_decay_hook],
            save_best_so_far_agent=False,
        )
コード例 #29
0
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--replay-start-size', type=int, default=10000)
    parser.add_argument('--n-times-replay', type=int, default=4)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=10**7)
    parser.add_argument(
        '--max-frames',
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help='Maximum number of frames for each episode.')
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--use-lstm', action='store_true')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logging-level',
                        type=int,
                        default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render',
                        action='store_true',
                        default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor',
                        action='store_true',
                        default=False,
                        help='Monitor env. Videos and additional information'
                        ' are saved as output files.')
    parser.set_defaults(use_lstm=False)
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**31

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    n_actions = gym.make(args.env).action_space.n

    if args.use_lstm:
        model = acer.ACERSharedModel(
            shared=links.Sequence(links.NIPSDQNHead(), L.LSTM(256, 256)),
            pi=links.Sequence(L.Linear(256, n_actions), SoftmaxDistribution),
            q=links.Sequence(L.Linear(256, n_actions), DiscreteActionValue),
        )
    else:
        model = acer.ACERSharedModel(
            shared=links.NIPSDQNHead(),
            pi=links.Sequence(L.Linear(256, n_actions), SoftmaxDistribution),
            q=links.Sequence(L.Linear(256, n_actions), DiscreteActionValue),
        )
    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=4e-3, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))
    replay_buffer = EpisodicReplayBuffer(10**6 // args.processes)

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = acer.ACER(model,
                      opt,
                      t_max=args.t_max,
                      gamma=0.99,
                      replay_buffer=replay_buffer,
                      n_times_replay=args.n_times_replay,
                      replay_start_size=args.replay_start_size,
                      beta=args.beta,
                      phi=phi)

    if args.load:
        agent.load(args.load)

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(
            args.env, max_frames=args.max_frames),
                                           episode_life=not test,
                                           clip_rewards=not test)
        env.seed(int(env_seed))
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir, mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(env=env,
                                                  agent=agent,
                                                  n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:

        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            global_step_hooks=[lr_decay_hook],
            save_best_so_far_agent=False,
        )
コード例 #30
0
def main(args):
    import logging
    logging.basicConfig(level=logging.INFO, filename='log')

    if isinstance(args, list):
        args = make_args(args)
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**31

    n_actions = gym.make(args.env).action_space.n

    if args.use_lstm:
        model = acer.ACERSharedModel(
            shared=links.Sequence(links.NIPSDQNHead(), L.LSTM(256, 256)),
            pi=links.Sequence(L.Linear(256, n_actions), SoftmaxDistribution),
            q=links.Sequence(L.Linear(256, n_actions), DiscreteActionValue),
        )
    else:
        model = acer.ACERSharedModel(
            shared=links.NIPSDQNHead(),
            pi=links.Sequence(L.Linear(256, n_actions), SoftmaxDistribution),
            q=links.Sequence(L.Linear(256, n_actions), DiscreteActionValue),
        )
    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=4e-3, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))
    replay_buffer = EpisodicReplayBuffer(10**6 // args.processes)

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = acer.ACER(model,
                      opt,
                      t_max=args.t_max,
                      gamma=0.99,
                      replay_buffer=replay_buffer,
                      n_times_replay=args.n_times_replay,
                      replay_start_size=args.replay_start_size,
                      beta=args.beta,
                      phi=phi)

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(
            args.env, max_frames=args.max_frames),
                                           episode_life=not test,
                                           clip_rewards=not test)
        env.seed(int(env_seed))
        if args.monitor:
            env = chainerrl.wrappers.Monitor(
                env, args.outdir, mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    def make_env_check():
        # Use the base seed for the check/video env
        env_seed = args.seed
        env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(
            args.env, max_frames=args.max_frames),
                                           episode_life=True,
                                           clip_rewards=True)
        env.seed(int(env_seed))
        return env

    if args.load_agent:
        agent.load(args.load_agent)

    if (args.mode == 'train'):

        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            global_step_hooks=[lr_decay_hook],
            save_best_so_far_agent=False,
        )
    elif (args.mode == 'check'):
        return tools.make_video.check(env=make_env_check(),
                                      agent=agent,
                                      save_mp4=args.save_mp4)

    elif (args.mode == 'growth'):
        return tools.make_video.growth(env=make_env_check(),
                                       agent=agent,
                                       outdir=args.outdir,
                                       max_num=args.max_frames,
                                       save_mp4=args.save_mp4)