Example #1
    def rl_agent(self, env):

        # Policy network: two tanh hidden layers feeding a Gaussian head
        self.policy = chainer.Sequential(
            L.Linear(None, 256),
            F.tanh,
            L.Linear(None, 128),
            F.tanh,
            # L.Linear(None, env.action_space.low.size, initialW=winit_last),
            L.Linear(None, env.action_space.low.size),
            # F.sigmoid,
            chainerrl.policies.GaussianHeadWithStateIndependentCovariance(
                action_size=env.action_space.low.size,
                var_type='diagonal',
                var_func=lambda x: F.exp(2 * x),  # Parameterize log std
                # var_param_init=0,  # log std = 0 => std = 1
            ))

        # Value function network: same hidden sizes with a scalar output
        self.vf = chainer.Sequential(
            L.Linear(None, 256),
            F.tanh,
            L.Linear(None, 128),
            F.tanh,
            L.Linear(None, 1),
        )

        # Combine a policy and a value function into a single model
        self.model = chainerrl.links.Branched(self.policy, self.vf)

        self.opt = chainer.optimizers.Adam(alpha=3e-4, eps=1e-5)
        self.opt.setup(self.model)

        self.agent = PPO(
            self.model,
            self.opt,
            # obs_normalizer=obs_normalizer,
            gpu=-1,
            update_interval=512,
            minibatch_size=8,
            clip_eps_vf=None,
            entropy_coef=0.001,
            # standardize_advantages=args.standardize_advantages,
        )

        return self.agent
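The factory method above returns a fully configured PPO agent but no driver code. A minimal usage sketch, assuming the method lives on some wrapper class (here called AgentBuilder, a hypothetical name) and a continuous-action gym environment:

import gym
import chainerrl

env = chainerrl.wrappers.CastObservationToFloat32(gym.make('Pendulum-v0'))
eval_env = chainerrl.wrappers.CastObservationToFloat32(gym.make('Pendulum-v0'))

builder = AgentBuilder()  # hypothetical wrapper class exposing rl_agent()
agent = builder.rl_agent(env)

chainerrl.experiments.train_agent_with_evaluation(
    agent=agent,
    env=env,
    eval_env=eval_env,
    outdir='results',
    steps=10 ** 5,
    eval_n_steps=None,
    eval_n_episodes=5,
    eval_interval=5000,
)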
Example #2
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('algo', default='ppo', choices=['ppo', 'gail', 'airl'], type=str)
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--env', type=str, default='Hopper-v2')
    parser.add_argument('--arch', type=str, default='FFGaussian',
                        choices=('FFSoftmax', 'FFMellowmax',
                                 'FFGaussian'))
    parser.add_argument('--bound-mean', action='store_true')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--steps', type=int, default=10 ** 6)
    parser.add_argument('--eval-interval', type=int, default=10000)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--standardize-advantages', action='store_true')
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--lr', type=float, default=3e-4)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--load_demo', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')

    parser.add_argument('--update-interval', type=int, default=2048)
    parser.add_argument('--batchsize', type=int, default=64)
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--entropy-coef', type=float, default=0.0)
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    if not (args.demo and args.load):
        args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        env_seed = 2 ** 32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    # Normalize observations based on their empirical mean and variance
    obs_normalizer = chainerrl.links.EmpiricalNormalization(
        obs_space.low.size, clip_threshold=5)

    # Switch policy types according to the action space type
    if args.arch == 'FFSoftmax':
        model = A3CFFSoftmax(obs_space.low.size, action_space.n)
    elif args.arch == 'FFMellowmax':
        model = A3CFFMellowmax(obs_space.low.size, action_space.n)
    elif args.arch == 'FFGaussian':
        model = A3CFFGaussian(obs_space.low.size, action_space,
                              bound_mean=args.bound_mean)

    opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5)
    opt.setup(model)
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))
    if args.algo == 'ppo':
        agent = PPO(model, opt,
                    obs_normalizer=obs_normalizer,
                    gpu=args.gpu,
                    update_interval=args.update_interval,
                    minibatch_size=args.batchsize, epochs=args.epochs,
                    clip_eps_vf=None, entropy_coef=args.entropy_coef,
                    standardize_advantages=args.standardize_advantages,
                    )
    elif args.algo == 'gail':
        import numpy as np
        from irl.gail import GAIL
        from irl.gail import Discriminator
        demonstrations = np.load(args.load_demo)
        D = Discriminator(gpu=args.gpu)
        agent = GAIL(demonstrations=demonstrations, discriminator=D,
                     model=model, optimizer=opt,
                     obs_normalizer=obs_normalizer,
                     gpu=args.gpu,
                     update_interval=args.update_interval,
                     minibatch_size=args.batchsize, epochs=args.epochs,
                     clip_eps_vf=None, entropy_coef=args.entropy_coef,
                     standardize_advantages=args.standardize_advantages,)
    elif args.algo == 'airl':
        import numpy as np
        from irl.airl import AIRL as Agent
        from irl.airl import Discriminator
        # obs_normalizer = None
        demonstrations = np.load(args.load_demo)
        D = Discriminator(gpu=args.gpu)
        agent = Agent(demonstrations=demonstrations, discriminator=D,
                      model=model, optimizer=opt,
                      obs_normalizer=obs_normalizer,
                      gpu=args.gpu,
                      update_interval=args.update_interval,
                      minibatch_size=args.batchsize, epochs=args.epochs,
                      clip_eps_vf=None, entropy_coef=args.entropy_coef,
                      standardize_advantages=args.standardize_advantages,)

    if args.load:
        agent.load(args.load)

    if args.demo:
        env = make_env(True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
        outdir = args.load if args.load else args.outdir
        save_agent_demo(make_env(False), agent, outdir)
    else:
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.alpha = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        # Linearly decay the clipping parameter to zero
        def clip_eps_setter(env, agent, value):
            agent.clip_eps = max(value, 1e-8)

        clip_eps_decay_hook = experiments.LinearInterpolationHook(
            args.steps, 0.2, 0, clip_eps_setter)

        experiments.train_agent_with_evaluation(
            agent=agent,
            env=make_env(False),
            eval_env=make_env(True),
            outdir=args.outdir,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            train_max_episode_len=timestep_limit,
            save_best_so_far_agent=False,
            step_hooks=[
                lr_decay_hook,
                clip_eps_decay_hook,
            ],
        )
        save_agent_demo(make_env(False), agent, args.outdir)
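save_agent_demo is called above but never defined in this snippet. A minimal sketch of what such a helper might do, assuming it rolls out the trained agent and stores transitions for later use as GAIL/AIRL demonstrations (the file layout and array keys are assumptions):

import os
import numpy as np

def save_agent_demo(env, agent, outdir, n_episodes=10):
    obs_list, action_list, reward_list, done_list = [], [], [], []
    for _ in range(n_episodes):
        obs = env.reset()
        done = False
        while not done:
            action = agent.act(obs)
            obs_list.append(obs)
            action_list.append(action)
            obs, reward, done, _ = env.step(action)
            reward_list.append(reward)
            done_list.append(done)
        agent.stop_episode()
    np.savez(os.path.join(outdir, 'demonstrations.npz'),
             obs=np.asarray(obs_list),
             action=np.asarray(action_list),
             reward=np.asarray(reward_list),
             done=np.asarray(done_list))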
Example #3
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env',
                        type=str,
                        default='BreakoutNoFrameskip-v4',
                        help='Gym Env ID.')
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='GPU device ID. Set to -1 to use CPUs only.')
    parser.add_argument('--num-envs',
                        type=int,
                        default=8,
                        help='Number of env instances run in parallel.')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--steps',
                        type=int,
                        default=10**7,
                        help='Total time steps for training.')
    parser.add_argument(
        '--max-frames',
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help='Maximum number of frames for each episode.')
    parser.add_argument('--lr',
                        type=float,
                        default=2.5e-4,
                        help='Learning rate.')
    parser.add_argument('--eval-interval',
                        type=int,
                        default=100000,
                        help='Interval (in timesteps) between evaluation'
                        ' phases.')
    parser.add_argument('--eval-n-runs',
                        type=int,
                        default=10,
                        help='Number of episodes ran in an evaluation phase.')
    parser.add_argument('--demo',
                        action='store_true',
                        default=False,
                        help='Run demo episodes, not training.')
    parser.add_argument('--load',
                        type=str,
                        default='',
                        help='Directory path to load a saved agent data from'
                        ' if it is a non-empty string.')
    parser.add_argument('--logging-level',
                        type=int,
                        default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render',
                        action='store_true',
                        default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor',
                        action='store_true',
                        default=False,
                        help='Monitor env. Videos and additional information'
                        ' are saved as output files.')
    parser.add_argument('--update-interval',
                        type=int,
                        default=128 * 8,
                        help='Interval (in timesteps) between PPO iterations.')
    parser.add_argument('--batchsize',
                        type=int,
                        default=32 * 8,
                        help='Size of minibatch (in timesteps).')
    parser.add_argument('--epochs',
                        type=int,
                        default=4,
                        help='Number of epochs used for each PPO iteration.')
    parser.add_argument('--log-interval',
                        type=int,
                        default=10000,
                        help='Interval (in timesteps) of printing logs.')
    parser.add_argument('--recurrent',
                        action='store_true',
                        default=False,
                        help='Use a recurrent model. See the code for the'
                        ' model definition.')
    parser.add_argument('--flicker',
                        action='store_true',
                        default=False,
                        help='Use so-called flickering Atari, where each'
                        ' screen is blacked out with probability 0.5.')
    parser.add_argument('--no-frame-stack',
                        action='store_true',
                        default=False,
                        help='Disable frame stacking so that the agent can'
                        ' only see the current screen.')
    parser.add_argument('--checkpoint-frequency',
                        type=int,
                        default=None,
                        help='Frequency at which agents are stored.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(idx, test):
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test,
            flicker=args.flicker,
            frame_stack=not args.no_frame_stack,
        )
        env.seed(env_seed)
        if args.monitor:
            env = chainerrl.wrappers.Monitor(
                env, args.outdir, mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        return chainerrl.envs.MultiprocessVectorEnv([
            # Bind idx via a default argument; a bare `lambda: make_env(idx, test)`
            # would capture idx late and give every worker the last index.
            (lambda idx=idx: make_env(idx, test))
            for idx in range(args.num_envs)
        ])

    sample_env = make_env(0, test=False)
    print('Observation space', sample_env.observation_space)
    print('Action space', sample_env.action_space)
    n_actions = sample_env.action_space.n

    winit_last = chainer.initializers.LeCunNormal(1e-2)
    if args.recurrent:
        model = chainerrl.links.StatelessRecurrentSequential(
            L.Convolution2D(None, 32, 8, stride=4), F.relu,
            L.Convolution2D(None, 64, 4, stride=2), F.relu,
            L.Convolution2D(None, 64, 3, stride=1), F.relu,
            L.Linear(None, 512), F.relu, L.NStepGRU(1, 512, 512, 0),
            chainerrl.links.Branched(
                chainer.Sequential(
                    L.Linear(None, n_actions, initialW=winit_last),
                    chainerrl.distribution.SoftmaxDistribution,
                ),
                L.Linear(None, 1),
            ))
    else:
        model = chainer.Sequential(
            L.Convolution2D(None, 32, 8, stride=4), F.relu,
            L.Convolution2D(None, 64, 4, stride=2), F.relu,
            L.Convolution2D(None, 64, 3, stride=1), F.relu,
            L.Linear(None, 512), F.relu,
            chainerrl.links.Branched(
                chainer.Sequential(
                    L.Linear(None, n_actions, initialW=winit_last),
                    chainerrl.distribution.SoftmaxDistribution,
                ),
                L.Linear(None, 1),
            ))

    # Draw the computational graph and save it in the output directory.
    fake_obss = np.zeros(sample_env.observation_space.shape,
                         dtype=np.float32)[None]
    if args.recurrent:
        fake_out, _ = model(fake_obss, None)
    else:
        fake_out = model(fake_obss)
    chainerrl.misc.draw_computational_graph([fake_out],
                                            os.path.join(args.outdir, 'model'))

    opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(0.5))

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = PPO(
        model,
        opt,
        gpu=args.gpu,
        phi=phi,
        update_interval=args.update_interval,
        minibatch_size=args.batchsize,
        epochs=args.epochs,
        clip_eps=0.1,
        clip_eps_vf=None,
        standardize_advantages=True,
        entropy_coef=1e-2,
        recurrent=args.recurrent,
    )
    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=make_batch_env(test=True),
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        step_hooks = []

        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.alpha = value

        step_hooks.append(
            experiments.LinearInterpolationHook(args.steps, args.lr, 0,
                                                lr_setter))

        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=make_batch_env(False),
            eval_env=make_batch_env(True),
            outdir=args.outdir,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            checkpoint_freq=args.checkpoint_frequency,
            eval_interval=args.eval_interval,
            log_interval=args.log_interval,
            save_best_so_far_agent=False,
            step_hooks=step_hooks,
        )
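Once training in the example above has finished, the agent can be rolled out in a single rendered environment. A short sketch, assuming agent and make_env are as defined above:

env = chainerrl.wrappers.Render(make_env(0, test=True))
obs = env.reset()
done = False
total_reward = 0.0
while not done:
    action = agent.act(obs)
    obs, reward, done, _ = env.step(action)
    total_reward += reward
agent.stop_episode()  # also resets recurrent state if --recurrent was used
print('episode reward:', total_reward)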
Example #4
timestep_limit = env.spec.tags.get(
    'wrapper_config.TimeLimit.max_episode_steps')
obs_space = env.observation_space
action_space = env.action_space

model = A3CFFSoftmax(obs_space.low.size, action_space.n)

opt = chainer.optimizers.Adam(alpha=lr, eps=1e-5)
opt.setup(model)

# Initialize the agent
agent = PPO(
    model,
    opt,
    gpu=gpu,
    phi=phi,
    update_interval=update_interval,
    minibatch_size=64,
    epochs=10,
    clip_eps_vf=None,
    entropy_coef=0.0,
)


# Linearly decay the learning rate to zero
def lr_setter(env, agent, value):
    agent.optimizer.alpha = value


lr_decay_hook = experiments.LinearInterpolationHook(steps, 3e-4, 0, lr_setter)
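The fragment above relies on names bound elsewhere: env, lr, gpu, phi, update_interval, steps, and the A3CFFSoftmax model from the ChainerRL examples. Plausible values for the free variables, given purely as assumptions to make the fragment self-contained:

import gym
import numpy as np

env = gym.make('CartPole-v0')  # any discrete-action env suits A3CFFSoftmax
lr = 3e-4
gpu = -1                       # -1 runs on CPU
update_interval = 2048
steps = 10 ** 6

def phi(x):
    # Cast observations to float32 to match the model's dtype
    return np.asarray(x, dtype=np.float32)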

Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--steps', type=int, default=10**7)
    parser.add_argument(
        '--max-episode-len',
        type=int,
        default=5 * 60 * 60 // 4,  # 5 minutes with 60/4 fps
        help='Maximum number of steps for each episode.')
    parser.add_argument('--lr', type=float, default=2.5e-4)

    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--standardize-advantages', action='store_true')
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logging-level',
                        type=int,
                        default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render',
                        action='store_true',
                        default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor',
                        action='store_true',
                        default=False,
                        help='Monitor env. Videos and additional information'
                        ' are saved as output files.')

    # In the original paper, the agent runs 8 environments in parallel and
    # samples 128 steps per environment. Here a single environment samples
    # 128 * 8 steps per update instead.
    parser.add_argument('--update-interval', type=int, default=128 * 8)

    parser.add_argument('--batchsize', type=int, default=32)
    parser.add_argument('--epochs', type=int, default=3)
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(args.env),
                                           episode_life=not test,
                                           clip_rewards=not test)
        env.seed(int(env_seed))
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir, mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)

    n_actions = env.action_space.n

    model = A3CFF(n_actions)
    opt = chainer.optimizers.Adam(alpha=args.lr)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = PPO(
        model,
        opt,
        gpu=args.gpu,
        phi=phi,
        update_interval=args.update_interval,
        minibatch_size=args.batchsize,
        epochs=args.epochs,
        clip_eps=0.1,
        clip_eps_vf=None,
        standardize_advantages=args.standardize_advantages,
    )
    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(env=eval_env,
                                                  agent=agent,
                                                  n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.alpha = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        # Linearly decay the clipping parameter to zero
        def clip_eps_setter(env, agent, value):
            agent.clip_eps = max(value, 1e-8)

        clip_eps_decay_hook = experiments.LinearInterpolationHook(
            args.steps, 0.1, 0, clip_eps_setter)

        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            eval_env=eval_env,
            outdir=args.outdir,
            steps=args.steps,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            train_max_episode_len=args.max_episode_len,
            save_best_so_far_agent=False,
            step_hooks=[
                lr_decay_hook,
                clip_eps_decay_hook,
            ],
        )
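A3CFF is defined in the ChainerRL Atari examples and is not shown here. A stand-in with the interface PPO expects, i.e. a link that maps a batch of observations to an (action distribution, state value) pair, could be sketched as follows; the layer sizes are assumptions:

import chainer
import chainer.functions as F
import chainer.links as L
import chainerrl


def make_ff_softmax_model(n_actions):
    # Small conv torso with a softmax policy branch and a scalar value branch
    winit_last = chainer.initializers.LeCunNormal(1e-2)
    return chainer.Sequential(
        L.Convolution2D(None, 16, 8, stride=4), F.relu,
        L.Convolution2D(None, 32, 4, stride=2), F.relu,
        L.Linear(None, 256), F.relu,
        chainerrl.links.Branched(
            chainer.Sequential(
                L.Linear(None, n_actions, initialW=winit_last),
                chainerrl.distribution.SoftmaxDistribution,
            ),
            L.Linear(None, 1),
        ))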
Example #6
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--env', type=str, default='Hopper-v1')
    parser.add_argument('--arch',
                        type=str,
                        default='FFGaussian',
                        choices=('FFSoftmax', 'FFMellowmax', 'FFGaussian'))
    parser.add_argument('--normalize-obs', action='store_true')
    parser.add_argument('--bound-mean', action='store_true')
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--outdir', type=str, default=None)
    parser.add_argument('--steps', type=int, default=10**6)
    parser.add_argument('--eval-interval', type=int, default=10000)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--standardize-advantages', action='store_true')
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--lr', type=float, default=3e-4)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')

    parser.add_argument('--update-interval', type=int, default=2048)
    parser.add_argument('--batchsize', type=int, default=64)
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--entropy-coef', type=float, default=0.0)
    args = parser.parse_args()

    logging.getLogger().setLevel(args.logger_level)

    if args.seed is not None:
        misc.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(test):
        env = gym.make(args.env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        # Scale rewards observed by agents
        if args.reward_scale_factor and not test:
            misc.env_modifiers.make_reward_filtered(
                env, lambda x: x * args.reward_scale_factor)
        if args.render:
            misc.env_modifiers.make_rendered(env)
        return env

    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    # Switch policy types according to the action space type
    if args.arch == 'FFSoftmax':
        model = A3CFFSoftmax(obs_space.low.size, action_space.n)
    elif args.arch == 'FFMellowmax':
        model = A3CFFMellowmax(obs_space.low.size, action_space.n)
    elif args.arch == 'FFGaussian':
        model = A3CFFGaussian(obs_space.low.size,
                              action_space,
                              bound_mean=args.bound_mean,
                              normalize_obs=args.normalize_obs)

    opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5)
    opt.setup(model)
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))
    agent = PPO(
        model,
        opt,
        gpu=args.gpu,
        phi=phi,
        update_interval=args.update_interval,
        minibatch_size=args.batchsize,
        epochs=args.epochs,
        clip_eps_vf=None,
        entropy_coef=args.entropy_coef,
        standardize_advantages=args.standardize_advantages,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        env = make_env(True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_runs=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.alpha = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        # Linearly decay the clipping parameter to zero
        def clip_eps_setter(env, agent, value):
            agent.clip_eps = value

        clip_eps_decay_hook = experiments.LinearInterpolationHook(
            args.steps, 0.2, 0, clip_eps_setter)

        experiments.train_agent_with_evaluation(
            agent=agent,
            env=make_env(False),
            eval_env=make_env(True),
            outdir=args.outdir,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            max_episode_len=timestep_limit,
            step_hooks=[
                lr_decay_hook,
                clip_eps_decay_hook,
            ],
        )
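phi is passed to PPO above but never defined in this snippet. For MuJoCo-style float64 observations a plausible definition, given as an assumption, is a simple cast to float32:

import numpy as np

def phi(obs):
    # Feature extractor: cast observations to float32 for the model
    return np.asarray(obs, dtype=np.float32)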
Example #7
def main(args):
    import logging
    logging.basicConfig(level=logging.INFO, filename='log')

    if isinstance(args, list):
        args = make_args(args)
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2 ** 32

    def make_env(idx, test):
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[idx])
        env_seed = 2 ** 32 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test,
            flicker=args.flicker,
            frame_stack=not args.no_frame_stack,
        )
        env.seed(env_seed)
        if args.monitor:
            env = chainerrl.wrappers.Monitor(
                env, args.outdir,
                mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    def make_env_check():
        # Env used for video recording/inspection; reuse the base training seed
        env_seed = args.seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=True,
            clip_rewards=True)
        env.seed(int(env_seed))
        return env

    def make_batch_env(test):
        return chainerrl.envs.MultiprocessVectorEnv(
            # Bind idx via a default argument to avoid the late-binding
            # closure pitfall (all workers would otherwise get the last idx).
            [(lambda idx=idx: make_env(idx, test))
             for idx in range(args.num_envs)])

    sample_env = make_env(0, test=False)
    print('Observation space', sample_env.observation_space)
    print('Action space', sample_env.action_space)
    n_actions = sample_env.action_space.n

    winit_last = chainer.initializers.LeCunNormal(1e-2)
    if args.recurrent:
        model = chainerrl.links.StatelessRecurrentSequential(
            L.Convolution2D(None, 32, 8, stride=4),
            F.relu,
            L.Convolution2D(None, 64, 4, stride=2),
            F.relu,
            L.Convolution2D(None, 64, 3, stride=1),
            F.relu,
            L.Linear(None, 512),
            F.relu,
            L.NStepGRU(1, 512, 512, 0),
            chainerrl.links.Branched(
                chainer.Sequential(
                    L.Linear(None, n_actions, initialW=winit_last),
                    chainerrl.distribution.SoftmaxDistribution,
                ),
                L.Linear(None, 1),
            )
        )
    else:
        model = chainer.Sequential(
            L.Convolution2D(None, 32, 8, stride=4),
            F.relu,
            L.Convolution2D(None, 64, 4, stride=2),
            F.relu,
            L.Convolution2D(None, 64, 3, stride=1),
            F.relu,
            L.Linear(None, 512),
            F.relu,
            chainerrl.links.Branched(
                chainer.Sequential(
                    L.Linear(None, n_actions, initialW=winit_last),
                    chainerrl.distribution.SoftmaxDistribution,
                ),
                L.Linear(None, 1),
            )
        )

    # Draw the computational graph and save it in the output directory.
    fake_obss = np.zeros(
        sample_env.observation_space.shape, dtype=np.float32)[None]
    if args.recurrent:
        fake_out, _ = model(fake_obss, None)
    else:
        fake_out = model(fake_obss)
    chainerrl.misc.draw_computational_graph(
        [fake_out], os.path.join(args.outdir, 'model'))

    opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(0.5))

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = PPO(
        model,
        opt,
        gpu=args.gpu,
        phi=phi,
        update_interval=args.update_interval,
        minibatch_size=args.batchsize,
        epochs=args.epochs,
        clip_eps=0.1,
        clip_eps_vf=None,
        standardize_advantages=True,
        entropy_coef=1e-2,
        recurrent=args.recurrent,
    )

    if args.load_agent:
        agent.load(args.load_agent)

    if args.mode == 'train':
        step_hooks = []
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.alpha = value

        step_hooks.append(
            experiments.LinearInterpolationHook(
                args.steps, args.lr, 0, lr_setter))

        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=make_batch_env(False),
            eval_env=make_batch_env(True),
            outdir=args.outdir,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            step_offset=args.step_offset,
            checkpoint_freq=args.checkpoint_frequency,
            eval_interval=args.eval_interval,
            log_interval=args.log_interval,
            save_best_so_far_agent=False,
            step_hooks=step_hooks,
            log_type=args.log_type
        )
    elif args.mode == 'check':
        return tools.make_video.check(env=make_env_check(), agent=agent,
                                      save_mp4=args.save_mp4)

    elif args.mode == 'growth':
        return tools.make_video.growth(env=make_env_check(), agent=agent,
                                       outdir=args.outdir,
                                       max_num=args.max_frames,
                                       save_mp4=args.save_mp4)
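make_args and the tools.make_video helpers are project-specific and not shown here. One plausible shape for make_args, sketched as an assumption: reuse the same argparse flags the command-line entry point declares, so main() can also be driven programmatically with a list of strings:

import argparse

def make_args(argv):
    # Hypothetical sketch, e.g. make_args(['--env', 'BreakoutNoFrameskip-v4'])
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4')
    parser.add_argument('--gpu', type=int, default=-1)
    parser.add_argument('--mode', type=str, default='train',
                        choices=('train', 'check', 'growth'))
    # The remaining flags used by main() (--seed, --outdir, --steps,
    # --num-envs, and so on) are omitted in this sketch.
    return parser.parse_args(argv)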
Example #8
    "wrapper_config.TimeLimit.max_episode_steps")
obs_space = env.observation_space
action_space = env.action_space

model = A3CFFGaussian(obs_space.low.size,
                      action_space,
                      bound_mean=False,
                      normalize_obs=False)
opt = chainer.optimizers.Adam(alpha=3e-4, eps=1e-5)
opt.setup(model)

agent = PPO(model,
            opt,
            gpu=-1,
            phi=phi,
            update_interval=2048,
            minibatch_size=64,
            epochs=10,
            clip_eps_vf=None,
            entropy_coef=0.0,
            standardize_advantages=False)

agent.load("parameters")

ACTION_MEANINGS = {
    0: 'Hip1(Torque/Velocity)',
    1: 'Knee1(Torque/Velocity)',
    2: 'Hip2(Torque/Velocity)',
    3: 'Knee2(Torque/Velocity)',
}

launch_visualizer(agent, env, ACTION_MEANINGS)
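This fragment assumes names defined earlier: env, phi, the A3CFFGaussian model from the ChainerRL examples, and launch_visualizer, which presumably comes from the separate chainerrl-visualizer package. A plausible preamble, with the assumptions called out in comments:

import gym
import numpy as np
# Assumption: the visualizer entry point is provided by chainerrl-visualizer
from chainerrl_visualizer import launch_visualizer

# The four action meanings suggest a BipedalWalker-style env (an assumption)
env = gym.make('BipedalWalker-v2')

def phi(obs):
    # Cast observations to float32 for the model
    return np.asarray(obs, dtype=np.float32)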
Example #9
    def __init__(self, args, sample_env):
        obs_space = sample_env.observation_space
        action_space = sample_env.action_space

        # Normalize observations based on their empirical mean and variance
        obs_normalizer = chainerrl.links.EmpiricalNormalization(
            obs_space.low.size, clip_threshold=5)

        # Switch policy types according to the action space type
        if args.arch == 'FFSoftmax':
            #model = A3CFFSoftmax(obs_space.low.size, action_space.n)
            model = A3CFFSoftmax(obs_space.low.size,
                                 sample_env.env_prop.get_softmax_layer_size(),
                                 n_hidden_channels=600,
                                 beta=cfg.beta)
        elif args.arch == 'FFMellowmax':
            model = A3CFFMellowmax(obs_space.low.size, action_space.n)
        elif args.arch == 'FFGaussian':
            model = A3CFFGaussian(obs_space.low.size,
                                  action_space,
                                  bound_mean=args.bound_mean,
                                  n_hidden_channels=cfg.n_hidden_channels)
        elif args.arch == 'FFParamSoftmax':
            model = A3CFFParamSoftmax(
                obs_space.low.size,
                sample_env.env_prop.get_pre_output_layer_size(),
                sample_env.env_prop.get_parametric_segments(),
                sample_env.env_prop.get_parametric_softmax_segments_sizes(),
                n_hidden_channels=600,
                beta=cfg.beta)
        else:
            raise NotImplementedError

        opt = chainer.optimizers.Adam(alpha=args.adam_lr, eps=1e-5)
        opt.setup(model)
        if args.weight_decay > 0:
            opt.add_hook(NonbiasWeightDecay(args.weight_decay))

        # a workaround for saving obs_normalizer
        # see https://github.com/chainer/chainerrl/issues/376
        if 'obs_normalizer' not in PPO.saved_attributes:
            PPO.saved_attributes.append('obs_normalizer')

        agent = PPO(
            model,
            opt,
            obs_normalizer=obs_normalizer,
            gpu=args.gpu,
            phi=lambda x: x.astype(np.float32, copy=False),
            gamma=args.ppo_gamma,
            lambd=args.ppo_lambda,
            update_interval=args.ppo_update_interval,
            minibatch_size=args.batchsize,
            epochs=args.epochs,
            clip_eps_vf=None,
            entropy_coef=args.entropy_coef,
            standardize_advantages=args.standardize_advantages,
        )

        if args.load:
            agent.load(args.load)

        self._agent = agent
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--env', type=str, default='Hopper-v2')
    parser.add_argument('--num-envs', type=int, default=1)
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--steps', type=int, default=10**6)
    parser.add_argument('--eval-interval', type=int, default=10000)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--standardize-advantages', action='store_true')
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--lr', type=float, default=3e-4)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--window-size', type=int, default=100)

    parser.add_argument('--update-interval', type=int, default=2048)
    parser.add_argument('--log-interval', type=int, default=1000)
    parser.add_argument('--batchsize', type=int, default=64)
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--entropy-coef', type=float, default=0.0)
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        return chainerrl.envs.MultiprocessVectorEnv([
            functools.partial(make_env, idx, test)
            for idx in range(args.num_envs)
        ])

    # Used only to read the timestep limit and observation/action spaces
    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    # Normalize observations based on their empirical mean and variance
    obs_normalizer = chainerrl.links.EmpiricalNormalization(obs_space.low.size,
                                                            clip_threshold=5)

    winit_last = chainer.initializers.LeCunNormal(1e-2)

    # Switch policy types according to the action space type
    if isinstance(action_space, gym.spaces.Discrete):
        n_actions = action_space.n
        policy = chainer.Sequential(
            L.Linear(None, 64),
            F.tanh,
            L.Linear(None, 64),
            F.tanh,
            L.Linear(None, n_actions, initialW=winit_last),
            chainerrl.distribution.SoftmaxDistribution,
        )
    elif isinstance(action_space, gym.spaces.Box):
        action_size = action_space.low.size
        policy = chainer.Sequential(
            L.Linear(None, 64),
            F.tanh,
            L.Linear(None, 64),
            F.tanh,
            L.Linear(None, action_size, initialW=winit_last),
            chainerrl.policies.GaussianHeadWithStateIndependentCovariance(
                action_size=action_size,
                var_type='diagonal',
                var_func=lambda x: F.exp(2 * x),  # Parameterize log std
                var_param_init=0,  # log std = 0 => std = 1
            ),
        )
    else:
        print("""\
This example only supports gym.spaces.Box or gym.spaces.Discrete action spaces."""
              )  # NOQA
        return

    vf = chainer.Sequential(
        L.Linear(None, 64),
        F.tanh,
        L.Linear(None, 64),
        F.tanh,
        L.Linear(None, 1),
    )

    # Combine a policy and a value function into a single model
    model = chainerrl.links.Branched(policy, vf)

    opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5)
    opt.setup(model)
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))
    agent = PPO(
        model,
        opt,
        obs_normalizer=obs_normalizer,
        gpu=args.gpu,
        update_interval=args.update_interval,
        minibatch_size=args.batchsize,
        epochs=args.epochs,
        clip_eps_vf=None,
        entropy_coef=args.entropy_coef,
        standardize_advantages=args.standardize_advantages,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        env = make_batch_env(True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.alpha = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=make_batch_env(False),
            eval_env=make_batch_env(True),
            outdir=args.outdir,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            log_interval=args.log_interval,
            return_window_size=args.window_size,
            max_episode_len=timestep_limit,
            save_best_so_far_agent=False,
            step_hooks=[
                lr_decay_hook,
            ],
        )
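A quick numeric check of the Gaussian head's variance parameterisation used in the policy above: the head keeps one state-independent parameter p per action dimension, var_func maps it to exp(2p), so the standard deviation is exp(p) and var_param_init=0 yields an initial std of exactly 1.

import numpy as np

p = 0.0                      # var_param_init
var = np.exp(2 * p)          # var_func applied to p
std = np.sqrt(var)
assert np.isclose(std, 1.0)  # log std = 0  =>  std = 1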
class SupervisorAgent(Agent):
    def __init__(self,
                 layout_config,
                 agent_params,
                 train,
                 finger_two,
                 verbose=False):
        self.logger = logging.getLogger(__name__)

        self.layout_config = layout_config
        self.agent_params = agent_params
        self.train_model = train
        self.finger_two = finger_two
        self.verbose = verbose

        if finger_two:
            self.env = SupervisorEnvironment_(self.layout_config,
                                              self.agent_params,
                                              self.train_model)
        else:
            self.env = SupervisorEnvironment(self.layout_config,
                                             self.agent_params,
                                             self.train_model)

        optimizer_name = 'Adam' if agent_params is None else agent_params[
            'supervisor']['optimizer_name']
        lr = 0.001 if agent_params is None else agent_params['supervisor'][
            'learning_rate']
        n_units = 512 if agent_params is None else int(
            agent_params['supervisor']['n_units'])
        device_id = 0 if agent_params is None else int(
            agent_params['supervisor']['device_id'])
        pre_load = False if agent_params is None else bool(
            agent_params['supervisor']['pre_load'])
        self.gpu = True if agent_params is None else bool(
            agent_params['supervisor']['gpu'])
        self.save_path = path.join('data', 'models', 'supervisor') if agent_params is None \
            else agent_params['supervisor']['save_path']
        self.episodes = 1000000 if agent_params is None else int(
            agent_params['supervisor']['episodes'])
        self.log_interval = 1000 if agent_params is None else int(
            agent_params['supervisor']['log_interval'])
        self.log_filename = agent_params['supervisor']['log_file']

        winit_last = chainer.initializers.LeCunNormal(1e-2)

        self.model = chainer.Sequential(
            L.Linear(None, n_units), F.relu, L.Linear(None, n_units), F.relu,
            chainerrl.links.Branched(
                chainer.Sequential(
                    L.Linear(None,
                             self.env.action_space.n,
                             initialW=winit_last),
                    chainerrl.distribution.SoftmaxDistribution,
                ), L.Linear(None, 1)))

        if pre_load:
            serializers.load_npz(
                path.join(self.save_path, 'best', 'model.npz'), self.model)

        if self.gpu:
            self.model.to_gpu(device_id)

        if optimizer_name == 'Adam':
            self.optimizer = chainer.optimizers.Adam(alpha=lr)
        elif optimizer_name == 'RMSprop':
            self.optimizer = chainer.optimizers.RMSprop(lr=lr)
        else:
            self.optimizer = chainer.optimizers.MomentumSGD(lr=lr)

        self.optimizer.setup(self.model)

        self.optimizer.add_hook(chainer.optimizer.GradientClipping(1.0))

        phi = lambda x: x.astype(np.float32, copy=False)

        self.agent = PPO(
            self.model,
            self.optimizer,
            phi=phi,
            update_interval=1000,
            standardize_advantages=True,
            entropy_coef=1e-2,
            recurrent=False,
        )

        if train:
            chainer.config.train = True
            if self.verbose:
                self.pbar = tqdm.tqdm(total=self.episodes,
                                      ascii=True,
                                      bar_format='{l_bar}{n}, {remaining}\n')
            else:
                self.pbar = tqdm.tqdm(total=self.episodes)
        else:
            chainer.config.train = False
            self.agent.act_deterministically = False

    def train(self, episodes):
        """
        Trains the agent. The value is forwarded to
        `train_agent_with_evaluation` as the total number of steps.
        """

        progress_bar = ProgressBar(self.pbar, episodes)

        experiments.train_agent_with_evaluation(
            self.agent,
            self.env,
            steps=episodes,  # Total number of training steps
            eval_n_steps=None,  # We evaluate for episodes, not time
            eval_n_episodes=10,  # 10 episodes are sampled for each evaluation
            train_max_episode_len=100,  # Maximum length of each episode
            eval_interval=self.log_interval,  # Evaluate every log_interval steps
            step_hooks=[progress_bar],  # add hooks
            logger=self.logger,
            outdir=self.save_path)  # Save everything to 'supervisor' directory

    def evaluate(self, sentence, batch, n_users, **kwargs):
        """
        Function to evaluate trained agent.
        :param sentence: sentence to type.
        :param batch: run evaluation in batch mode.
        :param n_users: number of users to simulate.
        """

        done = False
        if not (sentence == "" or sentence is None):
            self.env.sentences = [sentence]
            self.env.sentences_bkp = [sentence]

        if batch:
            sentence_agg_data = [[
                "sentence.id", "agent.id", "target.sentence", "wpm",
                "lev.distance", "gaze.shift", "bs", "immediate.bs",
                "delayed.bs", "gaze.keyboard.ratio", "fix.count",
                "finger.travel", "iki", "correct.error", "uncorrected.error",
                "fix.duration", "chunk.length"
            ]]
            if self.verbose:
                iter = tqdm.tqdm(iterable=range(n_users),
                                 ascii=True,
                                 bar_format='{l_bar}{n}, {remaining}\n')
            else:
                iter = tqdm.tqdm(range(n_users))
            for i in iter:

                if self.finger_two:
                    self.env = SupervisorEnvironment_(self.layout_config,
                                                      self.agent_params,
                                                      self.train_model)
                else:
                    self.env = SupervisorEnvironment(self.layout_config,
                                                     self.agent_params,
                                                     self.train_model)
                self.env.agent_id = i

                # reinitialise random seed.
                np.random.seed(datetime.now().microsecond)
                random.seed(datetime.now().microsecond)

                while len(self.env.sentences) > 0:
                    state = self.env.reset()
                    done = False
                    while not done:
                        action = self.agent.act(state)
                        state, reward, done, info = self.env.step(action)

                sentence_agg_data += self.env.sentence_test_data

            with open(path.join("data", "output",
                                "SupervisorAgent_sentence_test.csv"),
                      "w",
                      newline="",
                      encoding='utf-8') as f:
                writer = csv.writer(f)
                writer.writerows(sentence_agg_data)

            if not self.finger_two:
                with open(path.join("data", "output",
                                    "SupervisorAgent_Vision_Viz.csv"),
                          "w",
                          newline="") as f:
                    writer = csv.writer(f)
                    writer.writerows(self.env.eye_viz_log)

                with open(path.join("data", "output",
                                    "SupervisorAgent_Finger_Viz.csv"),
                          "w",
                          newline="") as f:
                    writer = csv.writer(f)
                    writer.writerows(self.env.finger_viz_log)

                with open(path.join("data", "output",
                                    "SupervisorAgent_Typing_Viz.csv"),
                          "w",
                          newline="") as f:
                    writer = csv.writer(f)
                    writer.writerows(self.env.typing_viz_log)

        else:
            self.env.sentence_test_data.append([
                "sentence.id", "agent.id", "target.sentence", "wpm",
                "lev.distance", "gaze.shift", "bs", "immediate.bs",
                "delayed.bs", "gaze.keyboard.ratio", "fix.count",
                "finger.travel", "iki", "correct.error", "uncorrected.error",
                "fix.duration", "chunk.length"
            ])
            state = self.env.reset()
            while not done:
                action = self.agent.act(state)
                state, reward, done, info = self.env.step(action)

            with open(path.join("data", "output",
                                "SupervisorAgent_vision_test.csv"),
                      "w",
                      newline="") as f:
                writer = csv.writer(f)
                writer.writerows(self.env.eye_test_data)

            with open(path.join("data", "output",
                                "SupervisorAgent_finger_test.csv"),
                      "w",
                      newline="") as f:
                writer = csv.writer(f)
                writer.writerows(self.env.finger_test_data)

            with open(path.join("data", "output",
                                "SupervisorAgent_sentence_test.csv"),
                      "w",
                      newline="",
                      encoding='utf-8') as f:
                writer = csv.writer(f)
                writer.writerows(self.env.sentence_test_data)

            # TODO: This is from legacy code. Need to update.
            visualise_agent(
                True, True,
                path.join("data", "output", "SupervisorAgent_vision_test.csv"),
                path.join("data", "output", "SupervisorAgent_finger_test.csv"),
                path.join("data", "output", "SupervisorAgent.mp4"))

        self.save_sentence_agg_data(
            path.join("data", "output", "SupervisorAgent_sentence_test.csv"))
        self.save_user_agg_data(
            path.join("data", "output", "SupervisorAgent_sentence_test.csv"))

    def save_sentence_agg_data(self, filename):
        """
        generates sentence level aggregate data.
        :param filename: raw data file path.
        """
        data = pd.read_csv(filename, sep=',', encoding='utf-8')
        data = data.groupby("target.sentence").agg(['mean', 'std'])
        data.to_csv(path.join("data", "output",
                              "SupervisorAgent_sentence_aggregate.csv"),
                    encoding='utf-8')

    def save_user_agg_data(self, filename):
        """
        generates user level aggregate data.
        :param filename: raw data file path.
        """
        data = pd.read_csv(filename, sep=',', encoding='utf-8')
        data = data.groupby("agent.id").agg(['mean', 'std'])
        data.to_csv(path.join("data", "output",
                              "SupervisorAgent_user_aggregate.csv"),
                    encoding='utf-8')
Example #13
0
class rl_stock_trader():
    def __init__(self):

        run_name = 'run_test'
        self.outdir = './results/' + run_name + '/'
        self.outdir_train = self.outdir + 'train/'
        self.outdir_test = self.outdir + 'test/'

        self.training_counter = 0

        try:
            os.makedirs(self.outdir_train)
            os.makedirs(self.outdir_test)
        except Exception:
            pass

        self.writer_train = SummaryWriter(self.outdir_train)
        self.writer_test = SummaryWriter(self.outdir_test)

        self.monitor_freq = 100
        self.testing_samples = 100

        self.validation_scores = []
        self.training_scores = []

        self.settings = {
            'past_horzion': 100,
            'max_steps': 365,
            'inital_account_balance': 1e4,
            'stop_below_balance': 1e3,
            'transation_fee': .1,
            'years_training': 5,
            'years_testing': 1,
        }

        testing_end = date.today()
        testing_beginning = testing_end - relativedelta(
            years=self.settings['years_testing']) - relativedelta(
                days=self.settings['past_horzion'])
        training_end = testing_beginning - relativedelta(days=1)
        training_beginning = training_end - relativedelta(
            years=self.settings['years_training']) - relativedelta(
                days=self.settings['past_horzion'])
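
        # The test window covers the most recent `years_testing` years plus an
        # extra `past_horzion` days of history for the look-back features; the
        # training window is the `years_training` years immediately before it,
        # padded the same way.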

        self.data = {
            'train_gold':
            self.get_prices(gold_shanghai, 1, training_beginning,
                            training_end),
            'train_copper':
            self.get_prices(copper_shanghai, 1, training_beginning,
                            training_end),
            'train_aluminum':
            self.get_prices(aluminum_shanghai, 1, training_beginning,
                            training_end),
            'test_gold':
            self.get_prices(gold_shanghai, 1, testing_beginning, testing_end),
            'test_copper':
            self.get_prices(copper_shanghai, 1, testing_beginning,
                            testing_end),
            'test_aluminum':
            self.get_prices(aluminum_shanghai, 1, testing_beginning,
                            testing_end),
            'test_soybean_oil':
            self.get_prices(soybean_oil, 1, testing_beginning, testing_end),
            'test_dax_futures':
            self.get_prices(dax_futures, 1, testing_beginning, testing_end),
            'test_corn':
            self.get_prices(corn, 1, testing_beginning, testing_end),
            'test_canadian_dollar':
            self.get_prices(canadian_dollar, 1, testing_beginning,
                            testing_end),
        }

        # print('\n\n*************\n', self.data['test_corn'], '\n\n')

        self.env_test_gold = StockTradingEnv(self.get_prices(
            gold_shanghai, 1, testing_beginning, testing_end),
                                             self.settings,
                                             test=True)
        self.env_test_copper = StockTradingEnv(self.get_prices(
            copper_shanghai, 1, testing_beginning, testing_end),
                                               self.settings,
                                               test=True)
        self.env_test_aluminum = StockTradingEnv(self.get_prices(
            aluminum_shanghai, 1, testing_beginning, testing_end),
                                                 self.settings,
                                                 test=True)
        self.env_test_soy_bean = StockTradingEnv(self.get_prices(
            soybean_oil, 1, testing_beginning, testing_end),
                                                 self.settings,
                                                 test=True)
        self.env_test_dax = StockTradingEnv(self.get_prices(
            dax_futures, 1, testing_beginning, testing_end),
                                            self.settings,
                                            test=True)
        self.env_test_corn = StockTradingEnv(self.get_prices(
            corn, 1, testing_beginning, testing_end),
                                             self.settings,
                                             test=True)
        self.env_test_canadian_dollar = StockTradingEnv(self.get_prices(
            canadian_dollar, 1, testing_beginning, testing_end),
                                                        self.settings,
                                                        test=True)

        self.env_train = StockTradingEnv(self.data['train_gold'],
                                         self.settings,
                                         test=False)
        # self.env_test = StockTradingEnv(self.data['test_gold'], self.settings, test=True)

        self.test_envs = {
            'gold':
            StockTradingEnv(self.data['test_gold'], self.settings, test=True),
            'copper':
            StockTradingEnv(self.data['test_copper'], self.settings,
                            test=True),
            'aluminum':
            StockTradingEnv(self.data['test_aluminum'],
                            self.settings,
                            test=True),
        }

        self.agent = self.rl_agent(self.env_train)

    def get_prices(self, index, depth, start, end):

        data_prices = quandl.get(index + str(depth),
                                 start_date=start,
                                 end_date=end)

        data_prices.index = pd.to_datetime(data_prices.index)

        return data_prices

    def rl_agent(self, env):

        # self.policy = chainer.Sequential(
        # 	L.BatchNormalization(axis=0),
        # 	L.Linear(None, 256),
        # 	# F.dropout(ratio=.5),
        # 	F.tanh,
        # 	L.Linear(None, 128),
        # 	# F.dropout(ratio=.5),
        # 	F.tanh,
        # 	# L.Linear(None, env.action_space.low.size, initialW=winit_last),
        # 	L.Linear(None, env.action_space.low.size),
        # 	# F.sigmoid,
        # 	chainerrl.policies.GaussianHeadWithStateIndependentCovariance(
        # 		action_size=env.action_space.low.size,
        # 		var_type='diagonal',
        # 		var_func=lambda x: F.exp(2 * x),  # Parameterize log std
        # 		# var_param_init=0,  # log std = 0 => std = 1
        # 		))

        self.policy = chainer.Sequential(
            L.BatchNormalization(axis=0),
            L.Linear(None, 256),
            # F.dropout(ratio=.5),
            F.sigmoid,
            # F.relu,
            L.Linear(None, 128),
            # F.dropout(ratio=.5),
            F.sigmoid,
            # L.Linear(None, env.action_space.low.size, initialW=winit_last),
            L.Linear(None, env.action_space.low.size),
            F.sigmoid,
            chainerrl.policies.GaussianHeadWithStateIndependentCovariance(
                action_size=env.action_space.low.size,
                var_type='diagonal',
                var_func=lambda x: F.exp(2 * x),  # Parameterize log std
                # var_param_init=0,  # log std = 0 => std = 1
            ))

        self.vf = chainer.Sequential(
            L.BatchNormalization(axis=0),
            L.Linear(None, 256),
            # F.dropout(ratio=.5),
            F.sigmoid,
            L.Linear(None, 128),
            # F.dropout(ratio=.5),
            F.sigmoid,
            L.Linear(None, 1),
            F.sigmoid,
        )
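
        # Note: the sigmoid on the last layers bounds both the action means and
        # the value estimates to (0, 1). This only makes sense if the
        # environment's action space and returns are scaled into that range;
        # the commented-out tanh variants above and below leave them unbounded.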

        # self.vf = chainer.Sequential(
        # 	L.BatchNormalization(axis=0),
        # 	L.Linear(None, 256),
        # 	# F.dropout(ratio=.5),
        # 	F.tanh,
        # 	L.Linear(None, 128),
        # 	# F.dropout(ratio=.5),
        # 	F.tanh,
        # 	L.Linear(None, 1),
        # )

        # Combine a policy and a value function into a single model
        self.model = chainerrl.links.Branched(self.policy, self.vf)

        self.opt = chainer.optimizers.Adam(alpha=3e-3, eps=1e-5)
        self.opt.setup(self.model)

        self.agent = PPO(
            self.model,
            self.opt,
            # obs_normalizer=obs_normalizer,
            gpu=-1,
            update_interval=64,
            minibatch_size=32,
            clip_eps_vf=None,
            entropy_coef=0.001,
            # standardize_advantages=args.standardize_advantages,
        )

        return self.agent

    def monitor_training(self, tb_writer, t, i, done, action, monitor_data,
                         counter):

        if t == 0 or i == 0:

            self.cash_dummy = []
            self.equity_dummy = []
            self.shares_dummy = []
            self.shares_value_dummy = []
            self.action_dummy = []
            self.action_prob_dummy = []

        self.cash_dummy.append(monitor_data['cash'])
        self.equity_dummy.append(monitor_data['equity'])
        self.shares_dummy.append(monitor_data['shares_held'])
        self.shares_value_dummy.append(monitor_data['value_in_shares'])
        self.action_dummy.append(monitor_data['action'])
        self.action_prob_dummy.append(monitor_data['action_prob'])

        # if done:
        # tb_writer.add_scalar('cash', np.mean(self.cash_dummy), counter)
        # tb_writer.add_scalar('equity', np.mean(self.equity_dummy), counter)
        # tb_writer.add_scalar('shares_held', np.mean(self.shares_dummy), counter)
        # tb_writer.add_scalar('shares_value', np.mean(self.shares_value_dummy), counter)
        # tb_writer.add_scalar('action', np.mean(self.action_dummy), counter)
        # tb_writer.add_histogram('action_prob', np.mean(self.action_prob_dummy), counter)

    def plot_validation_figures(self, index, name, test_data_label, benchmark):

        if name in ['mean', 'max', 'final']:
            ylimits = [.75 * np.amin(benchmark), 1.5 * np.amax(benchmark)]
        elif name == 'min':
            ylimits = [0., self.settings['inital_account_balance']]

        plotcolor = 'darkgreen'

        plt.figure(figsize=(18, 18))
        plt.scatter(
            np.asarray(self.validation_scores)[:, 0],
            np.asarray(self.validation_scores)[:, index])
        plt.grid()
        plt.ylim(ylimits[0], ylimits[1])
        plt.title(name + ' equity statistics over 1 year')
        plt.xlabel('trained episodes')
        plt.ylabel('equity [$]')
        plt.savefig(self.outdir + test_data_label + '/scatter_' + name +
                    '_equity.pdf')
        plt.close()

        area_plots = []
        box_data = []
        scores = np.asarray(self.validation_scores)
        eval_points = np.unique(scores[:, 0])
        for j in range(len(eval_points)):
            dummy = scores[:, index][np.where(scores[:, 0] == eval_points[j])]
            box_data.append(dummy)
            area_plots.append([
                np.percentile(dummy, 5),
                np.percentile(dummy, 25),
                np.percentile(dummy, 50),
                np.percentile(dummy, 75),
                np.percentile(dummy, 95),
            ])
        area_plots = np.asarray(area_plots)
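
        # Columns of area_plots are the 5th/25th/50th/75th/95th percentiles of
        # validation equity per evaluation point; the fill_between calls below
        # draw the 5-95 and 25-75 percentile bands with the median on top.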

        p05 = area_plots[:, 0]
        p25 = area_plots[:, 1]
        p50 = area_plots[:, 2]
        p75 = area_plots[:, 3]
        p95 = area_plots[:, 4]

        plt.figure(figsize=(18, 18))
        plt.fill_between(np.arange(area_plots.shape[0]),
                         p05,
                         p95,
                         facecolor=plotcolor,
                         alpha=.3)
        plt.fill_between(np.arange(area_plots.shape[0]),
                         p25,
                         p75,
                         facecolor=plotcolor,
                         alpha=.8)
        plt.plot(p50, linewidth=3, color='lightblue')
        plt.ylim(ylimits[0], ylimits[1])
        plt.grid()
        plt.title(name + ' equity statistics over 1 year')
        plt.xlabel('trained episodes')
        plt.ylabel('equity [$]')
        plt.savefig(self.outdir + test_data_label + '/area_' + name +
                    '_equity.pdf')
        plt.close()

        plt.figure(figsize=(18, 18))
        plt.boxplot(
            box_data,
            notch=True,
            labels=None,
            boxprops=dict(color=plotcolor, linewidth=2),
            capprops=dict(color=plotcolor),
            whiskerprops=dict(color=plotcolor),
            flierprops=dict(color=plotcolor,
                            markeredgecolor=plotcolor,
                            markerfacecolor=plotcolor),
            medianprops=dict(color='lightblue', linewidth=2),
        )
        plt.ylim(ylimits[0], ylimits[1])
        plt.grid()
        plt.title('equity statistics over 1 year')
        plt.xlabel('trained episodes')
        plt.ylabel('equity [$]')
        plt.savefig(self.outdir + test_data_label + '/box_' + name +
                    '_equity.pdf')
        plt.close()

    def validate(self, episode, counter, test_data_label):

        try:
            os.mkdir(self.outdir + test_data_label + '/')
        except Exception:
            pass

        test_equity = []
        test_trades_buy = []
        test_trades_sell = []

        test_data = self.data['test_' + test_data_label]
        try:
            benchmark = test_data['Close'].values[self.settings['past_horzion']:]
        except KeyError:
            benchmark = test_data['Settle'].values[self.settings['past_horzion']:]
        benchmark /= benchmark[0]
        benchmark *= self.settings['inital_account_balance']
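        # Rescale the raw closing prices so the series starts at the initial
        # account balance; this gives a buy-and-hold benchmark in the same
        # units as the agent's equity curve.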

        plt.figure(figsize=(18, 18))

        for i in range(0, self.testing_samples):

            if test_data_label == 'gold':
                obs = self.env_test_gold.reset()
            if test_data_label == 'copper':
                obs = self.env_test_copper.reset()
            if test_data_label == 'aluminum':
                obs = self.env_test_aluminum.reset()
            if test_data_label == 'soybean_oil':
                obs = self.env_test_soy_bean.reset()
            if test_data_label == 'dax_futures':
                obs = self.env_test_dax.reset()
            if test_data_label == 'corn':
                obs = self.env_test_corn.reset()
            if test_data_label == 'canadian_dollar':
                obs = self.env_test_canadian_dollar.reset()

            # obs = self.env_test.reset()

            reward = 0
            done = False
            R = 0
            t = 0

            while not done:

                action = self.agent.act(obs)

                if test_data_label == 'gold':
                    obs, reward, done, _, monitor_data = self.env_test_gold.step(
                        action)
                if test_data_label == 'copper':
                    obs, reward, done, _, monitor_data = self.env_test_copper.step(
                        action)
                if test_data_label == 'aluminum':
                    obs, reward, done, _, monitor_data = self.env_test_aluminum.step(
                        action)
                if test_data_label == 'soybean_oil':
                    obs, reward, done, _, monitor_data = self.env_test_soy_bean.step(
                        action)
                if test_data_label == 'dax_futures':
                    obs, reward, done, _, monitor_data = self.env_test_dax.step(
                        action)
                if test_data_label == 'corn':
                    obs, reward, done, _, monitor_data = self.env_test_corn.step(
                        action)
                if test_data_label == 'canadian_dollar':
                    obs, reward, done, _, monitor_data = self.env_test_canadian_dollar.step(
                        action)

                # obs, reward, done, _, monitor_data = self.env_test.step(action)

                test_equity.append(monitor_data['equity'])

                action_choice = np.argmax(softmax(action))
                action_confidence = softmax(action)[action_choice]
                if action_confidence > .8:
                    if action_choice == 0:
                        test_trades_buy.append([t, monitor_data['equity']])
                    if action_choice == 2:
                        test_trades_sell.append([t, monitor_data['equity']])
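
                # Worked example of the decoding above (illustrative numbers):
                # a raw action of [2.0, 0.1, -1.0] gives softmax probabilities
                # of roughly [0.83, 0.12, 0.04], so action_choice is 0 and,
                # since 0.83 > 0.8, the step is recorded as a buy.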

                self.monitor_training(self.writer_test, t, i, done, action,
                                      monitor_data, counter)

                R += reward
                t += 1

                if done:
                    test_equity = test_equity[:-1]

                    plt.plot(test_equity[:-1], linewidth=1)
                    # try:
                    # 	plt.scatter(np.asarray(test_trades_buy)[:,0], np.asarray(test_trades_buy)[:,1], marker='X', c='green', s=5)
                    # 	plt.scatter(np.asarray(test_trades_sell)[:,0], np.asarray(test_trades_sell)[:,1], marker='X', c='red', s=5)
                    # except IndexError:
                    # 	pass

                    self.validation_scores.append([
                        counter,
                        np.mean(test_equity),
                        np.amin(test_equity),
                        np.amax(test_equity), test_equity[-1]
                    ])
                    test_equity = []

                    self.agent.stop_episode()

        time_axis = test_data.index[self.settings['past_horzion']:].date
        time_axis_short = time_axis[::10]

        plt.plot(benchmark, linewidth=3, color='k', label='close')
        plt.ylim(.75 * np.amin(benchmark), 1.5 * np.amax(benchmark))
        plt.xticks(np.linspace(0, len(time_axis), len(time_axis_short)),
                   time_axis_short,
                   rotation=90)
        plt.grid()
        plt.title(test_data_label + ' validation runs at episode ' +
                  str(episode))
        plt.xlabel('episode')
        plt.ylabel('equity [$]')
        plt.legend()
        plt.savefig(self.outdir + test_data_label + '/validation_E' +
                    str(episode) + '.pdf')
        plt.close()

        self.plot_validation_figures(1, 'mean', test_data_label, benchmark)
        self.plot_validation_figures(2, 'min', test_data_label, benchmark)
        self.plot_validation_figures(3, 'max', test_data_label, benchmark)
        self.plot_validation_figures(4, 'final', test_data_label, benchmark)

    def train(self):

        print('\nstart training loop\n')

        def check_types(input, inputname):
            if np.isnan(input).any():
                print('----> ', inputname, ' array contains NaN\n',
                      np.isnan(input).shape, '\n')
            if np.isinf(input).any():
                print('----> ', inputname, ' array contains inf\n',
                      np.isinf(input).shape, '\n')

        n_episodes = int(1e5)

        log_data = []
        action_log = []

        debug_printing = False

        for i in range(0, n_episodes + 1):

            obs = self.env_train.reset()

            reward = 0
            done = False
            R = 0  # return (sum of rewards)
            t = 0  # time step

            while not done:

                # self.env.render()
                action = self.agent.act_and_train(obs, reward)

                obs, reward, done, _, monitor_data = self.env_train.step(
                    action)

                self.monitor_training(self.writer_train, t, i, done, action,
                                      monitor_data, self.training_counter)

                R += reward
                t += 1

                if t % 10 == 0 and not done:
                    log_data.append({
                        'equity':
                        int(monitor_data['equity']),
                        'shares_held':
                        int(monitor_data['shares_held']),
                        'shares_value':
                        int(monitor_data['value_in_shares']),
                        'cash':
                        int(monitor_data['cash']),
                        't':
                        int(t),
                    })
                    action_log.append([
                        self.training_counter, action[0], action[1], action[2]
                    ])

                if done:
                    if i % 10 == 0:
                        print('\nrollout ' + str(i) + '\n',
                              pd.DataFrame(log_data).max())
                    log_data = []
                    self.training_scores.append([i, R])
                    self.training_counter += 1

            self.agent.stop_episode()

            if i % self.monitor_freq == 0:

                # self.agent.stop_episode_and_train(obs, reward, done)

                # print('\n\nvalidation...')
                self.validate(i, self.training_counter, 'gold')
                if debug_printing: print('\n\n****************\nSOY BEANS\n\n')
                self.validate(i, self.training_counter, 'soybean_oil')
                if debug_printing: print('\n\n****************\nCORN\n\n')
                self.validate(i, self.training_counter, 'corn')
                # if debug_printing: print('\n\n****************\nCANADIAN DOLLAR\n\n')
                # self.validate(i, self.training_counter, 'canadian_dollar')

                if debug_printing: print('\n****************\n')

                act_probs = softmax(np.asarray(action_log)[:, 1:], axis=1)
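
                # Each action_log row is [training_counter, a0, a1, a2]; taking
                # a softmax over the three raw action components turns them
                # into pseudo-probabilities for the scatter/line plots below.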

                plt.figure()
                plt.scatter(np.asarray(self.training_scores)[:, 0],
                            np.asarray(self.training_scores)[:, 1],
                            s=2,
                            label='reward')
                plt.legend()
                plt.title('reward')
                plt.grid()
                plt.savefig(self.outdir + 'reward.pdf')
                plt.close()

                plt.figure()
                plt.scatter(np.asarray(action_log)[:, 0],
                            act_probs[:, 0],
                            label='action0')
                plt.scatter(np.asarray(action_log)[:, 0],
                            act_probs[:, 1],
                            label='action1')
                plt.scatter(np.asarray(action_log)[:, 0],
                            act_probs[:, 2],
                            label='action2')
                plt.legend()
                plt.title('actions')
                plt.grid()
                plt.savefig(self.outdir + 'actions.pdf')
                plt.close()

                plt.figure()
                plt.plot(np.asarray(action_log)[:, 0],
                         act_probs[:, 0],
                         label='action0')
                plt.plot(np.asarray(action_log)[:, 0],
                         act_probs[:, 1],
                         label='action1')
                plt.plot(np.asarray(action_log)[:, 0],
                         act_probs[:, 2],
                         label='action2')
                plt.legend()
                plt.title('actions')
                plt.grid()
                plt.savefig(self.outdir + 'actions_plot.pdf')
                plt.close()

            if i % 10 == 0 and i > 0:

                self.agent.save(self.outdir)

                serializers.save_npz(self.outdir + 'model.npz', self.model)

            # if i % 1000 == 0:
            #     print('\nepisode:', i, ' | episode length: ', t, '\nreward:', R,
            #           '\nstatistics:', self.agent.get_statistics(), '\n')

        self.agent.stop_episode_and_train(obs, reward, done)
        print('Finished.')
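

# A minimal usage sketch, assuming quandl API access and the Shanghai futures
# price feeds referenced in __init__ are available.
if __name__ == '__main__':
    trader = rl_stock_trader()
    trader.train()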
Example #14
0
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', type=int, default=-1)
    parser.add_argument('--env', type=str, default='Hopper-v2')
    parser.add_argument('--num-envs', type=int, default=1)
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--steps', type=int, default=10**6)
    parser.add_argument('--eval-interval', type=int, default=10000)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--standardize-advantages', action='store_true')
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--lr', type=float, default=3e-4)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--window-size', type=int, default=100)

    parser.add_argument('--update-interval', type=int, default=2048)
    parser.add_argument('--log-interval', type=int, default=1000)
    parser.add_argument('--batchsize', type=int, default=64)
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--entropy-coef', type=float, default=0.0)
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    # def make_env(process_idx, test):
    #     env = gym.make(args.env)
    #     # Use different random seeds for train and test envs
    #     process_seed = int(process_seeds[process_idx])
    #     env_seed = 2 ** 32 - 1 - process_seed if test else process_seed
    #     env.seed(env_seed)
    #     # Cast observations to float32 because our model uses float32
    #     env = chainerrl.wrappers.CastObservationToFloat32(env)
    #     if args.monitor:
    #         env = chainerrl.wrappers.Monitor(env, args.outdir)
    #     if not test:
    #         # Scale rewards (and thus returns) to a reasonable range so that
    #         # training is easier
    #         env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
    #     if args.render:
    #         env = chainerrl.wrappers.Render(env)
    #     return env

    def make_env(test):

        env = gym.make(
            "DaktyPushingSimulationEnv-v0",
            level=5,
            simulation_backend="mujoco",
            control_frequency_in_hertz=100,
            state_space_components_to_be_used=None,
            alternate_env_object=None,
            discretization_factor_torque_control_space=None,
            model_as_function_for_pixel_to_latent_space_parsing=(None, None))

        # print('\n############\n', env, '\n############\n')

        env.unwrapped.finger.set_resolution_quality('low')

        # print('\n############\n', env, '\n############\n')

        env = gym.wrappers.TimeLimit(env)

        # print('\n############\n', env, '\n############\n')

        # Unwrap TimeLimit wrapper
        assert isinstance(env, gym.wrappers.TimeLimit)
        env = env.env

        # Use different random seeds for train and test envs
        # env_seed = 2 ** 32 - 1 - args.seed if test else args.seed
        # env.seed(env_seed)

        process_seed = 420

        env_seed = 2**32 - 1 - process_seed if test else process_seed

        env.seed(env_seed)

        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        if args.render and not test:
            env = chainerrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        # make_env in this example only takes a `test` flag
        return chainerrl.envs.MultiprocessVectorEnv([
            functools.partial(make_env, test)
            for _ in range(args.num_envs)
        ])

    # Only for getting timesteps, and obs-action spaces
    sample_env = make_env(test=False)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    print('\n\n------------------- obs_space: ', obs_space.shape, '\n\n\n')

    # Normalize observations based on their empirical mean and variance
    obs_normalizer = chainerrl.links.EmpiricalNormalization(obs_space.low.size,
                                                            clip_threshold=5)

    winit_last = chainer.initializers.LeCunNormal(1e-2)

    action_size = action_space.low.size

    policy = chainer.Sequential(
        L.Linear(None, 64),
        F.tanh,
        L.Linear(None, 64),
        F.tanh,
        L.Linear(None, action_size, initialW=winit_last),
        chainerrl.policies.GaussianHeadWithStateIndependentCovariance(
            action_size=action_size,
            var_type='diagonal',
            var_func=lambda x: F.exp(2 * x),  # Parameterize log std
            var_param_init=0,  # log std = 0 => std = 1
        ))
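
    # winit_last = LeCunNormal(1e-2) keeps the final layer's weights small, so
    # the initial action means stay close to zero and early exploration is
    # driven mainly by the unit-variance Gaussian head (var_param_init=0).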

    # The value function only receives observations under PPO, so no
    # observation-action concatenation is needed here.
    vf = chainer.Sequential(
        L.Linear(None, 64),
        F.tanh,
        L.Linear(None, 64),
        F.tanh,
        L.Linear(None, 1),
    )

    # Combine a policy and a value function into a single model
    model = chainerrl.links.Branched(policy, vf)

    opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5)
    opt.setup(model)
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))
    agent = PPO(
        model,
        opt,
        obs_normalizer=obs_normalizer,
        gpu=args.gpu,
        update_interval=args.update_interval,
        minibatch_size=args.batchsize,
        epochs=args.epochs,
        clip_eps_vf=None,
        entropy_coef=args.entropy_coef,
        standardize_advantages=args.standardize_advantages,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        env = make_env(True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:

        env = make_env(False)

        n_episodes = 10000

        # pbar = tqdm(total=n_episodes)

        max_episode_len = 1000
        for i in range(1, n_episodes + 1):

            # pbar.update(1)

            obs = env.reset()
            # print('obs inital..............', obs.shape)
            reward = 0
            done = False
            R = 0  # return (sum of rewards)
            t = 0  # time step

            # pbar = tqdm(total=max_episode_len)

            while not done and t < max_episode_len:

                # pbar.update(1)

                # Uncomment to watch the behaviour
                # env.render()
                action = agent.act_and_train(obs, reward)
                # print('action..................', action)

                obs, reward, done, _ = env.step(action)
                # print('obs.....................', obs)
                # print('reward..................', reward)

                R += reward
                t += 1
            if i % 10 == 0:
                print('episode:', i, 'R:', R, 'statistics:',
                      agent.get_statistics())
            agent.stop_episode_and_train(obs, reward, done)
        print('Finished.')

        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.alpha = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=make_env(False),
            eval_env=make_env(True),
            outdir=args.outdir,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            log_interval=args.log_interval,
            return_window_size=args.window_size,
            max_episode_len=timestep_limit,
            save_best_so_far_agent=False,
            step_hooks=[
                lr_decay_hook,
            ],
        )
Example #15
0
class rl_stock_trader():
    def __init__(self,
                 path_to_symbol_csv,
                 request_symbols=8,
                 tb_outdir=tb_outdir):

        self.writer = SummaryWriter(tb_outdir)

        self.request_symbols = request_symbols
        self.monitor_freq = 100

        self.start_budget = 10000.

        index_df = pd.read_csv(path_to_symbol_csv)

        # symbol_vec = list(index_df.values[:self.request_symbols,0])

        symbol_vec = list(index_df.values[np.random.randint(
            0, index_df.values.shape[0], self.request_symbols), 0])

        self.dataframe, self.num_symbols = self.get_data(symbol_vec)

        # env = DummyVecEnv([lambda: StockTradingEnv(dataframe)])
        self.env = StockTradingEnv(self.dataframe, self.num_symbols)

        self.tb_action_type = np.zeros(3)
        self.tb_action_symbol = np.zeros(self.num_symbols)
        self.tb_action_vec = []
        self.tb_action_amount = []

        self.tb_balance = np.zeros(4)
        self.tb_net_worth = np.zeros(4)

        self.balance_dummy = []
        self.net_worth_dummy = []
        self.tb_reward = 0.

        self.tb_cache_reward_vec = []
        self.tb_cache_rollout_vec = []

        self.tb_cache_final_net = []
        self.tb_cache_final_balance = []

        self.tb_chache_balance = np.zeros(4)
        self.tb_chache_net_worth = np.zeros(4)

    def get_data(self,
                 symbols,
                 start=None,
                 end=None,
                 period='5y',
                 interval='1d'):
        '''Fetch price data with yfinance.
        valid periods: 1d,5d,1mo,3mo,6mo,1y,2y,5y,10y,ytd,max
        fetch data by interval (including intraday if period < 60 days)
        valid intervals: 1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo
        group by ticker (to access via data['SPY']) (optional, default is 'column')
        adjust all OHLC automatically
        download pre/post regular market hours data
        use threads for mass downloading (True/False/Integer)
        proxy URL scheme to use when downloading'''

        df_keys = ['Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume']

        if start is None or end is None:

            print('\nload S&P 500 data for period: ', period,
                  ' and interval: ', interval, '\n')

            data_array = yf.download(tickers=symbols,
                                     period=period,
                                     interval=interval,
                                     group_by='column',
                                     auto_adjust=True,
                                     prepost=False,
                                     threads=True,
                                     proxy=None)

        else:

            print('\nload S&P 500 data since: ', start, '/ end: ', end,
                  ' and interval: ', interval, '\n')

            data_array = yf.download(tickers=symbols,
                                     start=start,
                                     end=end,
                                     interval=interval,
                                     group_by='column',
                                     auto_adjust=True,
                                     prepost=False,
                                     threads=True,
                                     proxy=None)
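
        # With group_by='column' the returned frame has a column MultiIndex of
        # (field, ticker), e.g. data_array['Close'][symbol]; tickers that could
        # not be downloaded are detected and their columns dropped below.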

        called_symbols = list(data_array['Volume'].keys())
        try:
            failed_symbols = list(data_array['Adj Close'].keys())
        except KeyError:
            failed_symbols = []

        loaded_symbols = []

        for i in range(len(called_symbols)):
            if called_symbols[i] not in failed_symbols:
                loaded_symbols.append(called_symbols[i])

        for i in range(len(failed_symbols)):
            for j in range(len(df_keys)):
                data_array = data_array.drop(
                    columns=[(str(df_keys[j]), str(failed_symbols[i]))])

        data_array.insert(0, 'i', np.arange(data_array.shape[0]))

        data_index_axis = data_array.index.values
        data_array = data_array.drop(
            index=[data_index_axis[0], data_index_axis[-1]])

        # fillna/replace return new frames rather than modifying in place, so
        # the cleaned result must be assigned back: forward-fill gaps first,
        # then zero out anything still missing.
        data_array = data_array.fillna(method='pad').fillna(0.)



        print('\n------------------------------------',
              '\nsuccessfully loaded stock data',
              '\nnumber of loaded data points: ', data_array.shape[0],
              '\nnumber of loaded symbols: ', len(loaded_symbols), '/',
              len(called_symbols),
              '\n------------------------------------\n\n',
              '\ndataframe:\n', data_array,
              '\n------------------------------------\n\n')

        return data_array, len(loaded_symbols)

    def monitor_training(self, tb_writer, t, i, done, action, monitor_data):
        '''
        After each episode save:
            action_type [3 x 1]
            action_amount [1 x 1] (avg / t)
            action_symbol [num_symbols x 1]
            balance [4 x 1] (low, avg, high, final)
            net_worth [4 x 1] (low, avg, high, final)
        '''

        if t == 0:

            self.balance_dummy = []
            self.net_worth_dummy = []
            self.tb_reward = 0.

            if i == 0:

                self.tb_balance = np.zeros(4)
                self.tb_net_worth = np.zeros(4)

                self.tb_action_amount = []
                self.tb_action_symbol_vec = []

                self.tb_action_vec = []

                self.tb_cache_reward_vec = []
                self.tb_cache_rollout_vec = []

                self.tb_cache_final_net = np.zeros(4)
                self.tb_cache_final_balance = np.zeros(4)

        self.tb_action_symbol_vec.append(monitor_data['action_sym'])

        self.tb_action_amount.append(monitor_data['action_amount'])

        self.tb_action_vec.append(monitor_data['action_type'])

        self.tb_reward += monitor_data['reward']

        self.balance_dummy.append(monitor_data['balance'])
        self.net_worth_dummy.append(monitor_data['net_worth'])

        if done:

            self.tb_cache_reward_vec.append(self.tb_reward)

            self.tb_balance[0] = np.amin(self.balance_dummy)
            self.tb_balance[1] = np.mean(self.balance_dummy)
            self.tb_balance[2] = np.amax(self.balance_dummy)
            self.tb_balance[3] = self.balance_dummy[-1]

            self.tb_net_worth[0] = np.amin(self.net_worth_dummy)
            self.tb_net_worth[1] = np.mean(self.net_worth_dummy)
            self.tb_net_worth[2] = np.amax(self.net_worth_dummy)
            self.tb_net_worth[3] = self.net_worth_dummy[-1]

            self.tb_cache_rollout_vec.append(t)

            if np.ndim(self.tb_cache_final_balance) == 1:
                self.tb_cache_final_balance = np.reshape(
                    self.tb_balance, [1, -1])
                self.tb_cache_final_net = np.reshape(self.tb_net_worth,
                                                     [1, -1])
            else:
                self.tb_cache_final_balance = np.concatenate(
                    (self.tb_cache_final_balance,
                     np.reshape(self.tb_balance, [1, -1])),
                    axis=0)
                self.tb_cache_final_net = np.concatenate(
                    (self.tb_cache_final_net,
                     np.reshape(self.tb_net_worth, [1, -1])),
                    axis=0)
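
            # Per-episode [low, avg, high, final] rows are cached here and only
            # flushed to TensorBoard every monitor_freq episodes below, which
            # smooths the scalar plots and gives the histograms a reasonable
            # sample size.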

            if i % self.monitor_freq == 0 and i != 0:

                tb_writer.add_scalar('training/reward',
                                     np.mean(self.tb_cache_reward_vec), i)
                tb_writer.add_scalar('training/rollout',
                                     np.mean(self.tb_cache_rollout_vec), i)

                tb_writer.add_scalar(
                    'balance/low', np.mean(self.tb_cache_final_balance[:, 0]),
                    i)
                tb_writer.add_scalar(
                    'balance/avg', np.mean(self.tb_cache_final_balance[:, 1]),
                    i)
                tb_writer.add_scalar(
                    'balance/high', np.mean(self.tb_cache_final_balance[:, 2]),
                    i)
                tb_writer.add_scalar(
                    'balance/final',
                    np.mean(self.tb_cache_final_balance[:, 3]), i)

                tb_writer.add_scalar('net_worth/low',
                                     np.mean(self.tb_cache_final_net[:, 0]), i)
                tb_writer.add_scalar('net_worth/avg',
                                     np.mean(self.tb_cache_final_net[:, 1]), i)
                tb_writer.add_scalar('net_worth/high',
                                     np.mean(self.tb_cache_final_net[:, 2]), i)
                tb_writer.add_scalar('net_worth/final',
                                     np.mean(self.tb_cache_final_net[:, 3]), i)
                tb_writer.add_scalar(
                    'net_worth/profit',
                    np.mean(self.tb_cache_final_net[:, 3] - self.start_budget),
                    i)

                tb_writer.add_histogram('training_stats/reward',
                                        np.asarray(self.tb_cache_reward_vec),
                                        i)
                tb_writer.add_histogram('training_stats/rollout',
                                        np.asarray(self.tb_cache_rollout_vec),
                                        i)

                tb_writer.add_histogram(
                    'performance_stats/final_balance',
                    np.asarray(self.tb_cache_final_balance[:, -1]), i)
                tb_writer.add_histogram(
                    'performance_stats/final_net_worth',
                    np.asarray(self.tb_cache_final_net[:, -1]), i)
                tb_writer.add_histogram(
                    'performance_stats/profit',
                    np.asarray(self.tb_cache_final_net[:, -1] -
                               self.start_budget), i)

                tb_writer.add_histogram('action/type',
                                        np.asarray(self.tb_action_vec), i)
                tb_writer.add_histogram('action/symbol',
                                        np.asarray(self.tb_action_symbol_vec),
                                        i)
                tb_writer.add_histogram('action/action_amount',
                                        np.asarray(self.tb_action_amount), i)

                self.tb_cache_reward_vec = []
                self.tb_cache_rollout_vec = []

                self.tb_cache_final_net = np.zeros(4)
                self.tb_cache_final_balance = np.zeros(4)

                self.tb_action_vec = []

                self.tb_action_symbol_vec = []

                self.tb_action_amount = []

                self.tb_balance = np.zeros(4)
                self.tb_net_worth = np.zeros(4)

    def rl_agent(self, env):

        self.policy = chainer.Sequential(
            L.Linear(None, 256),
            F.tanh,
            L.Linear(None, 128),
            F.tanh,
            # L.Linear(None, env.action_space.low.size, initialW=winit_last),
            L.Linear(None, env.action_space.low.size),
            # F.sigmoid,
            chainerrl.policies.GaussianHeadWithStateIndependentCovariance(
                action_size=env.action_space.low.size,
                var_type='diagonal',
                var_func=lambda x: F.exp(2 * x),  # Parameterize log std
                # var_param_init=0,  # log std = 0 => std = 1
            ))

        self.vf = chainer.Sequential(
            L.Linear(None, 256),
            F.tanh,
            L.Linear(None, 128),
            F.tanh,
            L.Linear(None, 1),
        )

        # Combine a policy and a value function into a single model
        self.model = chainerrl.links.Branched(self.policy, self.vf)

        self.opt = chainer.optimizers.Adam(alpha=3e-4, eps=1e-5)
        self.opt.setup(self.model)

        self.agent = PPO(
            self.model,
            self.opt,
            # obs_normalizer=obs_normalizer,
            gpu=-1,
            update_interval=512,
            minibatch_size=8,
            clip_eps_vf=None,
            entropy_coef=0.001,
            # standardize_advantages=args.standardize_advantages,
        )

        return self.agent

    def train(self):

        print('\nstart training loop\n')

        def check_types(input, inputname):
            if np.isnan(input).any():
                print('----> ', inputname, ' array contains NaN\n',
                      np.isnan(input).shape, '\n')
            if np.isinf(input).any():
                print('----> ', inputname, ' array contains inf\n',
                      np.isinf(input).shape, '\n')

        self.agent = self.rl_agent(self.env)

        n_episodes = 1000000
        max_episode_len = 1000

        for i in range(0, n_episodes + 1):

            obs = self.env.reset()

            reward = 0
            done = False
            R = 0  # return (sum of rewards)
            t = 0  # time step

            while not done and t < max_episode_len:

                # Uncomment to watch the behaviour
                # self.env.render()
                action = self.agent.act_and_train(obs, reward)
                check_types(action, 'action')

                obs, reward, done, _, monitor_data = self.env.step(action)
                check_types(obs, 'obs')
                check_types(reward, 'reward')

                self.monitor_training(self.writer, t, i, done, action,
                                      monitor_data)

                R += reward
                t += 1

                if done: print(' training at episode ' + str(i), end='\r')

            if i % 100 == 0 and i > 0:

                self.agent.save(model_outdir)

                serializers.save_npz(model_outdir + 'model.npz', self.model)

            # if i % 1000 == 0:
            #     print('\nepisode:', i, ' | episode length: ', t, '\nreward:', R,
            #           '\nstatistics:', self.agent.get_statistics(), '\n')

        self.agent.stop_episode_and_train(obs, reward, done)
        print('Finished.')
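

# A minimal usage sketch; 'sp500_symbols.csv' is a hypothetical one-column CSV
# of ticker symbols, and tb_outdir is assumed to be defined at module level.
if __name__ == '__main__':
    trader = rl_stock_trader('sp500_symbols.csv', request_symbols=8)
    trader.train()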
Example #16
0
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--env', type=str, default='Hopper-v2')
    parser.add_argument('--num-envs', type=int, default=1)
    parser.add_argument('--arch',
                        type=str,
                        default='FFGaussian',
                        choices=('FFSoftmax', 'FFMellowmax', 'FFGaussian'))
    parser.add_argument('--bound-mean', action='store_true')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--steps', type=int, default=10**6)
    parser.add_argument('--eval-interval', type=int, default=10000)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--standardize-advantages', action='store_true')
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--lr', type=float, default=3e-4)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--window-size', type=int, default=100)

    parser.add_argument('--update-interval', type=int, default=2048)
    parser.add_argument('--log-interval', type=int, default=1000)
    parser.add_argument('--batchsize', type=int, default=64)
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--entropy-coef', type=float, default=0.0)
    args = parser.parse_args()

    #logging.basicConfig(level=args.logger_level)

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        return chainerrl.envs.MultiprocessVectorEnv([
            (lambda idx=idx: make_env(idx, test))  # bind idx now, not at call time
            for idx in range(args.num_envs)
        ])

    # Only for getting timesteps, and obs-action spaces
    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    # Normalize observations based on their empirical mean and variance
    obs_normalizer = chainerrl.links.EmpiricalNormalization(obs_space.low.size,
                                                            clip_threshold=5)

    # Switch policy types accordingly to action space types
    if args.arch == 'FFSoftmax':
        model = A3CFFSoftmax(obs_space.low.size, action_space.n)
    elif args.arch == 'FFMellowmax':
        model = A3CFFMellowmax(obs_space.low.size, action_space.n)
    elif args.arch == 'FFGaussian':
        model = A3CFFGaussian(obs_space.low.size,
                              action_space,
                              bound_mean=args.bound_mean)

    opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5)
    opt.setup(model)
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))
    agent = PPO(
        model,
        opt,
        obs_normalizer=obs_normalizer,
        gpu=args.gpu,
        update_interval=args.update_interval,
        minibatch_size=args.batchsize,
        epochs=args.epochs,
        clip_eps_vf=None,
        entropy_coef=args.entropy_coef,
        standardize_advantages=args.standardize_advantages,
    )

    if args.load:
        agent.load(args.load)

    # Linearly decay the learning rate to zero
    def lr_setter(env, agent, value):
        agent.optimizer.alpha = value

    lr_decay_hook = experiments.LinearInterpolationHook(
        args.steps, args.lr, 0, lr_setter)

    # Linearly decay the clipping parameter to zero
    def clip_eps_setter(env, agent, value):
        agent.clip_eps = value

    clip_eps_decay_hook = experiments.LinearInterpolationHook(
        args.steps, 0.2, 0, clip_eps_setter)

    experiments.train_agent_batch_with_evaluation(
        agent=agent,
        env=make_batch_env(False),
        eval_env=make_batch_env(True),
        outdir=args.outdir,
        steps=args.steps,
        eval_n_runs=args.eval_n_runs,
        eval_interval=args.eval_interval,
        log_interval=args.log_interval,
        return_window_size=args.window_size,
        max_episode_len=timestep_limit,
        save_best_so_far_agent=False,
        step_hooks=[
            lr_decay_hook,
            clip_eps_decay_hook,
        ],
    )
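
The two `LinearInterpolationHook` step hooks above anneal the learning rate and the PPO clipping range linearly from their initial values down to zero over `args.steps` environment steps. A minimal sketch of the schedule those hooks produce (the helper below is only illustrative and is not part of ChainerRL):

def linear_schedule(step, total_steps, start_value, stop_value):
    # Value a LinearInterpolationHook-style schedule would pass to its setter
    # after `step` of `total_steps` training steps.
    frac = min(step / total_steps, 1.0)
    return start_value + (stop_value - start_value) * frac

# Halfway through training the learning rate has fallen to half of args.lr
# and clip_eps has fallen from 0.2 to 0.1; both reach 0 at the final step.
print(linear_schedule(50, 100, 3e-4, 0.0))   # 0.00015
print(linear_schedule(50, 100, 0.2, 0.0))    # 0.1
print(linear_schedule(100, 100, 0.2, 0.0))   # 0.0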
Example #17
0
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', type=int, default=0,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--env', type=str, default='Hopper-v2',
                        help='OpenAI Gym MuJoCo env to perform algorithm on.')
    parser.add_argument('--num-envs', type=int, default=1,
                        help='Number of envs run in parallel.')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--steps', type=int, default=2 * 10 ** 6,
                        help='Total number of timesteps to train the agent.')
    parser.add_argument('--eval-interval', type=int, default=100000,
                        help='Interval in timesteps between evaluations.')
    parser.add_argument('--eval-n-runs', type=int, default=100,
                        help='Number of episodes run for each evaluation.')
    parser.add_argument('--render', action='store_true',
                        help='Render env states in a GUI window.')
    parser.add_argument('--demo', action='store_true',
                        help='Just run evaluation, not training.')
    parser.add_argument('--load', type=str, default='',
                        help='Directory to load agent from.')
    parser.add_argument('--logger-level', type=int, default=logging.INFO,
                        help='Level of the root logger.')
    parser.add_argument('--monitor', action='store_true',
                        help='Wrap env with gym.wrappers.Monitor.')
    parser.add_argument('--log-interval', type=int, default=1000,
                        help='Interval in timesteps between outputting log'
                             ' messages during training')
    parser.add_argument('--update-interval', type=int, default=2048,
                        help='Interval in timesteps between model updates.')
    parser.add_argument('--epochs', type=int, default=10,
                        help='Number of epochs to update model for per PPO'
                             ' iteration.')
    parser.add_argument('--batch-size', type=int, default=64,
                        help='Minibatch size')
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2 ** 32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2 ** 32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        return chainerrl.envs.MultiprocessVectorEnv(
            [functools.partial(make_env, idx, test)
             for idx in range(args.num_envs)])

    # Only for getting timesteps, and obs-action spaces
    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space
    print('Observation space:', obs_space)
    print('Action space:', action_space)

    assert isinstance(action_space, gym.spaces.Box)

    # Normalize observations based on their empirical mean and variance
    obs_normalizer = chainerrl.links.EmpiricalNormalization(
        obs_space.low.size, clip_threshold=5)

    # While the original paper initialized weights by normal distribution,
    # we use orthogonal initialization as the latest openai/baselines does.
    winit = chainerrl.initializers.Orthogonal(1.)
    winit_last = chainerrl.initializers.Orthogonal(1e-2)

    action_size = action_space.low.size
    policy = chainer.Sequential(
        L.Linear(None, 64, initialW=winit),
        F.tanh,
        L.Linear(None, 64, initialW=winit),
        F.tanh,
        L.Linear(None, action_size, initialW=winit_last),
        chainerrl.policies.GaussianHeadWithStateIndependentCovariance(
            action_size=action_size,
            var_type='diagonal',
            var_func=lambda x: F.exp(2 * x),  # Parameterize log std
            var_param_init=0,  # log std = 0 => std = 1
        ),
    )

    vf = chainer.Sequential(
        L.Linear(None, 64, initialW=winit),
        F.tanh,
        L.Linear(None, 64, initialW=winit),
        F.tanh,
        L.Linear(None, 1, initialW=winit),
    )

    # Combine a policy and a value function into a single model
    model = chainerrl.links.Branched(policy, vf)
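    # Branched applies each child link to the same input and returns a tuple of
    # their outputs, here (action_distribution, state_value), which is the
    # model interface PPO expects.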

    opt = chainer.optimizers.Adam(3e-4, eps=1e-5)
    opt.setup(model)

    agent = PPO(
        model,
        opt,
        obs_normalizer=obs_normalizer,
        gpu=args.gpu,
        update_interval=args.update_interval,
        minibatch_size=args.batch_size,
        epochs=args.epochs,
        clip_eps_vf=None,
        entropy_coef=0,
        standardize_advantages=True,
        gamma=0.995,
        lambd=0.97,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        env = make_batch_env(True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=make_batch_env(False),
            eval_env=make_batch_env(True),
            outdir=args.outdir,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            log_interval=args.log_interval,
            max_episode_len=timestep_limit,
            save_best_so_far_agent=False,
        )
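
In the Gaussian policy head above, the state-independent covariance parameter is a log standard deviation: `var_param_init=0` together with `var_func=lambda x: F.exp(2 * x)` gives a variance of exp(0) = 1, i.e. a unit standard deviation at initialization. A quick standalone check of that parameterization in plain NumPy:

import numpy as np

def variance_from_log_std(log_std):
    # Mirrors var_func = lambda x: F.exp(2 * x): the parameter is log(std),
    # so the variance is exp(2 * log_std) = std ** 2.
    return np.exp(2 * log_std)

print(variance_from_log_std(0.0))            # 1.0 -> std = 1 at var_param_init=0
print(np.sqrt(variance_from_log_std(-1.0)))  # ~0.37 -> std shrinks as the parameter decreases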
Example #18
0
            chainerrl.distribution.SoftmaxDistribution,
        ),
        L.Linear(None, 1),
    ))

opt = chainer.optimizers.Adam()
opt.setup(model)
opt.add_hook(chainer.optimizer.GradientClipping(0.5))


def phi(x):
    # Feature extractor
    return np.asarray(x, dtype=np.float32)


agent = PPO(model, opt, phi=phi)

# experiments.train_agent_with_evaluation(
#     agent=agent,
#     steps=2000,
#     env=env,
#     eval_n_steps=None,
#     eval_max_episode_len=100,
#     eval_n_episodes=5,
#     eval_interval=3,
#     outdir="test2"
# )

# Set the discount factor that discounts future rewards.
gamma = 0.95
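
Example #18 is truncated above the `SoftmaxDistribution` line, so most of the model definition is missing. A minimal self-contained sketch of the kind of discrete-action policy/value model that fragment belongs to (the 64-unit hidden layers and the `n_actions` value are assumptions, and the original likely also put a shared feature extractor in front of the two heads):

import chainer
import chainer.functions as F
import chainer.links as L
import chainerrl

n_actions = 4  # assumed; normally taken from env.action_space.n

model = chainerrl.links.Branched(
    # Policy head: logits -> categorical action distribution
    chainer.Sequential(
        L.Linear(None, 64),
        F.tanh,
        L.Linear(None, n_actions),
        chainerrl.distribution.SoftmaxDistribution,
    ),
    # Value head: scalar state-value estimate
    L.Linear(None, 1),
)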
Example #19
0
def main():

    # Prevent numpy from using multiple threads
    os.environ['OMP_NUM_THREADS'] = '1'

    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('rom', type=str)
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--outdir', type=str, default=None)
    parser.add_argument('--use-sdl', action='store_true')
    parser.add_argument('--max-episode-len', type=int, default=10000)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--lr', type=float, default=2.5e-4)

    parser.add_argument('--eval-interval', type=int, default=10**6)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--standardize-advantages', action='store_true')
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')

    # In the original paper, agent runs in 8 environments parallely
    # and samples 128 steps per environment.
    # Sample 128 * 8 steps, instead.
    parser.add_argument('--update-interval', type=int, default=128 * 8)

    parser.add_argument('--batchsize', type=int, default=32)
    parser.add_argument('--epochs', type=int, default=3)
    parser.set_defaults(use_sdl=False)
    args = parser.parse_args()

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    n_actions = ale.ALE(args.rom).number_of_actions

    model = A3CFF(n_actions)
    opt = chainer.optimizers.Adam(alpha=args.lr)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))
    agent = PPO(
        model,
        opt,
        gpu=args.gpu,
        phi=dqn_phi,
        update_interval=args.update_interval,
        minibatch_size=args.batchsize,
        epochs=args.epochs,
        clip_eps=0.1,
        clip_eps_vf=None,
        standardize_advantages=args.standardize_advantages,
    )
    if args.load:
        agent.load(args.load)

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = 2**31 - 1 - args.seed if test else args.seed
        env = ale.ALE(args.rom,
                      use_sdl=args.use_sdl,
                      treat_life_lost_as_terminal=not test,
                      seed=env_seed)
        if not test:
            misc.env_modifiers.make_reward_clipped(env, -1, 1)
        return env

    if args.demo:
        env = make_env(True)
        eval_stats = experiments.eval_performance(env=env,
                                                  agent=agent,
                                                  n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.alpha = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        # Linearly decay the clipping parameter to zero
        def clip_eps_setter(env, agent, value):
            agent.clip_eps = value

        clip_eps_decay_hook = experiments.LinearInterpolationHook(
            args.steps, 0.1, 0, clip_eps_setter)

        experiments.train_agent_with_evaluation(
            agent=agent,
            env=make_env(False),
            eval_env=make_env(True),
            outdir=args.outdir,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            max_episode_len=args.max_episode_len,
            step_hooks=[
                lr_decay_hook,
                clip_eps_decay_hook,
            ],
        )
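
`misc.env_modifiers.make_reward_clipped(env, -1, 1)` above clips training rewards into [-1, 1] so gradient magnitudes stay comparable across ALE games. A sketch of the same effect written as an ordinary Gym wrapper (illustrative only, not the ChainerRL helper itself):

import gym
import numpy as np


class ClipReward(gym.RewardWrapper):
    """Clip every reward into [lo, hi] during training."""

    def __init__(self, env, lo=-1.0, hi=1.0):
        super().__init__(env)
        self.lo = lo
        self.hi = hi

    def reward(self, reward):
        return float(np.clip(reward, self.lo, self.hi))

# Usage: env = ClipReward(gym.make('PongNoFrameskip-v4'), -1.0, 1.0)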
Example #20
0
def main(args, train_env):
    logging.basicConfig(level=args.logger_level)

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))
    if not (args.demo and args.load):
        args.outdir = experiments.prepare_output_dir(args, args.outdir)
    temp = args.outdir.split('/')[-1]
    # Parent directory of the run-specific output dir (keeps the trailing '/')
    dst = args.outdir[:-len(temp)]

    def make_env(test):
        env = gym.make(args.env)
        if test:
            episode_length = args.eval_episode_length
        else:
            episode_length = args.episode_length

        env.initialize_environment(
            case=args.state_rep,
            n_historical_events=args.n_historical_events,
            episode_length=episode_length,
            n_experts=args.n_experts,
            n_demos_per_expert=1,
            n_expert_time_steps=args.length_expert_TS,
            seed_agent=args.seed_agent,
            seed_expert=args.seed_expert,
            adam_days=args.adam_days)

        # Use different random seeds for train and test envs
        env_seed = 2**32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    sample_env = gym.make(args.env)
    sample_env.initialize_environment(
        case=args.state_rep,
        n_historical_events=args.n_historical_events,
        episode_length=args.episode_length,
        n_experts=args.n_experts,
        n_demos_per_expert=1,
        n_expert_time_steps=args.length_expert_TS,
        seed_agent=args.seed_agent,
        seed_expert=args.seed_expert,
        adam_days=args.adam_days)
    demonstrations = sample_env.generate_expert_trajectories(out_dir=dst,
                                                             eval=False)
    timestep_limit = None  #sample_env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')  # This value is None

    # Generate expert data for evaluation
    temp_env = gym.make(args.env)
    temp_env.initialize_environment(
        case=args.state_rep,
        n_historical_events=args.n_historical_events,
        episode_length=0,  # This parameter does not really matter since we create this env only for generating samples
        n_experts=args.n_experts,
        n_demos_per_expert=1,  # We do not perform any clustering right now
        # n_demos_per_expert=args.n_demos_per_expert,  # How large should the expert cluster be?
        n_expert_time_steps=args.eval_episode_length,  # How long should each expert trajectory be?
        seed_expert=args.seed_expert,
        adam_days=args.adam_days)
    temp_env.generate_expert_trajectories(out_dir=dst, eval=True)

    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    # Normalize observations based on their empirical mean and variance
    if args.state_rep == 1:
        obs_dim = obs_space.low.size
    elif args.state_rep in (2, 21, 22, 24, 4, 221, 222, 71, 17, 81):
        obs_dim = obs_space.n
    elif args.state_rep in (3, 11, 23, 31, 7):
        obs_dim = obs_space.nvec.size
    else:
        raise NotImplementedError

    if args.normalize_obs:
        obs_normalizer = chainerrl.links.EmpiricalNormalization(
            obs_dim,
            clip_threshold=5)  # shape: Shape of input values except batch axis
    else:
        obs_normalizer = None
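    # EmpiricalNormalization keeps running estimates of the observation mean and
    # variance and clips the normalized values to [-clip_threshold, clip_threshold].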

    # Switch policy types accordingly to action space types
    if args.arch == 'FFSoftmax':
        model = A3CFFSoftmax(obs_dim,
                             action_space.n,
                             hidden_sizes=args.G_layers)
    elif args.arch == 'FFMellowmax':
        model = A3CFFMellowmax(obs_space.low.size, action_space.n)
    elif args.arch == 'FFGaussian':
        model = A3CFFGaussian(obs_space.low.size,
                              action_space,
                              bound_mean=args.bound_mean)

    opt = chainer.optimizers.Adam(alpha=args.lr, eps=10e-1)  # note: eps=10e-1 is 1.0, far larger than the 1e-5 used in the other examples
    opt.setup(model)

    if args.show_D_dummy:  # Let discriminator see dummy
        input_dim_D = obs_dim + 1
    else:  # Do not let discriminator see dummy
        if args.state_rep in (21, 17):
            input_dim_D = obs_dim + 1
        else:
            input_dim_D = obs_dim + 1 - args.n_experts

    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))
    if args.algo == 'ppo':
        agent = PPO(
            model,
            opt,
            obs_normalizer=obs_normalizer,
            gpu=args.gpu,
            update_interval=args.update_interval,
            minibatch_size=args.batchsize,
            epochs=args.epochs,
            clip_eps_vf=None,
            entropy_coef=args.entropy_coef,
            standardize_advantages=args.standardize_advantages,
        )
    elif args.algo == 'gail':
        from customer_behaviour.algorithms.irl.gail import GAIL as G
        from customer_behaviour.algorithms.irl.gail import Discriminator as D

        demonstrations = np.load(dst + '/expert_trajectories.npz')
        D = D(gpu=args.gpu,
              input_dim=input_dim_D,
              hidden_sizes=args.D_layers,
              loss_type=args.loss_type)

        agent = G(env=sample_env,
                  demonstrations=demonstrations,
                  discriminator=D,
                  model=model,
                  optimizer=opt,
                  obs_normalizer=obs_normalizer,
                  gpu=args.gpu,
                  update_interval=args.update_interval,
                  minibatch_size=args.batchsize,
                  epochs=args.epochs,
                  clip_eps_vf=None,
                  entropy_coef=args.entropy_coef,
                  standardize_advantages=args.standardize_advantages,
                  args=args)

    elif args.algo == 'airl':
        from customer_behaviour.algorithms.irl.airl import AIRL as G
        from customer_behaviour.algorithms.irl.airl import Discriminator as D
        # obs_normalizer = None
        demonstrations = np.load(dst + '/expert_trajectories.npz')
        D = D(gpu=args.gpu,
              input_dim=input_dim_D - 1,
              hidden_sizes=args.D_layers)  # AIRL only inputs state to D

        agent = G(env=sample_env,
                  demonstrations=demonstrations,
                  discriminator=D,
                  model=model,
                  optimizer=opt,
                  obs_normalizer=obs_normalizer,
                  gpu=args.gpu,
                  update_interval=args.update_interval,
                  minibatch_size=args.batchsize,
                  epochs=args.epochs,
                  clip_eps_vf=None,
                  entropy_coef=args.entropy_coef,
                  standardize_advantages=args.standardize_advantages,
                  noise=args.noise,
                  n_experts=args.n_experts,
                  episode_length=args.episode_length,
                  adam_days=args.adam_days,
                  dummy_D=args.show_D_dummy)

    elif args.algo == 'mmct-gail':
        from customer_behaviour.algorithms.irl.gail.mmct_gail import MMCTGAIL as G
        from customer_behaviour.algorithms.irl.gail import Discriminator as D

        demonstrations = np.load(dst + '/expert_trajectories.npz')
        D = D(gpu=args.gpu,
              input_dim=input_dim_D,
              hidden_sizes=args.D_layers,
              loss_type=args.loss_type)

        agent = G(env=sample_env,
                  demonstrations=demonstrations,
                  discriminator=D,
                  model=model,
                  optimizer=opt,
                  obs_normalizer=obs_normalizer,
                  gpu=args.gpu,
                  update_interval=args.update_interval,
                  minibatch_size=args.batchsize,
                  epochs=args.epochs,
                  clip_eps_vf=None,
                  entropy_coef=args.entropy_coef,
                  standardize_advantages=args.standardize_advantages,
                  args=args)

    if args.load:
        # By default, not in here
        agent.load(args.load)

    if args.demo:
        # By default, not in here
        env = make_env(True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
        outdir = args.load if args.load else args.outdir
        save_agent_demo(make_env(False), agent, outdir)
    else:
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.alpha = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        # Linearly decay the clipping parameter to zero
        def clip_eps_setter(env, agent, value):
            agent.clip_eps = max(value, 1e-8)
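            # The 1e-8 floor keeps the clipping range strictly positive at the end
            # of training, unlike the earlier examples that decay it to exactly 0.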

        clip_eps_decay_hook = experiments.LinearInterpolationHook(
            args.steps, 0.2, 0, clip_eps_setter)

        if train_env is None:
            experiments.train_agent_with_evaluation(
                agent=agent,
                env=make_env(False),  # Environment to train the agent against (False -> scaled rewards)
                eval_env=make_env(True),  # Environment used for evaluation
                outdir=args.outdir,
                steps=args.steps,  # Total number of training timesteps (args.n_training_episodes * args.episode_length)
                eval_n_steps=None,  # Number of timesteps at each evaluation phase
                eval_n_episodes=args.eval_n_runs,  # Number of episodes at each evaluation phase (default: 10)
                eval_interval=args.eval_interval,  # Interval between evaluations (default: 10000 steps?)
                train_max_episode_len=timestep_limit,  # Maximum episode length during training (None here)
                save_best_so_far_agent=False,
                step_hooks=[
                    lr_decay_hook,
                    clip_eps_decay_hook,
                ],
                checkpoint_freq=args.eval_interval)
        else:
            experiments.train_agent_batch_with_evaluation(
                agent=agent,
                env=train_env,
                steps=args.steps,
                eval_n_steps=None,
                eval_n_episodes=args.eval_n_runs,
                eval_interval=args.eval_interval,
                outdir=args.outdir,
                max_episode_len=timestep_limit,
                eval_max_episode_len=None,
                eval_env=make_env(True),
                step_hooks=[
                    lr_decay_hook,
                    clip_eps_decay_hook,
                ],
                save_best_so_far_agent=False,
                checkpoint_freq=args.eval_interval,
                log_interval=args.update_interval)

        save_agent_demo(
            make_env(True), agent, args.outdir, 10 * args.eval_episode_length
        )  # originally it was make_env(test=False) which seems strange

    # Move result files to correct folder and remove empty folder
    move_dir(args.outdir, dst)
    os.rmdir(args.outdir)

    if args.save_results:
        print('Saving result...')
        res2.save_data(dst, 10000, 50, N=1)

        print('Running evaluate policy...')
        ep.eval_policy(a_dir_path=dst)

    # else:
    #     if args.n_experts <= 10:
    #         print('Running evaluate policy...')
    #         ep.eval_policy(a_dir_path=dst)
    #         # print('Running evaluate training...')
    #         # ets.eval_training(a_dir_path=dst)
    #         print('Done')

    if args.save_report_material:
        print('Saving dataframe...')
        if args.state_rep == 21:
            if args.algo == 'gail':
                folder_name = 'gail'
            elif args.algo == 'airl':
                folder_name = 'airl'
        elif args.state_rep == 22:
            if args.algo == 'gail':
                folder_name = 'gail_dummies'
            elif args.algo == 'airl':
                folder_name = 'airl_dummies'
        elif args.state_rep == 81:
            if args.algo == 'gail':
                folder_name = 'gail_adams'
            elif args.algo == 'airl':
                folder_name = 'airl_adams'
        elif args.state_rep == 17:
            folder_name = 'ail'
        elif args.state_rep == 221:
            folder_name = 'ail_dummies'
        elif args.state_rep == 71:
            folder_name = 'ail_adams'
        else:
            raise NotImplementedError

        report_material.save_df(dst, folder_name)

    if args.save_folder is not None:
        print('Saving result to ' + args.save_folder)
        os.makedirs(os.path.join(os.getcwd(), args.save_folder), exist_ok=True)
        from distutils.dir_util import copy_tree
        copy_tree(
            os.path.join(os.getcwd(), dst),
            os.path.join(os.getcwd(), args.save_folder,
                         args.outdir.split('/')[-2]))