def rl_agent(self, env):
    self.policy = chainer.Sequential(
        L.Linear(None, 256),
        F.tanh,
        L.Linear(None, 128),
        F.tanh,
        # L.Linear(None, env.action_space.low.size, initialW=winit_last),
        L.Linear(None, env.action_space.low.size),
        # F.sigmoid,
        chainerrl.policies.GaussianHeadWithStateIndependentCovariance(
            action_size=env.action_space.low.size,
            var_type='diagonal',
            var_func=lambda x: F.exp(2 * x),  # Parameterize log std
            # var_param_init=0,  # log std = 0 => std = 1
        ))
    self.vf = chainer.Sequential(
        L.Linear(None, 256),
        F.tanh,
        L.Linear(None, 128),
        F.tanh,
        L.Linear(None, 1),
    )
    # Combine a policy and a value function into a single model
    self.model = chainerrl.links.Branched(self.policy, self.vf)
    self.opt = chainer.optimizers.Adam(alpha=3e-4, eps=1e-5)
    self.opt.setup(self.model)
    self.agent = PPO(
        self.model,
        self.opt,
        # obs_normalizer=obs_normalizer,
        gpu=-1,
        update_interval=512,
        minibatch_size=8,
        clip_eps_vf=None,
        entropy_coef=0.001,
        # standardize_advantages=args.standardize_advantages,
    )
    return self.agent
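# Minimal usage sketch (an addition, not part of the original snippet) of how
# the agent returned by rl_agent above can be driven without the chainerrl
# `experiments` helpers.  It assumes a standard gym-style env whose reset()
# returns an observation and whose step(action) returns
# (obs, reward, done, info) with float32-compatible observations.
def run_one_episode(agent, env, train=True):
    obs = env.reset()
    reward = 0.0
    done = False
    episode_return = 0.0
    while not done:
        # act_and_train collects transitions for PPO updates; act is inference only
        action = agent.act_and_train(obs, reward) if train else agent.act(obs)
        obs, reward, done, _ = env.step(action)
        episode_return += reward
    if train:
        agent.stop_episode_and_train(obs, reward, done)
    else:
        agent.stop_episode()
    return episode_return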
def main(): import logging parser = argparse.ArgumentParser() parser.add_argument('algo', default='ppo', choices=['ppo', 'gail', 'airl'], type=str) parser.add_argument('--gpu', type=int, default=0) parser.add_argument('--env', type=str, default='Hopper-v2') parser.add_argument('--arch', type=str, default='FFGaussian', choices=('FFSoftmax', 'FFMellowmax', 'FFGaussian')) parser.add_argument('--bound-mean', action='store_true') parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 32)') parser.add_argument('--outdir', type=str, default='results', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--steps', type=int, default=10 ** 6) parser.add_argument('--eval-interval', type=int, default=10000) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--reward-scale-factor', type=float, default=1e-2) parser.add_argument('--standardize-advantages', action='store_true') parser.add_argument('--render', action='store_true', default=False) parser.add_argument('--lr', type=float, default=3e-4) parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') parser.add_argument('--load_demo', type=str, default='') parser.add_argument('--logger-level', type=int, default=logging.DEBUG) parser.add_argument('--monitor', action='store_true') parser.add_argument('--update-interval', type=int, default=2048) parser.add_argument('--batchsize', type=int, default=64) parser.add_argument('--epochs', type=int, default=10) parser.add_argument('--entropy-coef', type=float, default=0.0) args = parser.parse_args() logging.basicConfig(level=args.logger_level) # Set a random seed used in ChainerRL misc.set_random_seed(args.seed, gpus=(args.gpu,)) if not (args.demo and args.load): args.outdir = experiments.prepare_output_dir(args, args.outdir) def make_env(test): env = gym.make(args.env) # Use different random seeds for train and test envs env_seed = 2 ** 32 - 1 - args.seed if test else args.seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = chainerrl.wrappers.CastObservationToFloat32(env) if args.monitor: env = gym.wrappers.Monitor(env, args.outdir) if not test: # Scale rewards (and thus returns) to a reasonable range so that # training is easier env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor) if args.render: env = chainerrl.wrappers.Render(env) return env sample_env = gym.make(args.env) timestep_limit = sample_env.spec.tags.get( 'wrapper_config.TimeLimit.max_episode_steps') obs_space = sample_env.observation_space action_space = sample_env.action_space # Normalize observations based on their empirical mean and variance obs_normalizer = chainerrl.links.EmpiricalNormalization( obs_space.low.size, clip_threshold=5) # Switch policy types accordingly to action space types if args.arch == 'FFSoftmax': model = A3CFFSoftmax(obs_space.low.size, action_space.n) elif args.arch == 'FFMellowmax': model = A3CFFMellowmax(obs_space.low.size, action_space.n) elif args.arch == 'FFGaussian': model = A3CFFGaussian(obs_space.low.size, action_space, bound_mean=args.bound_mean) opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5) opt.setup(model) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) if args.algo == 'ppo': agent = PPO(model, opt, obs_normalizer=obs_normalizer, gpu=args.gpu, update_interval=args.update_interval, 
minibatch_size=args.batchsize, epochs=args.epochs, clip_eps_vf=None, entropy_coef=args.entropy_coef, standardize_advantages=args.standardize_advantages, ) elif args.algo == 'gail': import numpy as np from irl.gail import GAIL from irl.gail import Discriminator demonstrations = np.load(args.load_demo) D = Discriminator(gpu=args.gpu) agent = GAIL(demonstrations=demonstrations, discriminator=D, model=model, optimizer=opt, obs_normalizer=obs_normalizer, gpu=args.gpu, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps_vf=None, entropy_coef=args.entropy_coef, standardize_advantages=args.standardize_advantages,) elif args.algo == 'airl': import numpy as np from irl.airl import AIRL as Agent from irl.airl import Discriminator # obs_normalizer = None demonstrations = np.load(args.load_demo) D = Discriminator(gpu=args.gpu) agent = Agent(demonstrations=demonstrations, discriminator=D, model=model, optimizer=opt, obs_normalizer=obs_normalizer, gpu=args.gpu, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps_vf=None, entropy_coef=args.entropy_coef, standardize_advantages=args.standardize_advantages,) if args.load: agent.load(args.load) if args.demo: env = make_env(True) eval_stats = experiments.eval_performance( env=env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs, max_episode_len=timestep_limit) print('n_runs: {} mean: {} median: {} stdev {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) outdir = args.load if args.load else args.outdir save_agent_demo(make_env(False), agent, outdir) else: # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.alpha = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) # Linearly decay the clipping parameter to zero def clip_eps_setter(env, agent, value): agent.clip_eps = max(value, 1e-8) clip_eps_decay_hook = experiments.LinearInterpolationHook( args.steps, 0.2, 0, clip_eps_setter) experiments.train_agent_with_evaluation( agent=agent, env=make_env(False), eval_env=make_env(True), outdir=args.outdir, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, train_max_episode_len=timestep_limit, save_best_so_far_agent=False, step_hooks=[ lr_decay_hook, clip_eps_decay_hook, ], ) save_agent_demo(make_env(False), agent, args.outdir)
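# Rough sketch (illustration only; the exact chainerrl internals may differ)
# of what the LinearInterpolationHook instances above do: on every training
# step they interpolate a scalar between a start and a stop value over the
# total number of steps and pass it to a setter such as lr_setter or
# clip_eps_setter.
def linear_interpolation(step, total_steps, start_value, stop_value):
    # Fraction of training completed, clipped to [0, 1]
    frac = min(max(step / total_steps, 0.0), 1.0)
    return start_value + (stop_value - start_value) * frac

# e.g. halfway through 10 ** 6 steps the learning rate 3e-4 has decayed to:
# linear_interpolation(5 * 10 ** 5, 10 ** 6, 3e-4, 0.0) == 1.5e-4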
def main(): parser = argparse.ArgumentParser() parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4', help='Gym Env ID.') parser.add_argument('--gpu', type=int, default=0, help='GPU device ID. Set to -1 to use CPUs only.') parser.add_argument('--num-envs', type=int, default=8, help='Number of env instances run in parallel.') parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 32)') parser.add_argument('--outdir', type=str, default='results', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--steps', type=int, default=10**7, help='Total time steps for training.') parser.add_argument( '--max-frames', type=int, default=30 * 60 * 60, # 30 minutes with 60 fps help='Maximum number of frames for each episode.') parser.add_argument('--lr', type=float, default=2.5e-4, help='Learning rate.') parser.add_argument('--eval-interval', type=int, default=100000, help='Interval (in timesteps) between evaluation' ' phases.') parser.add_argument('--eval-n-runs', type=int, default=10, help='Number of episodes ran in an evaluation phase.') parser.add_argument('--demo', action='store_true', default=False, help='Run demo episodes, not training.') parser.add_argument('--load', type=str, default='', help='Directory path to load a saved agent data from' ' if it is a non-empty string.') parser.add_argument('--logging-level', type=int, default=20, help='Logging level. 10:DEBUG, 20:INFO etc.') parser.add_argument('--render', action='store_true', default=False, help='Render env states in a GUI window.') parser.add_argument('--monitor', action='store_true', default=False, help='Monitor env. Videos and additional information' ' are saved as output files.') parser.add_argument('--update-interval', type=int, default=128 * 8, help='Interval (in timesteps) between PPO iterations.') parser.add_argument('--batchsize', type=int, default=32 * 8, help='Size of minibatch (in timesteps).') parser.add_argument('--epochs', type=int, default=4, help='Number of epochs used for each PPO iteration.') parser.add_argument('--log-interval', type=int, default=10000, help='Interval (in timesteps) of printing logs.') parser.add_argument('--recurrent', action='store_true', default=False, help='Use a recurrent model. See the code for the' ' model definition.') parser.add_argument('--flicker', action='store_true', default=False, help='Use so-called flickering Atari, where each' ' screen is blacked out with probability 0.5.') parser.add_argument('--no-frame-stack', action='store_true', default=False, help='Disable frame stacking so that the agent can' ' only see the current screen.') parser.add_argument('--checkpoint-frequency', type=int, default=None, help='Frequency at which agents are stored.') args = parser.parse_args() import logging logging.basicConfig(level=args.logging_level) # Set a random seed used in ChainerRL. misc.set_random_seed(args.seed, gpus=(args.gpu, )) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. 
process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs assert process_seeds.max() < 2**32 args.outdir = experiments.prepare_output_dir(args, args.outdir) print('Output files are saved in {}'.format(args.outdir)) def make_env(idx, test): # Use different random seeds for train and test envs process_seed = int(process_seeds[idx]) env_seed = 2**32 - 1 - process_seed if test else process_seed env = atari_wrappers.wrap_deepmind( atari_wrappers.make_atari(args.env, max_frames=args.max_frames), episode_life=not test, clip_rewards=not test, flicker=args.flicker, frame_stack=not args.no_frame_stack, ) env.seed(env_seed) if args.monitor: env = chainerrl.wrappers.Monitor( env, args.outdir, mode='evaluation' if test else 'training') if args.render: env = chainerrl.wrappers.Render(env) return env def make_batch_env(test): return chainerrl.envs.MultiprocessVectorEnv([ (lambda: make_env(idx, test)) for idx, env in enumerate(range(args.num_envs)) ]) sample_env = make_env(0, test=False) print('Observation space', sample_env.observation_space) print('Action space', sample_env.action_space) n_actions = sample_env.action_space.n winit_last = chainer.initializers.LeCunNormal(1e-2) if args.recurrent: model = chainerrl.links.StatelessRecurrentSequential( L.Convolution2D(None, 32, 8, stride=4), F.relu, L.Convolution2D(None, 64, 4, stride=2), F.relu, L.Convolution2D(None, 64, 3, stride=1), F.relu, L.Linear(None, 512), F.relu, L.NStepGRU(1, 512, 512, 0), chainerrl.links.Branched( chainer.Sequential( L.Linear(None, n_actions, initialW=winit_last), chainerrl.distribution.SoftmaxDistribution, ), L.Linear(None, 1), )) else: model = chainer.Sequential( L.Convolution2D(None, 32, 8, stride=4), F.relu, L.Convolution2D(None, 64, 4, stride=2), F.relu, L.Convolution2D(None, 64, 3, stride=1), F.relu, L.Linear(None, 512), F.relu, chainerrl.links.Branched( chainer.Sequential( L.Linear(None, n_actions, initialW=winit_last), chainerrl.distribution.SoftmaxDistribution, ), L.Linear(None, 1), )) # Draw the computational graph and save it in the output directory. 
fake_obss = np.zeros(sample_env.observation_space.shape, dtype=np.float32)[None] if args.recurrent: fake_out, _ = model(fake_obss, None) else: fake_out = model(fake_obss) chainerrl.misc.draw_computational_graph([fake_out], os.path.join(args.outdir, 'model')) opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5) opt.setup(model) opt.add_hook(chainer.optimizer.GradientClipping(0.5)) def phi(x): # Feature extractor return np.asarray(x, dtype=np.float32) / 255 agent = PPO( model, opt, gpu=args.gpu, phi=phi, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps=0.1, clip_eps_vf=None, standardize_advantages=True, entropy_coef=1e-2, recurrent=args.recurrent, ) if args.load: agent.load(args.load) if args.demo: eval_stats = experiments.eval_performance( env=make_batch_env(test=True), agent=agent, n_steps=None, n_episodes=args.eval_n_runs) print('n_runs: {} mean: {} median: {} stdev: {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: step_hooks = [] # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.alpha = value step_hooks.append( experiments.LinearInterpolationHook(args.steps, args.lr, 0, lr_setter)) experiments.train_agent_batch_with_evaluation( agent=agent, env=make_batch_env(False), eval_env=make_batch_env(True), outdir=args.outdir, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, checkpoint_freq=args.checkpoint_frequency, eval_interval=args.eval_interval, log_interval=args.log_interval, save_best_so_far_agent=False, step_hooks=step_hooks, )
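# Note on make_batch_env above: the factories are built as
# `lambda: make_env(idx, test)`, and Python closures capture `idx` by
# reference, so once the comprehension has finished every factory calls
# make_env with the last index and all workers end up sharing one seed.
# A sketch of an alternative that binds the index eagerly (the pattern used
# by other snippets in this collection); the name make_batch_env_partial is
# illustrative, not from the original script.
import functools

def make_batch_env_partial(test):
    return chainerrl.envs.MultiprocessVectorEnv([
        functools.partial(make_env, idx, test)
        for idx in range(args.num_envs)
    ])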
    'wrapper_config.TimeLimit.max_episode_steps')
obs_space = env.observation_space
action_space = env.action_space

model = A3CFFSoftmax(obs_space.low.size, action_space.n)
opt = chainer.optimizers.Adam(alpha=lr, eps=1e-5)
opt.setup(model)

# Initialize the agent
agent = PPO(
    model,
    opt,
    gpu=gpu,
    phi=phi,
    update_interval=update_interval,
    minibatch_size=64,
    epochs=10,
    clip_eps_vf=None,
    entropy_coef=0.0,
)

# Linearly decay the learning rate to zero
def lr_setter(env, agent, value):
    agent.optimizer.alpha = value

lr_decay_hook = experiments.LinearInterpolationHook(steps, 3e-4, 0, lr_setter)
def main(): parser = argparse.ArgumentParser() parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4') parser.add_argument('--gpu', type=int, default=0) parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 31)') parser.add_argument('--outdir', type=str, default='results', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--steps', type=int, default=10**7) parser.add_argument( '--max-episode-len', type=int, default=5 * 60 * 60 // 4, # 5 minutes with 60/4 fps help='Maximum number of steps for each episode.') parser.add_argument('--lr', type=float, default=2.5e-4) parser.add_argument('--eval-interval', type=int, default=10**5) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--standardize-advantages', action='store_true') parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') parser.add_argument('--logging-level', type=int, default=20, help='Logging level. 10:DEBUG, 20:INFO etc.') parser.add_argument('--render', action='store_true', default=False, help='Render env states in a GUI window.') parser.add_argument('--monitor', action='store_true', default=False, help='Monitor env. Videos and additional information' ' are saved as output files.') # In the original paper, agent runs in 8 environments parallely # and samples 128 steps per environment. # Sample 128 * 8 steps, instead. parser.add_argument('--update-interval', type=int, default=128 * 8) parser.add_argument('--batchsize', type=int, default=32) parser.add_argument('--epochs', type=int, default=3) args = parser.parse_args() import logging logging.basicConfig(level=args.logging_level) # Set a random seed used in ChainerRL. misc.set_random_seed(args.seed, gpus=(args.gpu, )) # Set different random seeds for train and test envs. 
train_seed = args.seed test_seed = 2**31 - 1 - args.seed args.outdir = experiments.prepare_output_dir(args, args.outdir) print('Output files are saved in {}'.format(args.outdir)) def make_env(test): # Use different random seeds for train and test envs env_seed = test_seed if test else train_seed env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(args.env), episode_life=not test, clip_rewards=not test) env.seed(int(env_seed)) if args.monitor: env = gym.wrappers.Monitor( env, args.outdir, mode='evaluation' if test else 'training') if args.render: env = chainerrl.wrappers.Render(env) return env env = make_env(test=False) eval_env = make_env(test=True) n_actions = env.action_space.n model = A3CFF(n_actions) opt = chainer.optimizers.Adam(alpha=args.lr) opt.setup(model) opt.add_hook(chainer.optimizer.GradientClipping(40)) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) def phi(x): # Feature extractor return np.asarray(x, dtype=np.float32) / 255 agent = PPO( model, opt, gpu=args.gpu, phi=phi, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps=0.1, clip_eps_vf=None, standardize_advantages=args.standardize_advantages, ) if args.load: agent.load(args.load) if args.demo: eval_stats = experiments.eval_performance(env=eval_env, agent=agent, n_runs=args.eval_n_runs) print('n_runs: {} mean: {} median: {} stdev: {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.alpha = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) # Linearly decay the clipping parameter to zero def clip_eps_setter(env, agent, value): agent.clip_eps = max(value, 1e-8) clip_eps_decay_hook = experiments.LinearInterpolationHook( args.steps, 0.1, 0, clip_eps_setter) experiments.train_agent_with_evaluation( agent=agent, env=env, eval_env=eval_env, outdir=args.outdir, steps=args.steps, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, train_max_episode_len=args.max_episode_len, save_best_so_far_agent=False, step_hooks=[ lr_decay_hook, clip_eps_decay_hook, ], )
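# Small illustration (not from the original script) of what the phi feature
# extractor above does: Atari observations arrive as uint8 frames, and phi
# rescales them to float32 in [0, 1] before they reach the model.  The
# 4 x 84 x 84 shape assumes the usual DeepMind frame stacking; the real
# shape depends on the wrappers in use.
fake_frame = np.full((4, 84, 84), 255, dtype=np.uint8)
scaled = np.asarray(fake_frame, dtype=np.float32) / 255
assert scaled.dtype == np.float32 and scaled.max() == 1.0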
def main(): import logging parser = argparse.ArgumentParser() parser.add_argument('--gpu', type=int, default=0) parser.add_argument('--env', type=str, default='Hopper-v1') parser.add_argument('--arch', type=str, default='FFGaussian', choices=('FFSoftmax', 'FFMellowmax', 'FFGaussian')) parser.add_argument('--normalize-obs', action='store_true') parser.add_argument('--bound-mean', action='store_true') parser.add_argument('--seed', type=int, default=None) parser.add_argument('--outdir', type=str, default=None) parser.add_argument('--steps', type=int, default=10**6) parser.add_argument('--eval-interval', type=int, default=10000) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--reward-scale-factor', type=float, default=1e-2) parser.add_argument('--standardize-advantages', action='store_true') parser.add_argument('--render', action='store_true', default=False) parser.add_argument('--lr', type=float, default=3e-4) parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') parser.add_argument('--logger-level', type=int, default=logging.DEBUG) parser.add_argument('--monitor', action='store_true') parser.add_argument('--update-interval', type=int, default=2048) parser.add_argument('--batchsize', type=int, default=64) parser.add_argument('--epochs', type=int, default=10) parser.add_argument('--entropy-coef', type=float, default=0.0) args = parser.parse_args() logging.getLogger().setLevel(args.logger_level) if args.seed is not None: misc.set_random_seed(args.seed) args.outdir = experiments.prepare_output_dir(args, args.outdir) def make_env(test): env = gym.make(args.env) if args.monitor: env = gym.wrappers.Monitor(env, args.outdir) # Scale rewards observed by agents if args.reward_scale_factor and not test: misc.env_modifiers.make_reward_filtered( env, lambda x: x * args.reward_scale_factor) if args.render: misc.env_modifiers.make_rendered(env) return env sample_env = gym.make(args.env) timestep_limit = sample_env.spec.tags.get( 'wrapper_config.TimeLimit.max_episode_steps') obs_space = sample_env.observation_space action_space = sample_env.action_space # Switch policy types accordingly to action space types if args.arch == 'FFSoftmax': model = A3CFFSoftmax(obs_space.low.size, action_space.n) elif args.arch == 'FFMellowmax': model = A3CFFMellowmax(obs_space.low.size, action_space.n) elif args.arch == 'FFGaussian': model = A3CFFGaussian(obs_space.low.size, action_space, bound_mean=args.bound_mean, normalize_obs=args.normalize_obs) opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5) opt.setup(model) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) agent = PPO( model, opt, gpu=args.gpu, phi=phi, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps_vf=None, entropy_coef=args.entropy_coef, standardize_advantages=args.standardize_advantages, ) if args.load: agent.load(args.load) if args.demo: env = make_env(True) eval_stats = experiments.eval_performance( env=env, agent=agent, n_runs=args.eval_n_runs, max_episode_len=timestep_limit) print('n_runs: {} mean: {} median: {} stdev {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.alpha = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) # Linearly 
decay the clipping parameter to zero def clip_eps_setter(env, agent, value): agent.clip_eps = value clip_eps_decay_hook = experiments.LinearInterpolationHook( args.steps, 0.2, 0, clip_eps_setter) experiments.train_agent_with_evaluation( agent=agent, env=make_env(False), eval_env=make_env(True), outdir=args.outdir, steps=args.steps, eval_n_runs=args.eval_n_runs, eval_interval=args.eval_interval, max_episode_len=timestep_limit, step_hooks=[ lr_decay_hook, clip_eps_decay_hook, ], )
def main(args): import logging logging.basicConfig(level=logging.INFO, filename='log') if(type(args) is list): args=make_args(args) if not os.path.exists(args.outdir): os.makedirs(args.outdir) # Set a random seed used in ChainerRL. misc.set_random_seed(args.seed, gpus=(args.gpu,)) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs assert process_seeds.max() < 2 ** 32 def make_env(idx, test): # Use different random seeds for train and test envs process_seed = int(process_seeds[idx]) env_seed = 2 ** 32 - 1 - process_seed if test else process_seed env = atari_wrappers.wrap_deepmind( atari_wrappers.make_atari(args.env, max_frames=args.max_frames), episode_life=not test, clip_rewards=not test, flicker=args.flicker, frame_stack=not args.no_frame_stack, ) env.seed(env_seed) if args.monitor: env = chainerrl.wrappers.Monitor( env, args.outdir, mode='evaluation' if test else 'training') if args.render: env = chainerrl.wrappers.Render(env) return env def make_env_check(): # Use different random seeds for train and test envs env_seed = args.seed env = atari_wrappers.wrap_deepmind( atari_wrappers.make_atari(args.env, max_frames=args.max_frames), episode_life=True, clip_rewards=True) env.seed(int(env_seed)) return env def make_batch_env(test): return chainerrl.envs.MultiprocessVectorEnv( [(lambda: make_env(idx, test)) for idx, env in enumerate(range(args.num_envs))]) sample_env = make_env(0, test=False) print('Observation space', sample_env.observation_space) print('Action space', sample_env.action_space) n_actions = sample_env.action_space.n winit_last = chainer.initializers.LeCunNormal(1e-2) if args.recurrent: model = chainerrl.links.StatelessRecurrentSequential( L.Convolution2D(None, 32, 8, stride=4), F.relu, L.Convolution2D(None, 64, 4, stride=2), F.relu, L.Convolution2D(None, 64, 3, stride=1), F.relu, L.Linear(None, 512), F.relu, L.NStepGRU(1, 512, 512, 0), chainerrl.links.Branched( chainer.Sequential( L.Linear(None, n_actions, initialW=winit_last), chainerrl.distribution.SoftmaxDistribution, ), L.Linear(None, 1), ) ) else: model = chainer.Sequential( L.Convolution2D(None, 32, 8, stride=4), F.relu, L.Convolution2D(None, 64, 4, stride=2), F.relu, L.Convolution2D(None, 64, 3, stride=1), F.relu, L.Linear(None, 512), F.relu, chainerrl.links.Branched( chainer.Sequential( L.Linear(None, n_actions, initialW=winit_last), chainerrl.distribution.SoftmaxDistribution, ), L.Linear(None, 1), ) ) # Draw the computational graph and save it in the output directory. 
fake_obss = np.zeros( sample_env.observation_space.shape, dtype=np.float32)[None] if args.recurrent: fake_out, _ = model(fake_obss, None) else: fake_out = model(fake_obss) chainerrl.misc.draw_computational_graph( [fake_out], os.path.join(args.outdir, 'model')) opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5) opt.setup(model) opt.add_hook(chainer.optimizer.GradientClipping(0.5)) def phi(x): # Feature extractor return np.asarray(x, dtype=np.float32) / 255 agent = PPO( model, opt, gpu=args.gpu, phi=phi, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps=0.1, clip_eps_vf=None, standardize_advantages=True, entropy_coef=1e-2, recurrent=args.recurrent, ) if args.load_agent: agent.load(args.load_agent) if (args.mode=='train'): step_hooks = [] # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.alpha = value step_hooks.append( experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter)) experiments.train_agent_batch_with_evaluation( agent=agent, env=make_batch_env(False), eval_env=make_batch_env(True), outdir=args.outdir, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, step_offset=args.step_offset, checkpoint_freq=args.checkpoint_frequency, eval_interval=args.eval_interval, log_interval=args.log_interval, save_best_so_far_agent=False, step_hooks=step_hooks, log_type=args.log_type ) elif (args.mode=='check'): return tools.make_video.check(env=make_env_check(),agent=agent,save_mp4=args.save_mp4) elif (args.mode=='growth'): return tools.make_video.growth(env=make_env_check(),agent=agent,outdir=args.outdir,max_num=args.max_frames,save_mp4=args.save_mp4)
"wrapper_config.TimeLimit.max_episode_steps") obs_space = env.observation_space action_space = env.action_space model = A3CFFGaussian(obs_space.low.size, action_space, bound_mean=False, normalize_obs=False) opt = chainer.optimizers.Adam(alpha=3e-4, eps=1e-5) opt.setup(model) agent = PPO(model, opt, gpu=-1, phi=phi, update_interval=2048, minibatch_size=64, epochs=10, clip_eps_vf=None, entropy_coef=0.0, standardize_advantages=False) agent.load("parameters") ACTION_MEANINGS = { 0: 'Hip1(Torque/Velocity)', 1: 'Knee1(Torque/Velocity)', 2: 'Hip2(Torque/Velocity)', 3: 'Knee2(Torque/Velocity)', } launch_visualizer(agent, env, ACTION_MEANINGS)
def __init__(self, args, sample_env): obs_space = sample_env.observation_space action_space = sample_env.action_space # Normalize observations based on their empirical mean and variance obs_normalizer = chainerrl.links.EmpiricalNormalization( obs_space.low.size, clip_threshold=5) # Switch policy types accordingly to action space types if args.arch == 'FFSoftmax': #model = A3CFFSoftmax(obs_space.low.size, action_space.n) model = A3CFFSoftmax(obs_space.low.size, sample_env.env_prop.get_softmax_layer_size(), n_hidden_channels=600, beta=cfg.beta) elif args.arch == 'FFMellowmax': model = A3CFFMellowmax(obs_space.low.size, action_space.n) elif args.arch == 'FFGaussian': model = A3CFFGaussian(obs_space.low.size, action_space, bound_mean=args.bound_mean, n_hidden_channels=cfg.n_hidden_channels) elif args.arch == 'FFParamSoftmax': model = A3CFFParamSoftmax( obs_space.low.size, sample_env.env_prop.get_pre_output_layer_size(), sample_env.env_prop.get_parametric_segments(), sample_env.env_prop.get_parametric_softmax_segments_sizes(), n_hidden_channels=600, beta=cfg.beta) else: raise NotImplementedError opt = chainer.optimizers.Adam(alpha=args.adam_lr, eps=1e-5) opt.setup(model) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) # a workaround for saving obs_normalizer # see https://github.com/chainer/chainerrl/issues/376 if 'obs_normalizer' not in PPO.saved_attributes: PPO.saved_attributes.append('obs_normalizer') agent = PPO( model, opt, obs_normalizer=obs_normalizer, gpu=args.gpu, phi=lambda x: x.astype(np.float32, copy=False), gamma=args.ppo_gamma, lambd=args.ppo_lambda, update_interval=args.ppo_update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps_vf=None, entropy_coef=args.entropy_coef, standardize_advantages=args.standardize_advantages, ) if args.load: agent.load(args.load) self._agent = agent
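# Conceptual sketch (numpy only, not the chainerrl API) of what the
# EmpiricalNormalization link above does to observations: subtract the
# running empirical mean, divide by the running standard deviation, and
# clip the result to +/- clip_threshold (5 in the agent above).
def normalize_obs(x, mean, std, clip_threshold=5.0, eps=1e-8):
    return np.clip((x - mean) / (std + eps), -clip_threshold, clip_threshold)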
def main(): import logging parser = argparse.ArgumentParser() parser.add_argument('--gpu', type=int, default=0) parser.add_argument('--env', type=str, default='Hopper-v2') parser.add_argument('--num-envs', type=int, default=1) parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 32)') parser.add_argument('--outdir', type=str, default='results', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--steps', type=int, default=10**6) parser.add_argument('--eval-interval', type=int, default=10000) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--reward-scale-factor', type=float, default=1e-2) parser.add_argument('--standardize-advantages', action='store_true') parser.add_argument('--render', action='store_true', default=False) parser.add_argument('--lr', type=float, default=3e-4) parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') parser.add_argument('--logger-level', type=int, default=logging.DEBUG) parser.add_argument('--monitor', action='store_true') parser.add_argument('--window-size', type=int, default=100) parser.add_argument('--update-interval', type=int, default=2048) parser.add_argument('--log-interval', type=int, default=1000) parser.add_argument('--batchsize', type=int, default=64) parser.add_argument('--epochs', type=int, default=10) parser.add_argument('--entropy-coef', type=float, default=0.0) args = parser.parse_args() logging.basicConfig(level=args.logger_level) # Set a random seed used in ChainerRL misc.set_random_seed(args.seed, gpus=(args.gpu, )) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. 
process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs assert process_seeds.max() < 2**32 args.outdir = experiments.prepare_output_dir(args, args.outdir) def make_env(process_idx, test): env = gym.make(args.env) # Use different random seeds for train and test envs process_seed = int(process_seeds[process_idx]) env_seed = 2**32 - 1 - process_seed if test else process_seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = chainerrl.wrappers.CastObservationToFloat32(env) if args.monitor: env = chainerrl.wrappers.Monitor(env, args.outdir) if not test: # Scale rewards (and thus returns) to a reasonable range so that # training is easier env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor) if args.render: env = chainerrl.wrappers.Render(env) return env def make_batch_env(test): return chainerrl.envs.MultiprocessVectorEnv([ functools.partial(make_env, idx, test) for idx, env in enumerate(range(args.num_envs)) ]) # Only for getting timesteps, and obs-action spaces sample_env = gym.make(args.env) timestep_limit = sample_env.spec.tags.get( 'wrapper_config.TimeLimit.max_episode_steps') obs_space = sample_env.observation_space action_space = sample_env.action_space # Normalize observations based on their empirical mean and variance obs_normalizer = chainerrl.links.EmpiricalNormalization(obs_space.low.size, clip_threshold=5) winit_last = chainer.initializers.LeCunNormal(1e-2) # Switch policy types accordingly to action space types if isinstance(action_space, gym.spaces.Discrete): n_actions = action_space.n policy = chainer.Sequential( L.Linear(None, 64), F.tanh, L.Linear(None, 64), F.tanh, L.Linear(None, n_actions, initialW=winit_last), chainerrl.distribution.SoftmaxDistribution, ) elif isinstance(action_space, gym.spaces.Box): action_size = action_space.low.size policy = chainer.Sequential( L.Linear(None, 64), F.tanh, L.Linear(None, 64), F.tanh, L.Linear(None, action_size, initialW=winit_last), chainerrl.policies.GaussianHeadWithStateIndependentCovariance( action_size=action_size, var_type='diagonal', var_func=lambda x: F.exp(2 * x), # Parameterize log std var_param_init=0, # log std = 0 => std = 1 ), ) else: print("""\ This example only supports gym.spaces.Box or gym.spaces.Discrete action spaces.""" ) # NOQA return vf = chainer.Sequential( L.Linear(None, 64), F.tanh, L.Linear(None, 64), F.tanh, L.Linear(None, 1), ) # Combine a policy and a value function into a single model model = chainerrl.links.Branched(policy, vf) opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5) opt.setup(model) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) agent = PPO( model, opt, obs_normalizer=obs_normalizer, gpu=args.gpu, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps_vf=None, entropy_coef=args.entropy_coef, standardize_advantages=args.standardize_advantages, ) if args.load: agent.load(args.load) if args.demo: env = make_batch_env(True) eval_stats = experiments.eval_performance( env=env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs, max_episode_len=timestep_limit) print('n_runs: {} mean: {} median: {} stdev {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.alpha = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) experiments.train_agent_batch_with_evaluation( agent=agent, 
env=make_batch_env(False), eval_env=make_batch_env(True), outdir=args.outdir, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, log_interval=args.log_interval, return_window_size=args.window_size, max_episode_len=timestep_limit, save_best_so_far_agent=False, step_hooks=[ lr_decay_hook, ], )
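# Brief usage sketch (an illustration, not part of the original script, and
# assuming the model is still on the CPU): calling the Branched model built
# above on a batch of observations returns one output per branch, i.e. an
# action distribution from the policy head and a value estimate from the
# value-function head.
batch_obs = np.zeros((1, obs_space.low.size), dtype=np.float32)
action_distrib, value = model(batch_obs)
print('sampled action:', action_distrib.sample().array, 'value:', value.array)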
class SupervisorAgent(Agent): def __init__(self, layout_config, agent_params, train, finger_two, verbose=False): self.logger = logging.getLogger(__name__) self.layout_config = layout_config self.agent_params = agent_params self.train_model = train self.finger_two = finger_two self.verbose = verbose if finger_two: self.env = SupervisorEnvironment_(self.layout_config, self.agent_params, self.train_model) else: self.env = SupervisorEnvironment(self.layout_config, self.agent_params, self.train_model) optimizer_name = 'Adam' if agent_params is None else agent_params[ 'supervisor']['optimizer_name'] lr = 0.001 if agent_params is None else agent_params['supervisor'][ 'learning_rate'] n_units = 512 if agent_params is None else int( agent_params['supervisor']['n_units']) device_id = 0 if agent_params is None else int( agent_params['supervisor']['device_id']) pre_load = False if agent_params is None else bool( agent_params['supervisor']['pre_load']) self.gpu = True if agent_params is None else bool( agent_params['supervisor']['gpu']) self.save_path = path.join('data', 'models', 'supervisor') if agent_params is None \ else agent_params['supervisor']['save_path'] self.episodes = 1000000 if agent_params is None else int( agent_params['supervisor']['episodes']) self.log_interval = 1000 if agent_params is None else int( agent_params['supervisor']['log_interval']) self.log_filename = agent_params['supervisor']['log_file'] winit_last = chainer.initializers.LeCunNormal(1e-2) self.model = chainer.Sequential( L.Linear(None, n_units), F.relu, L.Linear(None, n_units), F.relu, chainerrl.links.Branched( chainer.Sequential( L.Linear(None, self.env.action_space.n, initialW=winit_last), chainerrl.distribution.SoftmaxDistribution, ), L.Linear(None, 1))) if pre_load: serializers.load_npz( path.join(self.save_path, 'best', 'model.npz'), self.model) if self.gpu: self.model.to_gpu(device_id) if optimizer_name == 'Adam': self.optimizer = chainer.optimizers.Adam(alpha=lr) elif optimizer_name == 'RMSprop': self.optimizer = chainer.optimizers.RMSprop(lr=lr) else: self.optimizer = chainer.optimizers.MomentumSGD(lr=lr) self.optimizer.setup(self.model) self.optimizer.add_hook(chainer.optimizer.GradientClipping(1.0)) phi = lambda x: x.astype(np.float32, copy=False) self.agent = PPO( self.model, self.optimizer, phi=phi, update_interval=1000, standardize_advantages=True, entropy_coef=1e-2, recurrent=False, ) if train: chainer.config.train = True if self.verbose: self.pbar = tqdm.tqdm(total=self.episodes, ascii=True, bar_format='{l_bar}{n}, {remaining}\n') else: self.pbar = tqdm.tqdm(total=self.episodes) else: chainer.config.train = False self.agent.act_deterministically = False def train(self, episodes): """ Trains the model for given number of episodes. """ progress_bar = ProgressBar(self.pbar, episodes) experiments.train_agent_with_evaluation( self.agent, self.env, steps=episodes, # Train the agent for 2000 steps eval_n_steps=None, # We evaluate for episodes, not time eval_n_episodes=10, # 10 episodes are sampled for each evaluation train_max_episode_len=100, # Maximum length of each episode eval_interval=self. log_interval, # Evaluate the agent after every 1000 steps step_hooks=[progress_bar], # add hooks logger=self.logger, outdir=self.save_path) # Save everything to 'supervisor' directory def evaluate(self, sentence, batch, n_users, **kwargs): """ Function to evaluate trained agent. :param sentence: sentence to type. :param batch: run evaluation in batch mode. :param n_users: number of users to simulate. 
""" done = False if not (sentence == "" or sentence is None): self.env.sentences = [sentence] self.env.sentences_bkp = [sentence] if batch: sentence_agg_data = [[ "sentence.id", "agent.id", "target.sentence", "wpm", "lev.distance", "gaze.shift", "bs", "immediate.bs", "delayed.bs", "gaze.keyboard.ratio", "fix.count", "finger.travel", "iki", "correct.error", "uncorrected.error", "fix.duration", "chunk.length" ]] if self.verbose: iter = tqdm.tqdm(iterable=range(n_users), ascii=True, bar_format='{l_bar}{n}, {remaining}\n') else: iter = tqdm.tqdm(range(n_users)) for i in iter: if self.finger_two: self.env = SupervisorEnvironment_(self.layout_config, self.agent_params, self.train_model) else: self.env = SupervisorEnvironment(self.layout_config, self.agent_params, self.train_model) self.env.agent_id = i # reinitialise random seed. np.random.seed(datetime.now().microsecond) random.seed(datetime.now().microsecond) while len(self.env.sentences) > 0: state = self.env.reset() done = False while not done: action = self.agent.act(state) state, reward, done, info = self.env.step(action) sentence_agg_data += self.env.sentence_test_data with open(path.join("data", "output", "SupervisorAgent_sentence_test.csv"), "w", newline="", encoding='utf-8') as f: writer = csv.writer(f) writer.writerows(sentence_agg_data) if not self.finger_two: with open(path.join("data", "output", "SupervisorAgent_Vision_Viz.csv"), "w", newline="") as f: writer = csv.writer(f) writer.writerows(self.env.eye_viz_log) with open(path.join("data", "output", "SupervisorAgent_Finger_Viz.csv"), "w", newline="") as f: writer = csv.writer(f) writer.writerows(self.env.finger_viz_log) with open(path.join("data", "output", "SupervisorAgent_Typing_Viz.csv"), "w", newline="") as f: writer = csv.writer(f) writer.writerows(self.env.typing_viz_log) else: self.env.sentence_test_data.append([ "sentence.id", "agent.id", "target.sentence", "wpm", "lev.distance", "gaze.shift", "bs", "immediate.bs", "delayed.bs", "gaze.keyboard.ratio", "fix.count", "finger.travel", "iki", "correct.error", "uncorrected.error", "fix.duration", "chunk.length" ]) state = self.env.reset() while not done: action = self.agent.act(state) state, reward, done, info = self.env.step(action) with open(path.join("data", "output", "SupervisorAgent_vision_test.csv"), "w", newline="") as f: writer = csv.writer(f) writer.writerows(self.env.eye_test_data) with open(path.join("data", "output", "SupervisorAgent_finger_test.csv"), "w", newline="") as f: writer = csv.writer(f) writer.writerows(self.env.finger_test_data) with open(path.join("data", "output", "SupervisorAgent_sentence_test.csv"), "w", newline="", encoding='utf-8') as f: writer = csv.writer(f) writer.writerows(self.env.sentence_test_data) # TODO: This is from legacy code. Need to update. visualise_agent( True, True, path.join("data", "output", "SupervisorAgent_vision_test.csv"), path.join("data", "output", "SupervisorAgent_finger_test.csv"), path.join("data", "output", "SupervisorAgent.mp4")) self.save_senetence_agg_data( path.join("data", "output", "SupervisorAgent_sentence_test.csv")) self.save_user_agg_data( path.join("data", "output", "SupervisorAgent_sentence_test.csv")) def save_senetence_agg_data(self, filename): """ generates sentence level aggregate data. :param filename: raw data file path. 
""" data = pd.read_csv(filename, sep=',', encoding='utf-8') data = data.groupby("target.sentence").agg(['mean', 'std']) data.to_csv(path.join("data", "output", "SupervisorAgent_sentence_aggregate.csv"), encoding='utf-8') def save_user_agg_data(self, filename): """ generates user level aggregate data. :param filename: raw data file path. """ data = pd.read_csv(filename, sep=',', encoding='utf-8') data = data.groupby("agent.id").agg(['mean', 'std']) data.to_csv(path.join("data", "output", "SupervisorAgent_user_aggregate.csv"), encoding='utf-8')
class rl_stock_trader():
    def __init__(self):
        run_name = 'run_test'
        self.outdir = './results/' + run_name + '/'
        self.outdir_train = self.outdir + 'train/'
        self.outdir_test = self.outdir + 'test/'
        self.training_counter = 0

        # Create the output directories if they do not already exist
        try:
            os.makedirs(self.outdir_train)
            os.makedirs(self.outdir_test)
        except Exception:
            pass

        self.writer_train = SummaryWriter(self.outdir_train)
        self.writer_test = SummaryWriter(self.outdir_test)

        self.monitor_freq = 100
        self.testing_samples = 100
        self.validation_scores = []
        self.training_scores = []

        self.settings = {
            'past_horzion': 100,
            'max_steps': 365,
            'inital_account_balance': 1e4,
            'stop_below_balance': 1e3,
            'transation_fee': .1,
            'years_training': 5,
            'years_testing': 1,
        }

        testing_end = date.today()
        testing_beginning = testing_end - relativedelta(
            years=self.settings['years_testing']) - relativedelta(
                days=self.settings['past_horzion'])
        training_end = testing_beginning - relativedelta(days=1)
        training_beginning = training_end - relativedelta(
            years=self.settings['years_training']) - relativedelta(
                days=self.settings['past_horzion'])

        self.data = {
            'train_gold': self.get_prices(gold_shanghai, 1, training_beginning, training_end),
            'train_copper': self.get_prices(copper_shanghai, 1, training_beginning, training_end),
            'train_aluminum': self.get_prices(aluminum_shanghai, 1, training_beginning, training_end),
            'test_gold': self.get_prices(gold_shanghai, 1, testing_beginning, testing_end),
            'test_copper': self.get_prices(copper_shanghai, 1, testing_beginning, testing_end),
            'test_aluminum': self.get_prices(aluminum_shanghai, 1, testing_beginning, testing_end),
            'test_soybean_oil': self.get_prices(soybean_oil, 1, testing_beginning, testing_end),
            'test_dax_futures': self.get_prices(dax_futures, 1, testing_beginning, testing_end),
            'test_corn': self.get_prices(corn, 1, testing_beginning, testing_end),
            'test_canadian_dollar': self.get_prices(canadian_dollar, 1, testing_beginning, testing_end),
        }
        # print('\n\n*************\n', self.data['test_corn'], '\n\n')

        self.env_test_gold = StockTradingEnv(
            self.get_prices(gold_shanghai, 1, testing_beginning, testing_end),
            self.settings, test=True)
        self.env_test_copper = StockTradingEnv(
            self.get_prices(copper_shanghai, 1, testing_beginning, testing_end),
            self.settings, test=True)
        self.env_test_aluminum = StockTradingEnv(
            self.get_prices(aluminum_shanghai, 1, testing_beginning, testing_end),
            self.settings, test=True)
        self.env_test_soy_bean = StockTradingEnv(
            self.get_prices(soybean_oil, 1, testing_beginning, testing_end),
            self.settings, test=True)
        self.env_test_dax = StockTradingEnv(
            self.get_prices(dax_futures, 1, testing_beginning, testing_end),
            self.settings, test=True)
        self.env_test_corn = StockTradingEnv(
            self.get_prices(corn, 1, testing_beginning, testing_end),
            self.settings, test=True)
        self.env_test_canadian_dollar = StockTradingEnv(
            self.get_prices(canadian_dollar, 1, testing_beginning, testing_end),
            self.settings, test=True)

        self.env_train = StockTradingEnv(self.data['train_gold'],
                                         self.settings, test=False)
        # self.env_test = StockTradingEnv(self.data['test_gold'], self.settings, test=True)

        self.test_envs = {
            'gold': StockTradingEnv(self.data['test_gold'], self.settings, test=True),
            'copper': StockTradingEnv(self.data['test_copper'], self.settings, test=True),
            'aluminum': StockTradingEnv(self.data['test_aluminum'], self.settings, test=True),
        }

        self.agent = self.rl_agent(self.env_train)

    def get_prices(self, index, depth, start, end):
        data_prices = quandl.get(index + str(depth),
                                 start_date=start, end_date=end)
        data_prices.index =
pd.to_datetime(data_prices.index) return data_prices def rl_agent(self, env): # self.policy = chainer.Sequential( # L.BatchNormalization(axis=0), # L.Linear(None, 256), # # F.dropout(ratio=.5), # F.tanh, # L.Linear(None, 128), # # F.dropout(ratio=.5), # F.tanh, # # L.Linear(None, env.action_space.low.size, initialW=winit_last), # L.Linear(None, env.action_space.low.size), # # F.sigmoid, # chainerrl.policies.GaussianHeadWithStateIndependentCovariance( # action_size=env.action_space.low.size, # var_type='diagonal', # var_func=lambda x: F.exp(2 * x), # Parameterize log std # # var_param_init=0, # log std = 0 => std = 1 # )) self.policy = chainer.Sequential( L.BatchNormalization(axis=0), L.Linear(None, 256), # F.dropout(ratio=.5), F.sigmoid, # F.relu, L.Linear(None, 128), # F.dropout(ratio=.5), F.sigmoid, # L.Linear(None, env.action_space.low.size, initialW=winit_last), L.Linear(None, env.action_space.low.size), F.sigmoid, chainerrl.policies.GaussianHeadWithStateIndependentCovariance( action_size=env.action_space.low.size, var_type='diagonal', var_func=lambda x: F.exp(2 * x), # Parameterize log std # var_param_init=0, # log std = 0 => std = 1 )) self.vf = chainer.Sequential( L.BatchNormalization(axis=0), L.Linear(None, 256), # F.dropout(ratio=.5), F.sigmoid, L.Linear(None, 128), # F.dropout(ratio=.5), F.sigmoid, L.Linear(None, 1), F.sigmoid, ) # self.vf = chainer.Sequential( # L.BatchNormalization(axis=0), # L.Linear(None, 256), # # F.dropout(ratio=.5), # F.tanh, # L.Linear(None, 128), # # F.dropout(ratio=.5), # F.tanh, # L.Linear(None, 1), # ) # Combine a policy and a value function into a single model self.model = chainerrl.links.Branched(self.policy, self.vf) self.opt = chainer.optimizers.Adam(alpha=3e-3, eps=1e-5) self.opt.setup(self.model) self.agent = PPO( self.model, self.opt, # obs_normalizer=obs_normalizer, gpu=-1, update_interval=64, minibatch_size=32, clip_eps_vf=None, entropy_coef=0.001, # standardize_advantages=args.standardize_advantages, ) return self.agent def monitor_training(self, tb_writer, t, i, done, action, monitor_data, counter): if t == 0 or i == 0: self.cash_dummy = [] self.equity_dummy = [] self.shares_dummy = [] self.shares_value_dummy = [] self.action_dummy = [] self.action_prob_dummy = [] self.cash_dummy.append(monitor_data['cash']) self.equity_dummy.append(monitor_data['equity']) self.shares_dummy.append(monitor_data['shares_held']) self.shares_value_dummy.append(monitor_data['value_in_shares']) self.action_dummy.append(monitor_data['action']) self.action_prob_dummy.append(monitor_data['action_prob']) # if done: # tb_writer.add_scalar('cash', np.mean(self.cash_dummy), counter) # tb_writer.add_scalar('equity', np.mean(self.equity_dummy), counter) # tb_writer.add_scalar('shares_held', np.mean(self.shares_dummy), counter) # tb_writer.add_scalar('shares_value', np.mean(self.shares_value_dummy), counter) # tb_writer.add_scalar('action', np.mean(self.action_dummy), counter) # tb_writer.add_histogram('action_prob', np.mean(self.action_prob_dummy), counter) def plot_validation_figures(self, index, name, test_data_label, benchmark): if name in ['mean', 'max', 'final']: ylimits = [.75 * np.amin(benchmark), 1.5 * np.amax(benchmark)] elif name == 'min': ylimits = [0., self.settings['inital_account_balance']] plotcolor = 'darkgreen' plt.figure(figsize=(18, 18)) plt.scatter( np.asarray(self.validation_scores)[:, 0], np.asarray(self.validation_scores)[:, index]) plt.grid() plt.ylim(ylimits[0], ylimits[1]) plt.title(name + ' equity statistics over 1 year') plt.xlabel('trained 
episodes') plt.ylabel('equity [$]') plt.savefig(self.outdir + test_data_label + '/scatter_' + name + '_equity.pdf') plt.close() area_plots = [] box_data = [] for j in range(len(np.unique(np.asarray(self.validation_scores)[:, 0]))): dummy = np.asarray(self.validation_scores)[:, index][np.where( np.asarray(self.validation_scores)[:, 0] == np.unique( np.asarray(self.validation_scores)[:, 0])[j])] box_data.append(dummy) area_plots.append([ np.percentile(dummy, 5), np.percentile(dummy, 25), np.percentile(dummy, 50), np.percentile(dummy, 75), np.percentile(dummy, 95), ]) area_plots = np.asarray(area_plots) p05 = area_plots[:, 0] p25 = area_plots[:, 1] p50 = area_plots[:, 2] p75 = area_plots[:, 3] p95 = area_plots[:, 4] plt.figure(figsize=(18, 18)) plt.fill_between(np.arange(area_plots.shape[0]), p05, p95, facecolor=plotcolor, alpha=.3) plt.fill_between(np.arange(area_plots.shape[0]), p25, p75, facecolor=plotcolor, alpha=.8) plt.plot(p50, linewidth=3, color='lightblue') plt.ylim(ylimits[0], ylimits[1]) plt.grid() plt.title(name + ' equity statistics over 1 year') plt.xlabel('trained episodes') plt.ylabel('equity [$]') plt.savefig(self.outdir + test_data_label + '/area_' + name + '_equity.pdf') plt.close() plt.figure(figsize=(18, 18)) plt.boxplot( box_data, notch=True, labels=None, boxprops=dict(color=plotcolor, linewidth=2), capprops=dict(color=plotcolor), whiskerprops=dict(color=plotcolor), flierprops=dict(color=plotcolor, markeredgecolor=plotcolor, markerfacecolor=plotcolor), medianprops=dict(color='lightblue', linewidth=2), ) plt.ylim(ylimits[0], ylimits[1]) plt.grid() plt.title('equity statistics over 1 year') plt.xlabel('trained episodes') plt.ylabel('equity [$]') plt.savefig(self.outdir + test_data_label + '/box_' + name + '_equity.pdf') plt.close() def validate(self, episode, counter, test_data_label): try: os.mkdir(self.outdir + test_data_label + '/') except Exception: pass test_equity = [] test_trades_buy = [] test_trades_sell = [] test_data = self.data['test_' + test_data_label] try: benchmark = test_data['Close'].values[self. settings['past_horzion']:] except KeyError: benchmark = test_data['Settle'].values[self. 
settings['past_horzion']:] benchmark /= benchmark[0] benchmark *= self.settings['inital_account_balance'] plt.figure(figsize=(18, 18)) for i in range(0, self.testing_samples): if test_data_label == 'gold': obs = self.env_test_gold.reset() if test_data_label == 'copper': obs = self.env_test_copper.reset() if test_data_label == 'aluminum': obs = self.env_test_aluminum.reset() if test_data_label == 'soybean_oil': obs = self.env_test_soy_bean.reset() if test_data_label == 'dax_futures': obs = self.env_test_dax.reset() if test_data_label == 'corn': obs = self.env_test_corn.reset() if test_data_label == 'corn': obs = self.env_test_corn.reset() if test_data_label == 'canadian_dollar': obs = self.env_test_canadian_dollar.reset() # obs = self.env_test.reset() reward = 0 done = False R = 0 t = 0 while not done: action = self.agent.act(obs) if test_data_label == 'gold': obs, reward, done, _, monitor_data = self.env_test_gold.step( action) if test_data_label == 'copper': obs, reward, done, _, monitor_data = self.env_test_copper.step( action) if test_data_label == 'aluminum': obs, reward, done, _, monitor_data = self.env_test_aluminum.step( action) if test_data_label == 'soybean_oil': obs, reward, done, _, monitor_data = self.env_test_soy_bean.step( action) if test_data_label == 'dax_futures': obs, reward, done, _, monitor_data = self.env_test_dax.step( action) if test_data_label == 'corn': obs, reward, done, _, monitor_data = self.env_test_corn.step( action) if test_data_label == 'canadian_dollar': obs, reward, done, _, monitor_data = self.env_test_canadian_dollar.step( action) # obs, reward, done, _, monitor_data = self.env_test.step(action) test_equity.append(monitor_data['equity']) action_choice = np.argmax(softmax(action)) action_confidence = softmax(action)[action_choice] if action_confidence > .8: if action_choice == 0: test_trades_buy.append([t, monitor_data['equity']]) if action_choice == 2: test_trades_sell.append([t, monitor_data['equity']]) self.monitor_training(self.writer_test, t, i, done, action, monitor_data, counter) R += reward t += 1 if done: test_equity = test_equity[:-1] plt.plot(test_equity[:-1], linewidth=1) # try: # plt.scatter(np.asarray(test_trades_buy)[:,0], np.asarray(test_trades_buy)[:,1], marker='X', c='green', s=5) # plt.scatter(np.asarray(test_trades_sell)[:,0], np.asarray(test_trades_sell)[:,1], marker='X', c='red', s=5) # except IndexError: # pass self.validation_scores.append([ counter, np.mean(test_equity), np.amin(test_equity), np.amax(test_equity), test_equity[-1] ]) test_equity = [] self.agent.stop_episode() time_axis = test_data.index[self.settings['past_horzion']:].date time_axis_short = time_axis[::10] plt.plot(benchmark, linewidth=3, color='k', label='close') plt.ylim(.75 * np.amin(benchmark), 1.5 * np.amax(benchmark)) plt.xticks(np.linspace(0, len(time_axis), len(time_axis_short) - 1), time_axis_short, rotation=90) plt.grid() plt.title(test_data_label + ' validation runs at episode ' + str(episode)) plt.xlabel('episode') plt.ylabel('equity [$]') plt.legend() plt.savefig(self.outdir + test_data_label + '/validation_E' + str(episode) + '.pdf') plt.close() self.plot_validation_figures(1, 'mean', test_data_label, benchmark) self.plot_validation_figures(2, 'min', test_data_label, benchmark) self.plot_validation_figures(3, 'max', test_data_label, benchmark) self.plot_validation_figures(4, 'final', test_data_label, benchmark) def train(self): print('\nstart training loop\n') def check_types(input, inputname): if np.isnan(input).any(): print('----> ', inputname, ' 
array contains NaN\n', np.isnan(input).shape, '\n') if np.isinf(input).any(): print('----> ', inputname, ' array contains inf\n', np.isinf(input).shape, '\n') n_episodes = int(1e5) log_data = [] action_log = [] debug_printing = False for i in range(0, n_episodes + 1): obs = self.env_train.reset() reward = 0 done = False R = 0 # return (sum of rewards) t = 0 # time step while not done: # self.env.render() action = self.agent.act_and_train(obs, reward) obs, reward, done, _, monitor_data = self.env_train.step( action) self.monitor_training(self.writer_train, t, i, done, action, monitor_data, self.training_counter) R += reward t += 1 if t % 10 == 0 and not done: log_data.append({ 'equity': int(monitor_data['equity']), 'shares_held': int(monitor_data['shares_held']), 'shares_value': int(monitor_data['value_in_shares']), 'cash': int(monitor_data['cash']), 't': int(t), }) action_log.append([ self.training_counter, action[0], action[1], action[2] ]) if done: if i % 10 == 0: print('\nrollout ' + str(i) + '\n', pd.DataFrame(log_data).max()) log_data = [] self.training_scores.append([i, R]) self.training_counter += 1 self.agent.stop_episode() if i % self.monitor_freq == 0: # self.agent.stop_episode_and_train(obs, reward, done) # print('\n\nvalidation...') self.validate(i, self.training_counter, 'gold') if debug_printing: print('\n\n****************\nSOY BEANS\n\n') self.validate(i, self.training_counter, 'soybean_oil') if debug_printing: print('\n\n****************\nCORN\n\n') self.validate(i, self.training_counter, 'corn') # if debug_printing: print('\n\n****************\nCANADIAN DOLLAR\n\n') # self.validate(i, self.training_counter, 'canadian_dollar') if debug_printing: print('\n****************\n') act_probs = softmax(np.asarray(action_log)[:, 1:], axis=1) plt.figure() plt.scatter(np.asarray(self.training_scores)[:, 0], np.asarray(self.training_scores)[:, 1], s=2, label='reward') plt.legend() plt.title('reward') plt.grid() plt.savefig(self.outdir + 'reward.pdf') plt.close() plt.figure() plt.scatter(np.asarray(action_log)[:, 0], act_probs[:, 0], label='action0') plt.scatter(np.asarray(action_log)[:, 0], act_probs[:, 1], label='action1') plt.scatter(np.asarray(action_log)[:, 0], act_probs[:, 2], label='action2') plt.legend() plt.title('actions') plt.grid() plt.savefig(self.outdir + 'actions.pdf') plt.close() plt.figure() plt.plot(np.asarray(action_log)[:, 0], act_probs[:, 0], label='action0') plt.plot(np.asarray(action_log)[:, 0], act_probs[:, 1], label='action1') plt.plot(np.asarray(action_log)[:, 0], act_probs[:, 2], label='action2') plt.legend() plt.title('actions') plt.grid() plt.savefig(self.outdir + 'actions_plot.pdf') plt.close() if i % 10 == 0 and i > 0: self.agent.save(self.outdir) serializers.save_npz(self.outdir + 'model.npz', self.model) # if i % 1000 == 0: # print('\nepisode:', i, ' | episode length: ', t, '\nreward:', R, # '\nstatistics:', self.agent.get_statistics(), '\n') self.agent.stop_episode_and_train(obs, reward, done) print('Finished.')
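# The validation and training loops above turn the agent's raw three-component
# action vector into a discrete trade decision: softmax the vector, take the
# argmax, and only act when that probability exceeds 0.8 (index 0 -> buy,
# index 2 -> sell).  A minimal, self-contained sketch of that decoding step;
# treating index 1 as "hold" and the helper name are illustrative assumptions,
# and softmax is scipy.special.softmax as used in the code above.
import numpy as np
from scipy.special import softmax


def decode_action(action, threshold=0.8):
    """Map a raw 3-dim policy output to 'buy', 'hold', 'sell', or None (no trade)."""
    probs = softmax(np.asarray(action, dtype=np.float64))
    choice = int(np.argmax(probs))
    if probs[choice] < threshold:
        return None  # not confident enough -> do not record a trade
    return ('buy', 'hold', 'sell')[choice]


# example: a clearly confident "buy"
print(decode_action(np.array([2.0, 0.1, -1.0])))  # -> 'buy'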
def main(): import logging parser = argparse.ArgumentParser() parser.add_argument('--gpu', type=int, default=-1) parser.add_argument('--env', type=str, default='Hopper-v2') parser.add_argument('--num-envs', type=int, default=1) parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 32)') parser.add_argument('--outdir', type=str, default='results', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--steps', type=int, default=10**6) parser.add_argument('--eval-interval', type=int, default=10000) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--reward-scale-factor', type=float, default=1e-2) parser.add_argument('--standardize-advantages', action='store_true') parser.add_argument('--render', action='store_true', default=False) parser.add_argument('--lr', type=float, default=3e-4) parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') parser.add_argument('--logger-level', type=int, default=logging.DEBUG) parser.add_argument('--monitor', action='store_true') parser.add_argument('--window-size', type=int, default=100) parser.add_argument('--update-interval', type=int, default=2048) parser.add_argument('--log-interval', type=int, default=1000) parser.add_argument('--batchsize', type=int, default=64) parser.add_argument('--epochs', type=int, default=10) parser.add_argument('--entropy-coef', type=float, default=0.0) args = parser.parse_args() logging.basicConfig(level=args.logger_level) # Set a random seed used in ChainerRL misc.set_random_seed(args.seed, gpus=(args.gpu, )) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. 
process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs assert process_seeds.max() < 2**32 args.outdir = experiments.prepare_output_dir(args, args.outdir) # def make_env(process_idx, test): # env = gym.make(args.env) # # Use different random seeds for train and test envs # process_seed = int(process_seeds[process_idx]) # env_seed = 2 ** 32 - 1 - process_seed if test else process_seed # env.seed(env_seed) # # Cast observations to float32 because our model uses float32 # env = chainerrl.wrappers.CastObservationToFloat32(env) # if args.monitor: # env = chainerrl.wrappers.Monitor(env, args.outdir) # if not test: # # Scale rewards (and thus returns) to a reasonable range so that # # training is easier # env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor) # if args.render: # env = chainerrl.wrappers.Render(env) # return env def make_env(test): env = gym.make( "DaktyPushingSimulationEnv-v0", level=5, simulation_backend="mujoco", control_frequency_in_hertz=100, state_space_components_to_be_used=None, alternate_env_object=None, discretization_factor_torque_control_space=None, model_as_function_for_pixel_to_latent_space_parsing=(None, None)) # print('\n############\n', env, '\n############\n') env.unwrapped.finger.set_resolution_quality('low') # print('\n############\n', env, '\n############\n') env = gym.wrappers.TimeLimit(env) # print('\n############\n', env, '\n############\n') # Unwrap TimeLimit wrapper assert isinstance(env, gym.wrappers.TimeLimit) env = env.env # Use different random seeds for train and test envs # env_seed = 2 ** 32 - 1 - args.seed if test else args.seed # env.seed(env_seed) process_seed = 420 env_seed = 2**32 - 1 - process_seed if test else process_seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = chainerrl.wrappers.CastObservationToFloat32(env) if args.monitor: env = chainerrl.wrappers.Monitor(env, args.outdir) if args.render and not test: env = chainerrl.wrappers.Render(env) return env def make_batch_env(test): return chainerrl.envs.MultiprocessVectorEnv([ functools.partial(make_env, idx, test) for idx, env in enumerate(range(args.num_envs)) ]) # Only for getting timesteps, and obs-action spaces sample_env = make_env(0) timestep_limit = sample_env.spec.tags.get( 'wrapper_config.TimeLimit.max_episode_steps') obs_space = sample_env.observation_space action_space = sample_env.action_space print('\n\n------------------- obs_space: ', obs_space.shape, '\n\n\n') # Normalize observations based on their empirical mean and variance obs_normalizer = chainerrl.links.EmpiricalNormalization(obs_space.low.size, clip_threshold=5) winit_last = chainer.initializers.LeCunNormal(1e-2) action_size = action_space.low.size policy = chainer.Sequential( L.Linear(None, 64), F.tanh, L.Linear(None, 64), F.tanh, L.Linear(None, action_size, initialW=winit_last), chainerrl.policies.GaussianHeadWithStateIndependentCovariance( action_size=action_size, var_type='diagonal', var_func=lambda x: F.exp(2 * x), # Parameterize log std var_param_init=0, # log std = 0 => std = 1 )) vf = chainer.Sequential( concat_obs_and_action, L.Linear(None, 64), F.tanh, L.Linear(None, 64), F.tanh, L.Linear(None, 1), ) # Combine a policy and a value function into a single model model = chainerrl.links.Branched(policy, vf) opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5) opt.setup(model) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) agent = PPO( model, opt, obs_normalizer=obs_normalizer, gpu=args.gpu, 
update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps_vf=None, entropy_coef=args.entropy_coef, standardize_advantages=args.standardize_advantages, ) if args.load: agent.load(args.load) if args.demo: env = make_env(True) eval_stats = experiments.eval_performance( env=env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs, max_episode_len=timestep_limit) print('n_runs: {} mean: {} median: {} stdev {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: env = make_env(False) n_episodes = 10000 # pbar = tqdm(total=n_episodes) max_episode_len = 1000 for i in range(1, n_episodes + 1): # pbar.update(1) obs = env.reset() # print('obs inital..............', obs.shape) reward = 0 done = False R = 0 # return (sum of rewards) t = 0 # time step # pbar = tqdm(total=max_episode_len) while not done and t < max_episode_len: # pbar.update(1) # Uncomment to watch the behaviour # env.render() action = agent.act_and_train(obs, reward) # print('action..................', action) obs, reward, done, _ = env.step(action) # print('obs.....................', obs) # print('reward..................', reward) R += reward t += 1 if i % 10 == 0: print('episode:', i, 'R:', R, 'statistics:', agent.get_statistics()) agent.stop_episode_and_train(obs, reward, done) print('Finished.') # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.alpha = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) experiments.train_agent_batch_with_evaluation( agent=agent, env=make_env(False), eval_env=make_env(True), outdir=args.outdir, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, log_interval=args.log_interval, return_window_size=args.window_size, max_episode_len=timestep_limit, save_best_so_far_agent=False, step_hooks=[ lr_decay_hook, ], )
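# The make_batch_env above hands functools.partial(make_env, idx, test) to
# MultiprocessVectorEnv even though this script's make_env only accepts `test`
# (and sample_env is built with make_env(0)).  Below is a hedged sketch of a
# consistent pairing, using the per-process seeding pattern the other scripts in
# this file use; 'Hopper-v2', num_envs=4, and the underscore-prefixed helper
# names are placeholders, and the wrapping is reduced to the float32 cast.
import functools

import gym
import numpy as np
import chainerrl


def _make_env(process_idx, test, process_seeds, env_id='Hopper-v2'):
    env = gym.make(env_id)
    process_seed = int(process_seeds[process_idx])
    env.seed(2 ** 32 - 1 - process_seed if test else process_seed)
    return chainerrl.wrappers.CastObservationToFloat32(env)


def _make_batch_env(test, num_envs=4, seed=0):
    process_seeds = np.arange(num_envs) + seed * num_envs
    # functools.partial binds process_idx immediately, so every subprocess
    # receives its own index (and therefore its own seed)
    return chainerrl.envs.MultiprocessVectorEnv([
        functools.partial(_make_env, idx, test, process_seeds)
        for idx in range(num_envs)
    ])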
class rl_stock_trader(): def __init__(self, path_to_symbol_csv, request_symbols=8, tb_outdir=tb_outdir): self.writer = SummaryWriter(tb_outdir) self.request_symbols = request_symbols self.monitor_freq = 100 self.start_budget = 10000. index_df = pd.read_csv(path_to_symbol_csv) # symbol_vec = list(index_df.values[:self.request_symbols,0]) symbol_vec = list( index_df.values[np.random.randint(0, index_df.values. shape[0], self.request_symbols), 0]) self.dataframe, self.num_symbols = self.get_data(symbol_vec) # env = DummyVecEnv([lambda: StockTradingEnv(dataframe)]) self.env = StockTradingEnv(self.dataframe, self.num_symbols) self.tb_action_type = np.zeros(3) self.tb_action_symbol = np.zeros(self.num_symbols) self.tb_action_vec = [] self.tb_action_amount = [] self.tb_balance = np.zeros(4) self.tb_net_worth = np.zeros(4) self.balance_dummy = [] self.net_worth_dummy = [] self.tb_reward = 0. self.tb_cache_reward_vec = [] self.tb_cache_rollout_vec = [] self.tb_cache_final_net = [] self.tb_cache_final_balance = [] self.tb_chache_balance = np.zeros(4) self.tb_chache_net_worth = np.zeros(4) def get_data(self, symbols, start=None, end=None, period='5y', interval='1d'): ''' valid periods: 1d,5d,1mo,3mo,6mo,1y,2y,5y,10y,ytd,max fetch data by interval (including intraday if period < 60 days) valid intervals: 1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo group by ticker (to access via data['SPY']) (optional, default is 'column') adjust all OHLC automatically download pre/post regular market hours data use threads for mass downloading? (True/False/Integer) proxy URL scheme use use when downloading? ''' df_keys = ['Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume'] if start == None or end == None: print('\nload S&P 500 data for period: ', period, ' and interval: ', interval, '\n') data_array = yf.download(tickers=symbols, period=period, interval=interval, group_by='column', auto_adjust=True, prepost=False, threads=True, proxy=None) else: print('\nload S&P 500 data since: ', start, '/ end: ', end, ' and interval: ', interval, '\n') data_array = yf.download(tickers=symbols, start=start, end=end, interval=interval, group_by='column', auto_adjust=True, prepost=False, threads=True, proxy=None) called_symbols = list(data_array['Volume'].keys()) try: failed_symbols = list(data_array['Adj Close'].keys()) except KeyError: failed_symbols = [] pass loaded_symbols = [] for i in range(len(called_symbols)): if called_symbols[i] not in failed_symbols: loaded_symbols.append(called_symbols[i]) for i in range(len(failed_symbols)): for j in range(len(df_keys)): data_array = data_array.drop( columns=[(str(df_keys[j]), str(failed_symbols[i]))]) data_array.insert(0, 'i', np.arange(data_array.shape[0])) data_index_axis = data_array.index.values data_array = data_array.drop( index=[data_index_axis[0], data_index_axis[-1]]) dfkeys = ['Open', 'Close', 'High', 'Low', 'Volume'] for dfkey in range(len(dfkeys)): data_array[dfkeys[dfkey]].fillna(method='pad') data_array[dfkeys[dfkey]].fillna(0.) data_array[dfkeys[dfkey]].replace(to_replace=np.nan, value=0.) data_array[dfkeys[dfkey]].replace(to_replace='NaN', value=0.) 
print( '\n------------------------------------\ \nsuccesfully loaded stock data\nnumber of loaded data points: ' , data_array.shape[0], \ '\nnumber of loaded symbols: ', len(loaded_symbols), '/', len(called_symbols), \ '\n------------------------------------\n\n', \ '\ndataframe:\n', data_array, \ '\n------------------------------------\n\n') return data_array, len(loaded_symbols) def monitor_training(self, tb_writer, t, i, done, action, monitor_data): ''' after each episode save: action_type [3 x 1] v action_amount [1 x 1] (avg /t) v action_symbol [num_symbols x 1] v balance [4x1] (low, avg, high, final) v net_worth [4x1] (low, avg, high, final) v ''' if t == 0: self.balance_dummy = [] self.net_worth_dummy = [] self.tb_reward = 0. if i == 0: self.tb_balance = np.zeros(4) self.tb_net_worth = np.zeros(4) self.tb_action_amount = [] self.tb_action_symbol_vec = [] self.tb_action_vec = [] self.tb_cache_reward_vec = [] self.tb_cache_rollout_vec = [] self.tb_cache_final_net = np.zeros(4) self.tb_cache_final_balance = np.zeros(4) self.tb_action_symbol_vec.append(monitor_data['action_sym']) self.tb_action_amount.append(monitor_data['action_amount']) self.tb_action_vec.append(monitor_data['action_type']) self.tb_reward += monitor_data['reward'] self.balance_dummy.append(monitor_data['balance']) self.net_worth_dummy.append(monitor_data['net_worth']) if done: self.tb_cache_reward_vec.append(self.tb_reward) self.tb_balance[0] = np.amin(self.balance_dummy) self.tb_balance[1] = np.mean(self.balance_dummy) self.tb_balance[2] = np.amax(self.balance_dummy) self.tb_balance[3] = self.balance_dummy[-1] self.tb_net_worth[0] = np.amin(self.net_worth_dummy) self.tb_net_worth[1] = np.mean(self.net_worth_dummy) self.tb_net_worth[2] = np.amax(self.net_worth_dummy) self.tb_net_worth[3] = self.net_worth_dummy[-1] self.tb_cache_rollout_vec.append(t) if np.ndim(self.tb_cache_final_balance) == 1: self.tb_cache_final_balance = np.reshape( self.tb_balance, [1, -1]) self.tb_cache_final_net = np.reshape(self.tb_net_worth, [1, -1]) else: self.tb_cache_final_balance = np.concatenate( (self.tb_cache_final_balance, np.reshape(self.tb_balance, [1, -1])), axis=0) self.tb_cache_final_net = np.concatenate( (self.tb_cache_final_net, np.reshape(self.tb_net_worth, [1, -1])), axis=0) if i % self.monitor_freq == 0 and i != 0: tb_writer.add_scalar('training/reward', np.mean(self.tb_cache_reward_vec), i) tb_writer.add_scalar('training/rollout', np.mean(self.tb_cache_rollout_vec), i) tb_writer.add_scalar( 'balance/low', np.mean(self.tb_cache_final_balance[:, 0]), i) tb_writer.add_scalar( 'balance/avg', np.mean(self.tb_cache_final_balance[:, 1]), i) tb_writer.add_scalar( 'balance/high', np.mean(self.tb_cache_final_balance[:, 2]), i) tb_writer.add_scalar( 'balance/final', np.mean(self.tb_cache_final_balance[:, 3]), i) tb_writer.add_scalar('net_worth/low', np.mean(self.tb_cache_final_net[:, 0]), i) tb_writer.add_scalar('net_worth/avg', np.mean(self.tb_cache_final_net[:, 1]), i) tb_writer.add_scalar('net_worth/high', np.mean(self.tb_cache_final_net[:, 2]), i) tb_writer.add_scalar('net_worth/final', np.mean(self.tb_cache_final_net[:, 3]), i) tb_writer.add_scalar( 'net_worth/profit', np.mean(self.tb_cache_final_net[:, 3] - self.start_budget), i) tb_writer.add_histogram('training_stats/reward', np.asarray(self.tb_cache_reward_vec), i) tb_writer.add_histogram('training_stats/rollout', np.asarray(self.tb_cache_rollout_vec), i) tb_writer.add_histogram( 'performance_stats/final_balance', np.asarray(self.tb_cache_final_balance[:, -1]), i) 
tb_writer.add_histogram( 'performance_stats/final_net_worth', np.asarray(self.tb_cache_final_net[:, -1]), i) tb_writer.add_histogram( 'performance_stats/profit', np.asarray(self.tb_cache_final_net[:, -1] - self.start_budget), i) tb_writer.add_histogram('action/type', np.asarray(self.tb_action_vec), i) tb_writer.add_histogram('action/symbol', np.asarray(self.tb_action_symbol_vec), i) tb_writer.add_histogram('action/action_amount', np.asarray(self.tb_action_amount), i) self.tb_cache_reward_vec = [] self.tb_cache_rollout_vec = [] self.tb_cache_final_net = np.zeros(4) self.tb_cache_final_balance = np.zeros(4) self.tb_action_vec = [] self.tb_action_symbol_vec = [] self.tb_action_amount = [] self.tb_balance = np.zeros(4) self.tb_net_worth = np.zeros(4) def rl_agent(self, env): self.policy = chainer.Sequential( L.Linear(None, 256), F.tanh, L.Linear(None, 128), F.tanh, # L.Linear(None, env.action_space.low.size, initialW=winit_last), L.Linear(None, env.action_space.low.size), # F.sigmoid, chainerrl.policies.GaussianHeadWithStateIndependentCovariance( action_size=env.action_space.low.size, var_type='diagonal', var_func=lambda x: F.exp(2 * x), # Parameterize log std # var_param_init=0, # log std = 0 => std = 1 )) self.vf = chainer.Sequential( L.Linear(None, 256), F.tanh, L.Linear(None, 128), F.tanh, L.Linear(None, 1), ) # Combine a policy and a value function into a single model self.model = chainerrl.links.Branched(self.policy, self.vf) self.opt = chainer.optimizers.Adam(alpha=3e-4, eps=1e-5) self.opt.setup(self.model) self.agent = PPO( self.model, self.opt, # obs_normalizer=obs_normalizer, gpu=-1, update_interval=512, minibatch_size=8, clip_eps_vf=None, entropy_coef=0.001, # standardize_advantages=args.standardize_advantages, ) return self.agent def train(self): print('\nstart training loop\n') def check_types(input, inputname): if np.isnan(input).any(): print('----> ', inputname, ' array contains NaN\n', np.isnan(input).shape, '\n') if np.isinf(input).any(): print('----> ', inputname, ' array contains inf\n', np.isinf(input).shape, '\n') self.agent = self.rl_agent(self.env) n_episodes = 1000000 max_episode_len = 1000 for i in range(0, n_episodes + 1): obs = self.env.reset() reward = 0 done = False R = 0 # return (sum of rewards) t = 0 # time step while not done and t < max_episode_len: # Uncomment to watch the behaviour # self.env.render() action = self.agent.act_and_train(obs, reward) check_types(action, 'action') obs, reward, done, _, monitor_data = self.env.step(action) check_types(obs, 'obs') check_types(reward, 'reward') self.monitor_training(self.writer, t, i, done, action, monitor_data) R += reward t += 1 if done: print(' training at episode ' + str(i), end='\r') if i % 100 == 0 and i > 0: self.agent.save(model_outdir) serializers.save_npz(model_outdir + 'model.npz', self.model) # if i % 1000 == 0: # print('\nepisode:', i, ' | episode length: ', t, '\nreward:', R, # '\nstatistics:', self.agent.get_statistics(), '\n') self.agent.stop_episode_and_train(obs, reward, done) print('Finished.')
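# In get_data above, the per-column calls data_array[key].fillna(...) and
# .replace(...) return new Series that are never assigned back, so that NaN
# cleanup has no effect on the dataframe the method returns.  A minimal sketch
# of a cleanup that actually sticks: fill and reassign (or pass inplace=True
# per column).  The toy frame below only illustrates the pattern.
import numpy as np
import pandas as pd

toy = pd.DataFrame({'Close': [1.0, np.nan, 3.0], 'Volume': [np.nan, 10.0, np.nan]})
toy = toy.fillna(method='pad').fillna(0.)  # forward-fill, then zero-fill leading gaps
print(toy)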
def main(): import logging parser = argparse.ArgumentParser() parser.add_argument('--gpu', type=int, default=0) parser.add_argument('--env', type=str, default='Hopper-v2') parser.add_argument('--num-envs', type=int, default=1) parser.add_argument('--arch', type=str, default='FFGaussian', choices=('FFSoftmax', 'FFMellowmax', 'FFGaussian')) parser.add_argument('--bound-mean', action='store_true') parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 32)') parser.add_argument('--outdir', type=str, default='results', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--steps', type=int, default=10**6) parser.add_argument('--eval-interval', type=int, default=10000) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--reward-scale-factor', type=float, default=1e-2) parser.add_argument('--standardize-advantages', action='store_true') parser.add_argument('--render', action='store_true', default=False) parser.add_argument('--lr', type=float, default=3e-4) parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--load', type=str, default='') parser.add_argument('--logger-level', type=int, default=logging.DEBUG) parser.add_argument('--monitor', action='store_true') parser.add_argument('--window-size', type=int, default=100) parser.add_argument('--update-interval', type=int, default=2048) parser.add_argument('--log-interval', type=int, default=1000) parser.add_argument('--batchsize', type=int, default=64) parser.add_argument('--epochs', type=int, default=10) parser.add_argument('--entropy-coef', type=float, default=0.0) args = parser.parse_args() #logging.basicConfig(level=args.logger_level) # Set a random seed used in ChainerRL misc.set_random_seed(args.seed, gpus=(args.gpu, )) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. 
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        # Bind idx as a default argument so each factory keeps its own
        # process index instead of closing over the loop variable
        return chainerrl.envs.MultiprocessVectorEnv([
            (lambda idx=idx: make_env(idx, test))
            for idx in range(args.num_envs)
        ])

    # Only for getting timesteps, and obs-action spaces
    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    # Normalize observations based on their empirical mean and variance
    obs_normalizer = chainerrl.links.EmpiricalNormalization(
        obs_space.low.size, clip_threshold=5)

    # Switch policy types accordingly to action space types
    if args.arch == 'FFSoftmax':
        model = A3CFFSoftmax(obs_space.low.size, action_space.n)
    elif args.arch == 'FFMellowmax':
        model = A3CFFMellowmax(obs_space.low.size, action_space.n)
    elif args.arch == 'FFGaussian':
        model = A3CFFGaussian(obs_space.low.size,
                              action_space,
                              bound_mean=args.bound_mean)

    opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5)
    opt.setup(model)
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    agent = PPO(
        model,
        opt,
        obs_normalizer=obs_normalizer,
        gpu=args.gpu,
        update_interval=args.update_interval,
        minibatch_size=args.batchsize,
        epochs=args.epochs,
        clip_eps_vf=None,
        entropy_coef=args.entropy_coef,
        standardize_advantages=args.standardize_advantages,
    )

    if args.load:
        agent.load(args.load)

    # Linearly decay the learning rate to zero
    def lr_setter(env, agent, value):
        agent.optimizer.alpha = value

    lr_decay_hook = experiments.LinearInterpolationHook(
        args.steps, args.lr, 0, lr_setter)

    # Linearly decay the clipping parameter to zero
    def clip_eps_setter(env, agent, value):
        agent.clip_eps = value

    clip_eps_decay_hook = experiments.LinearInterpolationHook(
        args.steps, 0.2, 0, clip_eps_setter)

    experiments.train_agent_batch_with_evaluation(
        agent=agent,
        env=make_batch_env(False),
        eval_env=make_batch_env(True),
        outdir=args.outdir,
        steps=args.steps,
        eval_n_runs=args.eval_n_runs,
        eval_interval=args.eval_interval,
        log_interval=args.log_interval,
        return_window_size=args.window_size,
        max_episode_len=timestep_limit,
        save_best_so_far_agent=False,
        step_hooks=[
            lr_decay_hook,
            clip_eps_decay_hook,
        ],
    )
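# Both step hooks above use chainerrl.experiments.LinearInterpolationHook, which
# calls the supplied setter at every environment step with a value interpolated
# linearly between the start and stop values over the given number of steps.
# A small self-contained sketch of that schedule, only to make the decay
# explicit; the helper name is illustrative, not part of ChainerRL.
def linear_interpolation(step, total_steps, start_value, stop_value):
    """Value a linear-interpolation hook would pass to its setter at `step`."""
    fraction = min(step / total_steps, 1.0)
    return start_value + fraction * (stop_value - start_value)


# e.g. a learning rate of 3e-4 decayed to zero over 10**6 steps
print(linear_interpolation(5 * 10 ** 5, 10 ** 6, 3e-4, 0.0))  # -> 1.5e-04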
def main(): import logging parser = argparse.ArgumentParser() parser.add_argument('--gpu', type=int, default=0, help='GPU to use, set to -1 if no GPU.') parser.add_argument('--env', type=str, default='Hopper-v2', help='OpenAI Gym MuJoCo env to perform algorithm on.') parser.add_argument('--num-envs', type=int, default=1, help='Number of envs run in parallel.') parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 32)') parser.add_argument('--outdir', type=str, default='results', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--steps', type=int, default=2 * 10 ** 6, help='Total number of timesteps to train the agent.') parser.add_argument('--eval-interval', type=int, default=100000, help='Interval in timesteps between evaluations.') parser.add_argument('--eval-n-runs', type=int, default=100, help='Number of episodes run for each evaluation.') parser.add_argument('--render', action='store_true', help='Render env states in a GUI window.') parser.add_argument('--demo', action='store_true', help='Just run evaluation, not training.') parser.add_argument('--load', type=str, default='', help='Directory to load agent from.') parser.add_argument('--logger-level', type=int, default=logging.INFO, help='Level of the root logger.') parser.add_argument('--monitor', action='store_true', help='Wrap env with gym.wrappers.Monitor.') parser.add_argument('--log-interval', type=int, default=1000, help='Interval in timesteps between outputting log' ' messages during training') parser.add_argument('--update-interval', type=int, default=2048, help='Interval in timesteps between model updates.') parser.add_argument('--epochs', type=int, default=10, help='Number of epochs to update model for per PPO' ' iteration.') parser.add_argument('--batch-size', type=int, default=64, help='Minibatch size') args = parser.parse_args() logging.basicConfig(level=args.logger_level) # Set a random seed used in ChainerRL misc.set_random_seed(args.seed, gpus=(args.gpu,)) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. 
process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs assert process_seeds.max() < 2 ** 32 args.outdir = experiments.prepare_output_dir(args, args.outdir) def make_env(process_idx, test): env = gym.make(args.env) # Use different random seeds for train and test envs process_seed = int(process_seeds[process_idx]) env_seed = 2 ** 32 - 1 - process_seed if test else process_seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = chainerrl.wrappers.CastObservationToFloat32(env) if args.monitor: env = gym.wrappers.Monitor(env, args.outdir) if args.render: env = chainerrl.wrappers.Render(env) return env def make_batch_env(test): return chainerrl.envs.MultiprocessVectorEnv( [functools.partial(make_env, idx, test) for idx, env in enumerate(range(args.num_envs))]) # Only for getting timesteps, and obs-action spaces sample_env = gym.make(args.env) timestep_limit = sample_env.spec.tags.get( 'wrapper_config.TimeLimit.max_episode_steps') obs_space = sample_env.observation_space action_space = sample_env.action_space print('Observation space:', obs_space) print('Action space:', action_space) assert isinstance(action_space, gym.spaces.Box) # Normalize observations based on their empirical mean and variance obs_normalizer = chainerrl.links.EmpiricalNormalization( obs_space.low.size, clip_threshold=5) # While the original paper initialized weights by normal distribution, # we use orthogonal initialization as the latest openai/baselines does. winit = chainerrl.initializers.Orthogonal(1.) winit_last = chainerrl.initializers.Orthogonal(1e-2) action_size = action_space.low.size policy = chainer.Sequential( L.Linear(None, 64, initialW=winit), F.tanh, L.Linear(None, 64, initialW=winit), F.tanh, L.Linear(None, action_size, initialW=winit_last), chainerrl.policies.GaussianHeadWithStateIndependentCovariance( action_size=action_size, var_type='diagonal', var_func=lambda x: F.exp(2 * x), # Parameterize log std var_param_init=0, # log std = 0 => std = 1 ), ) vf = chainer.Sequential( L.Linear(None, 64, initialW=winit), F.tanh, L.Linear(None, 64, initialW=winit), F.tanh, L.Linear(None, 1, initialW=winit), ) # Combine a policy and a value function into a single model model = chainerrl.links.Branched(policy, vf) opt = chainer.optimizers.Adam(3e-4, eps=1e-5) opt.setup(model) agent = PPO( model, opt, obs_normalizer=obs_normalizer, gpu=args.gpu, update_interval=args.update_interval, minibatch_size=args.batch_size, epochs=args.epochs, clip_eps_vf=None, entropy_coef=0, standardize_advantages=True, gamma=0.995, lambd=0.97, ) if args.load: agent.load(args.load) if args.demo: env = make_batch_env(True) eval_stats = experiments.eval_performance( env=env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs, max_episode_len=timestep_limit) print('n_runs: {} mean: {} median: {} stdev {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: experiments.train_agent_batch_with_evaluation( agent=agent, env=make_batch_env(False), eval_env=make_batch_env(True), outdir=args.outdir, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, log_interval=args.log_interval, max_episode_len=timestep_limit, save_best_so_far_agent=False, )
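# The Gaussian policy head above keeps a state-independent parameter interpreted
# as the log standard deviation: var_param_init=0 means log(std) = 0, i.e.
# std = 1, and var_func=lambda x: F.exp(2 * x) maps that parameter to the
# variance, since var = std**2 = exp(2 * log(std)).  A small numeric sketch of
# the mapping (plain numpy, outside the Chainer graph).
import numpy as np

log_std = 0.0                    # var_param_init=0 in the head above
std = np.exp(log_std)            # -> 1.0
var = np.exp(2 * log_std)        # what var_func computes -> 1.0

# a smaller log std gives a tighter policy
print(np.exp(2 * -1.0))          # variance ~= 0.135, i.e. std ~= 0.368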
        chainerrl.distribution.SoftmaxDistribution,
    ),
    L.Linear(None, 1),
))

opt = chainer.optimizers.Adam()
opt.setup(model)
opt.add_hook(chainer.optimizer.GradientClipping(0.5))


def phi(x):
    # Feature extractor
    return np.asarray(x, dtype=np.float32)


agent = PPO(model, opt, phi=phi)

# experiments.train_agent_with_evaluation(
#     agent=agent,
#     steps=2000,
#     env=env,
#     eval_n_steps=None,
#     eval_max_episode_len=100,
#     eval_n_episodes=5,
#     eval_interval=3,
#     outdir="test2"
# )

# Set the discount factor that discounts future rewards.
gamma = 0.95
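# The chunk above starts mid-definition, so only the tail of a policy/value
# model survives (a SoftmaxDistribution head and an L.Linear(None, 1) value
# output).  Below is a hedged reconstruction of what such a discrete-action
# model typically looks like with the Branched policy/value pattern used
# elsewhere in this file; the layer sizes and n_actions are placeholders, not
# the original values.
import chainer
import chainer.functions as F
import chainer.links as L
import chainerrl

n_actions = 4  # placeholder: the environment's number of discrete actions

model = chainerrl.links.Branched(
    # policy head: logits -> categorical action distribution
    chainer.Sequential(
        L.Linear(None, 64), F.tanh,
        L.Linear(None, n_actions),
        chainerrl.distribution.SoftmaxDistribution,
    ),
    # value head
    chainer.Sequential(
        L.Linear(None, 64), F.tanh,
        L.Linear(None, 1),
    ),
)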
def main(): # Prevent numpy from using multiple threads os.environ['OMP_NUM_THREADS'] = '1' import logging logging.basicConfig(level=logging.DEBUG) parser = argparse.ArgumentParser() parser.add_argument('rom', type=str) parser.add_argument('--gpu', type=int, default=0) parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 31)') parser.add_argument('--outdir', type=str, default=None) parser.add_argument('--use-sdl', action='store_true') parser.add_argument('--max-episode-len', type=int, default=10000) parser.add_argument('--profile', action='store_true') parser.add_argument('--steps', type=int, default=8 * 10**7) parser.add_argument('--lr', type=float, default=2.5e-4) parser.add_argument('--eval-interval', type=int, default=10**6) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--standardize-advantages', action='store_true') parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') # In the original paper, agent runs in 8 environments parallely # and samples 128 steps per environment. # Sample 128 * 8 steps, instead. parser.add_argument('--update-interval', type=int, default=128 * 8) parser.add_argument('--batchsize', type=int, default=32) parser.add_argument('--epochs', type=int, default=3) parser.set_defaults(use_sdl=False) args = parser.parse_args() # Set a random seed used in ChainerRL. misc.set_random_seed(args.seed, gpus=(args.gpu, )) args.outdir = experiments.prepare_output_dir(args, args.outdir) print('Output files are saved in {}'.format(args.outdir)) n_actions = ale.ALE(args.rom).number_of_actions model = A3CFF(n_actions) opt = chainer.optimizers.Adam(alpha=args.lr) opt.setup(model) opt.add_hook(chainer.optimizer.GradientClipping(40)) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) agent = PPO( model, opt, gpu=args.gpu, phi=dqn_phi, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps=0.1, clip_eps_vf=None, standardize_advantages=args.standardize_advantages, ) if args.load: agent.load(args.load) def make_env(test): # Use different random seeds for train and test envs env_seed = 2**31 - 1 - args.seed if test else args.seed env = ale.ALE(args.rom, use_sdl=args.use_sdl, treat_life_lost_as_terminal=not test, seed=env_seed) if not test: misc.env_modifiers.make_reward_clipped(env, -1, 1) return env if args.demo: env = make_env(True) eval_stats = experiments.eval_performance(env=env, agent=agent, n_runs=args.eval_n_runs) print('n_runs: {} mean: {} median: {} stdev: {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.alpha = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) # Linearly decay the clipping parameter to zero def clip_eps_setter(env, agent, value): agent.clip_eps = value clip_eps_decay_hook = experiments.LinearInterpolationHook( args.steps, 0.1, 0, clip_eps_setter) experiments.train_agent_with_evaluation( agent=agent, env=make_env(False), eval_env=make_env(True), outdir=args.outdir, steps=args.steps, eval_n_runs=args.eval_n_runs, eval_interval=args.eval_interval, max_episode_len=args.max_episode_len, step_hooks=[ lr_decay_hook, clip_eps_decay_hook, ], )
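# For training, the ALE env above is modified with
# misc.env_modifiers.make_reward_clipped(env, -1, 1), i.e. rewards are clipped
# to [-1, 1] before the agent sees them.  A minimal gym-style wrapper with the
# same effect, shown only to illustrate what the modifier does; it is not the
# ChainerRL implementation.
import gym
import numpy as np


class ClipReward(gym.RewardWrapper):
    """Clip rewards to [low, high], as done for the training env above."""

    def __init__(self, env, low=-1.0, high=1.0):
        super().__init__(env)
        self.low = low
        self.high = high

    def reward(self, reward):
        return float(np.clip(reward, self.low, self.high))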
def main(args, train_env): logging.basicConfig(level=args.logger_level) # Set a random seed used in ChainerRL misc.set_random_seed(args.seed, gpus=(args.gpu, )) if not (args.demo and args.load): args.outdir = experiments.prepare_output_dir(args, args.outdir) temp = args.outdir.split('/')[-1] dst = args.outdir.strip(temp) def make_env(test): env = gym.make(args.env) if test: episode_length = args.eval_episode_length else: episode_length = args.episode_length env.initialize_environment( case=args.state_rep, n_historical_events=args.n_historical_events, episode_length=episode_length, n_experts=args.n_experts, n_demos_per_expert=1, n_expert_time_steps=args.length_expert_TS, seed_agent=args.seed_agent, seed_expert=args.seed_expert, adam_days=args.adam_days) # Use different random seeds for train and test envs env_seed = 2**32 - 1 - args.seed if test else args.seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = chainerrl.wrappers.CastObservationToFloat32(env) if args.monitor: env = gym.wrappers.Monitor(env, args.outdir) if not test: # Scale rewards (and thus returns) to a reasonable range so that # training is easier env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor) if args.render: env = chainerrl.wrappers.Render(env) return env sample_env = gym.make(args.env) sample_env.initialize_environment( case=args.state_rep, n_historical_events=args.n_historical_events, episode_length=args.episode_length, n_experts=args.n_experts, n_demos_per_expert=1, n_expert_time_steps=args.length_expert_TS, seed_agent=args.seed_agent, seed_expert=args.seed_expert, adam_days=args.adam_days) demonstrations = sample_env.generate_expert_trajectories(out_dir=dst, eval=False) timestep_limit = None #sample_env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps') # This value is None # Generate expert data for evaluation temp_env = gym.make(args.env) temp_env.initialize_environment( case=args.state_rep, n_historical_events=args.n_historical_events, episode_length= 0, # This parameter does not really matter since we create this env only for generating samples n_experts=args.n_experts, n_demos_per_expert=1, # We do not perform any clustering right now # n_demos_per_expert=args.n_demos_per_expert, # How large should the expert cluster be? n_expert_time_steps=args. eval_episode_length, # How long should each expert trajectory be? 
seed_expert=args.seed_expert, adam_days=args.adam_days) temp_env.generate_expert_trajectories(out_dir=dst, eval=True) obs_space = sample_env.observation_space action_space = sample_env.action_space # Normalize observations based on their empirical mean and variance if args.state_rep == 1: obs_dim = obs_space.low.size elif args.state_rep == 2 or args.state_rep == 21 or args.state_rep == 22 or args.state_rep == 24 or args.state_rep == 4 or args.state_rep == 221 or args.state_rep == 222 \ or args.state_rep == 71 or args.state_rep == 17 or args.state_rep == 81: obs_dim = obs_space.n elif args.state_rep == 3 or args.state_rep == 11 or args.state_rep == 23 or args.state_rep == 31 or args.state_rep == 7: obs_dim = obs_space.nvec.size else: raise NotImplementedError if args.normalize_obs: obs_normalizer = chainerrl.links.EmpiricalNormalization( obs_dim, clip_threshold=5) # shape: Shape of input values except batch axis else: obs_normalizer = None # Switch policy types accordingly to action space types if args.arch == 'FFSoftmax': model = A3CFFSoftmax(obs_dim, action_space.n, hidden_sizes=args.G_layers) elif args.arch == 'FFMellowmax': model = A3CFFMellowmax(obs_space.low.size, action_space.n) elif args.arch == 'FFGaussian': model = A3CFFGaussian(obs_space.low.size, action_space, bound_mean=args.bound_mean) opt = chainer.optimizers.Adam(alpha=args.lr, eps=10e-1) opt.setup(model) if args.show_D_dummy: # Let discriminator see dummy input_dim_D = obs_dim + 1 elif not args.show_D_dummy: # Do not let discriminator see dummy if args.state_rep == 21 or args.state_rep == 17: input_dim_D = obs_dim + 1 else: input_dim_D = obs_dim + 1 - args.n_experts if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) if args.algo == 'ppo': agent = PPO( model, opt, obs_normalizer=obs_normalizer, gpu=args.gpu, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps_vf=None, entropy_coef=args.entropy_coef, standardize_advantages=args.standardize_advantages, ) elif args.algo == 'gail': from customer_behaviour.algorithms.irl.gail import GAIL as G from customer_behaviour.algorithms.irl.gail import Discriminator as D demonstrations = np.load(dst + '/expert_trajectories.npz') D = D(gpu=args.gpu, input_dim=input_dim_D, hidden_sizes=args.D_layers, loss_type=args.loss_type) agent = G(env=sample_env, demonstrations=demonstrations, discriminator=D, model=model, optimizer=opt, obs_normalizer=obs_normalizer, gpu=args.gpu, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps_vf=None, entropy_coef=args.entropy_coef, standardize_advantages=args.standardize_advantages, args=args) elif args.algo == 'airl': from customer_behaviour.algorithms.irl.airl import AIRL as G from customer_behaviour.algorithms.irl.airl import Discriminator as D # obs_normalizer = None demonstrations = np.load(dst + '/expert_trajectories.npz') D = D(gpu=args.gpu, input_dim=input_dim_D - 1, hidden_sizes=args.D_layers) # AIRL only inputs state to D agent = G(env=sample_env, demonstrations=demonstrations, discriminator=D, model=model, optimizer=opt, obs_normalizer=obs_normalizer, gpu=args.gpu, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps_vf=None, entropy_coef=args.entropy_coef, standardize_advantages=args.standardize_advantages, noise=args.noise, n_experts=args.n_experts, episode_length=args.episode_length, adam_days=args.adam_days, dummy_D=args.show_D_dummy) elif args.algo == 'mmct-gail': from 
customer_behaviour.algorithms.irl.gail.mmct_gail import MMCTGAIL as G from customer_behaviour.algorithms.irl.gail import Discriminator as D demonstrations = np.load(dst + '/expert_trajectories.npz') D = D(gpu=args.gpu, input_dim=input_dim_D, hidden_sizes=args.D_layers, loss_type=args.loss_type) agent = G(env=sample_env, demonstrations=demonstrations, discriminator=D, model=model, optimizer=opt, obs_normalizer=obs_normalizer, gpu=args.gpu, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps_vf=None, entropy_coef=args.entropy_coef, standardize_advantages=args.standardize_advantages, args=args) if args.load: # By default, not in here agent.load(args.load) if args.demo: # By default, not in here env = make_env(True) eval_stats = experiments.eval_performance( env=env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs, max_episode_len=timestep_limit) print('n_runs: {} mean: {} median: {} stdev {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) outdir = args.load if args.load else args.outdir save_agent_demo(make_env(False), agent, outdir) else: # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.alpha = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) # Linearly decay the clipping parameter to zero def clip_eps_setter(env, agent, value): agent.clip_eps = max(value, 1e-8) clip_eps_decay_hook = experiments.LinearInterpolationHook( args.steps, 0.2, 0, clip_eps_setter) if train_env is None: experiments.train_agent_with_evaluation( agent=agent, env=make_env( False ), # Environment train the agent against (False -> scaled rewards) eval_env=make_env(True), # Environment used for evaluation outdir=args.outdir, steps=args. steps, # Total number of timesteps for training (args.n_training_episodes*args.episode_length) eval_n_steps= None, # Number of timesteps at each evaluation phase eval_n_episodes=args. eval_n_runs, # Number of episodes at each evaluation phase (default: 10) eval_interval=args. 
eval_interval, # Interval of evaluation (defualt: 10000 steps (?)) train_max_episode_len= timestep_limit, # Maximum episode length during training (is None) save_best_so_far_agent=False, step_hooks=[ lr_decay_hook, clip_eps_decay_hook, ], checkpoint_freq=args.eval_interval) else: experiments.train_agent_batch_with_evaluation( agent=agent, env=train_env, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, outdir=args.outdir, max_episode_len=timestep_limit, eval_max_episode_len=None, eval_env=make_env(True), step_hooks=[ lr_decay_hook, clip_eps_decay_hook, ], save_best_so_far_agent=False, checkpoint_freq=args.eval_interval, log_interval=args.update_interval) save_agent_demo( make_env(True), agent, args.outdir, 10 * args.eval_episode_length ) # originally it was make_env(test=False) which seems strange # Move result files to correct folder and remove empty folder move_dir(args.outdir, dst) os.rmdir(args.outdir) if args.save_results: print('Saving result...') res2.save_data(dst, 10000, 50, N=1) print('Running evaluate policy...') ep.eval_policy(a_dir_path=dst) # else: # if args.n_experts <= 10: # print('Running evaluate policy...') # ep.eval_policy(a_dir_path=dst) # # print('Running evaluate training...') # # ets.eval_training(a_dir_path=dst) # print('Done') if args.save_report_material: print('Saving dataframe...') if args.state_rep == 21: if args.algo == 'gail': folder_name = 'gail' elif args.algo == 'airl': folder_name = 'airl' elif args.state_rep == 22: if args.algo == 'gail': folder_name = 'gail_dummies' elif args.algo == 'airl': folder_name = 'airl_dummies' elif args.state_rep == 81: if args.algo == 'gail': folder_name = 'gail_adams' elif args.algo == 'airl': folder_name = 'airl_adams' elif args.state_rep == 17: folder_name = 'ail' elif args.state_rep == 221: folder_name = 'ail_dummies' elif args.state_rep == 71: folder_name = 'ail_adams' report_material.save_df(dst, folder_name) if args.save_folder is not None: print('Saving result to ' + args.save_folder) os.makedirs(os.path.join(os.getcwd(), args.save_folder), exist_ok=True) from distutils.dir_util import copy_tree copy_tree( os.path.join(os.getcwd(), dst), os.path.join(os.getcwd(), args.save_folder, args.outdir.split('/')[-2]))
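# The GAIL/AIRL branches above load expert data with
# np.load(dst + '/expert_trajectories.npz'), as written by
# generate_expert_trajectories.  A quick sketch for inspecting such an archive
# before training; the array names inside the file are not shown in this
# script, so the snippet just lists whatever keys are present.
import numpy as np

demonstrations = np.load('expert_trajectories.npz')  # path relative to dst above
for name in demonstrations.files:
    print(name, demonstrations[name].shape, demonstrations[name].dtype)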