def main(): """ Paperboy entry point - parse the arguments and run a command """ parser = argparse.ArgumentParser( description='Paperboy deep learning launcher') parser.add_argument('config', metavar='FILENAME', help='Configuration file for the run') parser.add_argument('command', metavar='COMMAND', help='A command to run') parser.add_argument('varargs', nargs='*', metavar='VARARGS', help='Extra options to the command') parser.add_argument('-r', '--run_number', type=int, default=0, help="A run number") parser.add_argument('-d', '--device', default='cuda', help="A device to run the model on") parser.add_argument('-s', '--seed', type=int, default=None, help="Random seed for the project") parser.add_argument('-p', '--param', type=str, metavar='NAME=VALUE', action='append', default=[], help="Configuration parameters") parser.add_argument('--reset', action='store_true', default=False, help="Overwrite existing model storage") args = parser.parse_args() model_config = ModelConfig.from_file( args.config, args.run_number, reset=args.reset, device=args.device, seed=args.seed, params={ k: v for (k, v) in (Parser.parse_equality(eq) for eq in args.param) }) # Set seed already in the launcher set_seed(model_config.seed) model_config.banner(args.command) model_config.run_command(args.command, args.varargs) model_config.quit_banner()
def pivoting_rl(args):
    device = torch.device('cuda:' + str(args.gpu) if torch.cuda.is_available() else 'cpu')
    seed = 1002

    # Set random seed in python std lib, numpy and pytorch
    set_seed(seed)

    vec_env = DummyVecEnvWrapper(
        MujocoEnv('HalfCheetah-v2')
    ).instantiate(parallel_envs=1, seed=seed)

    if args.algo == 'ddpg':
        model, reinforcer = get_ddpg(vec_env, device)
    elif args.algo == 'ppo':
        model, reinforcer = get_ppo(vec_env, device)
    else:
        raise ValueError("Unknown algo: {}".format(args.algo))

    # Optimizer helper - weird regularization settings I've copied from OpenAI code
    adam_optimizer = AdamFactory(
        lr=[1.0e-4, 1.0e-3, 1.0e-3],
        weight_decay=[0.0, 0.0, 0.001],
        eps=1.0e-4,
        layer_groups=True
    ).instantiate(model)

    # Overall information store for training information
    training_info = TrainingInfo(
        metrics=[
            EpisodeRewardMetric('episode_rewards'),  # Calculate average reward from episode
        ],
        callbacks=[StdoutStreaming()]  # Print live metrics every epoch to standard output
    )

    # A bit of training initialization bookkeeping...
    training_info.initialize()
    reinforcer.initialize_training(training_info)
    training_info.on_train_begin()

    # Let's make 1000 batches per epoch to average metrics nicely
    num_epochs = int(1.0e6 / 2 / 1000)

    # Normal handrolled training loop
    for i in range(1, num_epochs + 1):
        epoch_info = EpochInfo(
            training_info=training_info,
            global_epoch_idx=i,
            batches_per_epoch=1000,
            optimizer=adam_optimizer
        )

        reinforcer.train_epoch(epoch_info)

    training_info.on_train_end()
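# A hypothetical entry point for pivoting_rl. The attribute names (`gpu`, `algo`)
# match what the function reads above, but this argparse wiring itself is an
# assumption for illustration:
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Pivoting RL experiment')
    parser.add_argument('--gpu', type=int, default=0, help='CUDA device index')
    parser.add_argument('--algo', choices=['ddpg', 'ppo'], default='ddpg', help='Algorithm to run')

    pivoting_rl(parser.parse_args())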
def half_cheetah_ddpg():
    device = torch.device('cuda:0')
    seed = 1002

    # Set random seed in python std lib, numpy and pytorch
    set_seed(seed)

    vec_env = DummyVecEnvWrapper(MujocoEnv('HalfCheetah-v2')).instantiate(parallel_envs=1, seed=seed)

    model_factory = DeterministicPolicyModelFactory(
        input_block=NormalizeObservationsFactory(input_shape=17),
        policy_backbone=MLPFactory(input_length=17, hidden_layers=[64, 64], activation='tanh'),
        value_backbone=MLPFactory(input_length=23, hidden_layers=[64, 64], activation='tanh'),
    )

    model = model_factory.instantiate(action_space=vec_env.action_space)

    reinforcer = BufferedOffPolicyIterationReinforcer(
        device=device,
        environment=vec_env,
        settings=BufferedOffPolicyIterationReinforcerSettings(
            rollout_steps=2,
            training_steps=64,
        ),
        model=model,
        algo=DeepDeterministicPolicyGradient(
            model_factory=model_factory,
            discount_factor=0.99,
            tau=0.01,
        ),
        env_roller=TransitionReplayEnvRoller(
            environment=vec_env,
            device=device,
            action_noise=OuNoise(std_dev=0.2, environment=vec_env),
            replay_buffer=CircularReplayBuffer(
                buffer_capacity=1_000_000,
                buffer_initial_size=2_000,
                num_envs=vec_env.num_envs,
                observation_space=vec_env.observation_space,
                action_space=vec_env.action_space
            ),
            normalize_returns=True,
            discount_factor=0.99
        ),
    )
def half_cheetah_ddpg():
    device = torch.device('cuda:0')
    seed = 1002

    # Set random seed in python std lib, numpy and pytorch
    set_seed(seed)

    env = MujocoEnv('HalfCheetah-v2').instantiate(seed=seed)

    model_factory = DeterministicPolicyModelFactory(
        policy_backbone=MLPFactory(input_length=17, hidden_layers=[64, 64], activation='tanh'),
        value_backbone=MLPFactory(input_length=23, hidden_layers=[64, 64], activation='tanh'),
    )

    model = model_factory.instantiate(action_space=env.action_space)

    reinforcer = BufferedSingleOffPolicyIterationReinforcer(
        device=device,
        settings=BufferedSingleOffPolicyIterationReinforcerSettings(
            batch_rollout_rounds=100,
            batch_training_rounds=50,
            batch_size=64,
            discount_factor=0.99
        ),
        environment=env,
        model=model,
        algo=DeepDeterministicPolicyGradient(
            model_factory=model_factory,
            tau=0.01,
        ),
        env_roller=DequeReplayRollerOuNoise(
            environment=env,
            device=device,
            batch_size=64,
            buffer_capacity=1_000_000,
            buffer_initial_size=2_000,
            noise_std_dev=0.2,
            normalize_observations=True,
            normalize_returns=True,
            discount_factor=0.99
        )
    )
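# Every example here calls set_seed before building environments. A minimal sketch
# of such a helper, assuming it seeds exactly the three RNG sources the comments
# name - the real vel.util.random.set_seed may do more:
import random

import numpy as np
import torch


def set_seed(seed: int) -> None:
    """ Set random seed in python std lib, numpy and pytorch """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)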
def eval_model():
    """ Load checkpoint data and evaluate the model's performance

    :return: None
    """
    device = torch.device('cpu')
    seed = 1001

    # Set random seed in python std lib, numpy and pytorch
    set_seed(seed)

    env_function = lambda: ColoredEgoCostmapRandomAisleTurnEnv()
    vec_env = DummyVecEnv([env_function])
    vec_env.reset()

    model = PolicyGradientModelFactory(
        backbone=NatureCnnTwoTowerFactory(input_width=133, input_height=133, input_channels=1)
    ).instantiate(action_space=vec_env.action_space)

    model_checkpoint = torch.load('tmp_checkout.data', map_location='cpu')
    model.load_state_dict(model_checkpoint)

    evaluate_model(model, vec_env, device, takes=10)
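# `evaluate_model` is used above with the signature (model, vec_env, device, takes).
# A minimal sketch of such an evaluation loop; the model.step() interface returning
# an 'actions' entry and the single sub-environment are assumptions - the real
# helper may differ:
import numpy as np
import torch


def evaluate_model(model, vec_env, device, takes=10):
    """ Run the policy for `takes` episodes and return the mean episode reward """
    model.eval()
    episode_rewards = []

    for _ in range(takes):
        observations = vec_env.reset()
        done = False
        total_reward = 0.0

        while not done:
            with torch.no_grad():
                obs_tensor = torch.from_numpy(np.asarray(observations)).float().to(device)
                actions = model.step(obs_tensor)['actions'].cpu().numpy()

            observations, rewards, dones, _ = vec_env.step(actions)
            total_reward += float(rewards[0])
            done = bool(dones[0])

        episode_rewards.append(total_reward)

    model.train()
    return float(np.mean(episode_rewards))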
def train_model():
    """ A sample training script that creates a PPO instance and trains it on a bc-gym environment

    :return: None
    """
    device = torch.device('cpu')
    seed = 1001

    # Set random seed in python std lib, numpy and pytorch
    set_seed(seed)

    env_function = lambda: ColoredEgoCostmapRandomAisleTurnEnv()
    vec_env = DummyVecEnv([env_function])

    # Again, use a helper to create a model
    # But because the model is owned by the reinforcer, it should not be accessed via this variable
    # but through the reinforcer.model property
    model = PolicyGradientModelFactory(
        backbone=NatureCnnTwoTowerFactory(input_width=133, input_height=133, input_channels=1)
    ).instantiate(action_space=vec_env.action_space)

    # Schedule for the PPO surrogate objective clip range (not gradient clipping)
    cliprange = LinearSchedule(initial_value=0.01, final_value=0.0)

    # Reinforcer - an object managing the learning process
    reinforcer = OnPolicyIterationReinforcer(
        device=device,
        settings=OnPolicyIterationReinforcerSettings(
            discount_factor=0.99,
            batch_size=256,
            experience_replay=4
        ),
        model=model,
        algo=PpoPolicyGradient(
            entropy_coefficient=0.01,
            value_coefficient=0.5,
            max_grad_norm=0.01,
            cliprange=cliprange
        ),
        env_roller=StepEnvRoller(
            environment=vec_env,
            device=device,
            gae_lambda=0.95,
            number_of_steps=128,
            discount_factor=0.99,
        )
    )

    # Model optimizer
    optimizer = optim.Adam(reinforcer.model.parameters(), lr=1e-6, eps=1.0e-5)

    # Overall information store for training information
    training_info = TrainingInfo(
        metrics=[
            EpisodeRewardMetric('episode_rewards'),  # Calculate average reward from episode
        ],
        callbacks=[
            StdoutStreaming(),  # Print live metrics every epoch to standard output
            FrameTracker(1.1e8)  # We need a frame tracker to track the progress of learning
        ]
    )

    # A bit of training initialization bookkeeping...
    training_info.initialize()
    reinforcer.initialize_training(training_info)
    training_info.on_train_begin()

    # Let's make 10 batches per epoch to average metrics nicely
    # Rollout size is 1 environment times 128 steps
    num_epochs = int(1.1e8 / (128 * 1) / 10)

    # Normal handrolled training loop
    eval_results = []

    for i in range(1, num_epochs + 1):
        epoch_info = EpochInfo(
            training_info=training_info,
            global_epoch_idx=i,
            batches_per_epoch=10,
            optimizer=optimizer
        )

        reinforcer.train_epoch(epoch_info)

        eval_result = evaluate_model(model, vec_env, device, takes=1)
        eval_results.append(eval_result)

        if i % 100 == 0:
            torch.save(model.state_dict(), 'tmp_checkout.data')

            with open('tmp_eval_results.pkl', 'wb') as f:
                pickle.dump(eval_results, f, 0)

    training_info.on_train_end()
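# `LinearSchedule(initial_value=..., final_value=...)` is used above to anneal the
# PPO clip range over training. A minimal sketch of such a schedule, assuming it is
# queried with a training-progress fraction in [0, 1] - the real vel class may
# expose a different query method:
class LinearSchedule:
    """ Interpolate linearly between initial_value and final_value over training """

    def __init__(self, initial_value: float, final_value: float):
        self.initial_value = initial_value
        self.final_value = final_value

    def value(self, progress: float) -> float:
        """ Schedule value at `progress`, where 0.0 is the start and 1.0 the end of training """
        return self.initial_value + (self.final_value - self.initial_value) * progress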
def breakout_a2c():
    device = torch.device('cuda:0')
    seed = 1001

    # Set random seed in python std lib, numpy and pytorch
    set_seed(seed)

    # Create 16 environments evaluated in parallel in subprocesses with all the usual DeepMind wrappers
    # These are just helper functions for that
    vec_env = SubprocVecEnvWrapper(
        ClassicAtariEnv('BreakoutNoFrameskip-v4'), frame_history=4
    ).instantiate(parallel_envs=16, seed=seed)

    # Again, use a helper to create a model
    # But because the model is owned by the reinforcer, it should not be accessed via this variable
    # but through the reinforcer.model property
    model = PolicyGradientModelFactory(
        backbone=NatureCnnFactory(input_width=84, input_height=84, input_channels=4)
    ).instantiate(action_space=vec_env.action_space)

    # Reinforcer - an object managing the learning process
    reinforcer = OnPolicyIterationReinforcer(
        device=device,
        settings=OnPolicyIterationReinforcerSettings(
            discount_factor=0.99,
            batch_size=256,
        ),
        model=model,
        algo=A2CPolicyGradient(
            entropy_coefficient=0.01,
            value_coefficient=0.5,
            max_grad_norm=0.5
        ),
        env_roller=StepEnvRoller(
            environment=vec_env,
            device=device,
            number_of_steps=5,
            discount_factor=0.99,
        )
    )

    # Model optimizer
    optimizer = optim.RMSprop(reinforcer.model.parameters(), lr=7.0e-4, eps=1e-3)

    # Overall information store for training information
    training_info = TrainingInfo(
        metrics=[
            EpisodeRewardMetric('episode_rewards'),  # Calculate average reward from episode
        ],
        callbacks=[StdoutStreaming()]  # Print live metrics every epoch to standard output
    )

    # A bit of training initialization bookkeeping...
    training_info.initialize()
    reinforcer.initialize_training(training_info)
    training_info.on_train_begin()

    # Let's make 100 batches per epoch to average metrics nicely
    num_epochs = int(1.1e7 / (5 * 16) / 100)

    # Normal handrolled training loop
    for i in range(1, num_epochs + 1):
        epoch_info = EpochInfo(
            training_info=training_info,
            global_epoch_idx=i,
            batches_per_epoch=100,
            optimizer=optimizer
        )

        reinforcer.train_epoch(epoch_info)

    training_info.on_train_end()
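# The epoch arithmetic above, spelled out (1.1e7 is the total frame budget):
#   frames_per_batch = number_of_steps * parallel_envs   = 5 * 16   = 80
#   frames_per_epoch = frames_per_batch * batches_per_epoch = 80 * 100 = 8_000
#   num_epochs       = 11_000_000 / 8_000 = 1_375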
def main(): """ Paperboy entry point - parse the arguments and run a command """ parser = argparse.ArgumentParser( description='Paperboy deep learning launcher') parser.add_argument('config', metavar='FILENAME', help='Configuration file for the run') parser.add_argument('command', metavar='COMMAND', help='A command to run') parser.add_argument('varargs', nargs='*', metavar='VARARGS', help='Extra options to the command') parser.add_argument('-r', '--run_number', type=int, default=0, help="A run number") parser.add_argument('-d', '--device', default='cuda', help="A device to run the model on") parser.add_argument('-s', '--seed', type=int, default=None, help="Random seed for the project") parser.add_argument('-p', '--param', type=str, metavar='NAME=VALUE', action='append', default=[], help="Configuration parameters") parser.add_argument('--continue', action='store_true', default=False, help="Continue previously started learning process") parser.add_argument('--profile', type=str, default=None, help="Profiler output") args = parser.parse_args() model_config = ModelConfig.from_file( args.config, args.run_number, continue_training=getattr(args, 'continue'), device=args.device, seed=args.seed, params={ k: v for (k, v) in (Parser.parse_equality(eq) for eq in args.param) }) if model_config.project_dir not in sys.path: sys.path.append(model_config.project_dir) multiprocessing_setting = model_config.provide_with_default( 'multiprocessing', default=None) if multiprocessing_setting: # This needs to be called before any of PyTorch module is imported multiprocessing.set_start_method(multiprocessing_setting) # Set seed already in the launcher from vel.util.random import set_seed set_seed(model_config.seed) model_config.banner(args.command) if args.profile: print("[PROFILER] Running Vel in profiling mode, output filename={}". format(args.profile)) import cProfile import pstats profiler = cProfile.Profile() profiler.enable() model_config.run_command(args.command, args.varargs) profiler.disable() profiler.dump_stats(args.profile) profiler.print_stats(sort='tottime') print( "======================================================================" ) pstats.Stats(profiler).strip_dirs().sort_stats('tottime').print_stats( 30) print( "======================================================================" ) pstats.Stats(profiler).strip_dirs().sort_stats('cumtime').print_stats( 30) else: model_config.run_command(args.command, args.varargs) model_config.quit_banner()
def test_acer_breakout():
    """ 1 iteration of ACER on the Breakout environment """
    device = torch.device('cpu')
    seed = 1001

    # Set random seed in python std lib, numpy and pytorch
    set_seed(seed)

    # Create 16 environments evaluated in parallel in subprocesses with all the usual DeepMind wrappers
    # These are just helper functions for that
    vec_env = SubprocVecEnvWrapper(
        ClassicAtariEnv('BreakoutNoFrameskip-v4'), frame_history=4
    ).instantiate(parallel_envs=16, seed=seed)

    # Again, use a helper to create a model
    # But because the model is owned by the reinforcer, it should not be accessed via this variable
    # but through the reinforcer.model property
    model_factory = QPolicyGradientModelFactory(
        backbone=NatureCnnFactory(input_width=84, input_height=84, input_channels=4)
    )

    # Reinforcer - an object managing the learning process
    reinforcer = BufferedMixedPolicyIterationReinforcer(
        device=device,
        settings=BufferedMixedPolicyIterationReinforcerSettings(
            discount_factor=0.99,
            experience_replay=2,
            stochastic_experience_replay=False
        ),
        model=model_factory.instantiate(action_space=vec_env.action_space),
        env=vec_env,
        algo=AcerPolicyGradient(
            model_factory=model_factory,
            entropy_coefficient=0.01,
            q_coefficient=0.5,
            rho_cap=10.0,
            retrace_rho_cap=1.0,
            trust_region=True,
            trust_region_delta=1.0,
            max_grad_norm=10.0,
        ),
        env_roller=ReplayQEnvRoller(
            environment=vec_env,
            device=device,
            number_of_steps=12,
            discount_factor=0.99,
            buffer_capacity=100,
            buffer_initial_size=100,
            frame_stack_compensation=4
        ),
    )

    # Model optimizer
    optimizer = optim.RMSprop(reinforcer.model.parameters(), lr=7.0e-4, eps=1e-3, alpha=0.99)

    # Overall information store for training information
    training_info = TrainingInfo(
        metrics=[
            EpisodeRewardMetric('episode_rewards'),  # Calculate average reward from episode
        ],
        callbacks=[]  # No callbacks needed for this quick smoke test
    )

    # A bit of training initialization bookkeeping...
    training_info.initialize()
    reinforcer.initialize_training(training_info)
    training_info.on_train_begin()

    # A single epoch with a single batch is enough for a smoke test
    num_epochs = 1

    # Normal handrolled training loop
    for i in range(1, num_epochs + 1):
        epoch_info = EpochInfo(
            training_info=training_info,
            global_epoch_idx=i,
            batches_per_epoch=1,
            optimizer=optimizer
        )

        reinforcer.train_epoch(epoch_info, interactive=False)

    training_info.on_train_end()
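# For reference, the importance-weight truncation the parameters above control
# (Wang et al. 2016, ACER): per-step importance ratios rho = pi(a|s) / mu(a|s)
# between the current policy pi and the behavior policy mu are capped. The
# parameter names suggest rho_cap (10.0 here) bounds the ratios in the truncated
# policy-gradient term, while retrace_rho_cap (1.0 here) bounds the ratios inside
# the Retrace target for the Q-function - that mapping is an inference from the
# names, not confirmed by the source.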
def test_trpo_bipedal_walker():
    """ 1 iteration of TRPO on the bipedal walker environment """
    device = torch.device('cpu')
    seed = 1001

    # Set random seed in python std lib, numpy and pytorch
    set_seed(seed)

    vec_env = DummyVecEnvWrapper(
        MujocoEnv('BipedalWalker-v2'), normalize=True
    ).instantiate(parallel_envs=8, seed=seed)

    # Again, use a helper to create a model
    # But because the model is owned by the reinforcer, it should not be accessed via this variable
    # but through the reinforcer.model property
    model_factory = PolicyGradientModelSeparateFactory(
        policy_backbone=MLPFactory(input_length=24, hidden_layers=[32, 32]),
        value_backbone=MLPFactory(input_length=24, hidden_layers=[32])
    )

    # Reinforcer - an object managing the learning process
    reinforcer = OnPolicyIterationReinforcer(
        device=device,
        settings=OnPolicyIterationReinforcerSettings(
            discount_factor=0.99,
        ),
        model=model_factory.instantiate(action_space=vec_env.action_space),
        algo=TrpoPolicyGradient(
            max_kl=0.01,
            cg_iters=10,
            line_search_iters=10,
            improvement_acceptance_ratio=0.1,
            cg_damping=0.1,
            vf_iters=5,
            entropy_coef=0.0,
            max_grad_norm=0.5,
        ),
        env_roller=StepEnvRoller(
            environment=vec_env,
            device=device,
            number_of_steps=12,
            discount_factor=0.99,
        )
    )

    # Model optimizer
    optimizer = optim.Adam(reinforcer.model.parameters(), lr=1.0e-3, eps=1e-4)

    # Overall information store for training information
    training_info = TrainingInfo(
        metrics=[
            EpisodeRewardMetric('episode_rewards'),  # Calculate average reward from episode
        ],
        callbacks=[FrameTracker(100_000)]  # Track frames processed against the 100k frame limit
    )

    # A bit of training initialization bookkeeping...
    training_info.initialize()
    reinforcer.initialize_training(training_info)
    training_info.on_train_begin()

    # A single epoch with a single batch is enough for a smoke test
    num_epochs = 1

    # Normal handrolled training loop
    for i in range(1, num_epochs + 1):
        epoch_info = EpochInfo(
            training_info=training_info,
            global_epoch_idx=i,
            batches_per_epoch=1,
            optimizer=optimizer
        )

        reinforcer.train_epoch(epoch_info, interactive=False)

    training_info.on_train_end()
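# For reference, the TRPO update the parameters above configure: maximize the
# surrogate advantage subject to a trust-region constraint KL(pi_old || pi_new)
# <= max_kl (0.01 here). The step direction is found approximately by cg_iters
# conjugate-gradient iterations on the Fisher system (damped by cg_damping), then
# scaled back by up to line_search_iters backtracking steps until the improvement
# ratio exceeds improvement_acceptance_ratio; the value function is fit separately
# for vf_iters iterations per update.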
def test_ddpg_bipedal_walker():
    """ 1 iteration of DDPG on the bipedal walker environment """
    device = torch.device('cpu')
    seed = 1001

    # Set random seed in python std lib, numpy and pytorch
    set_seed(seed)

    # Only a single environment for DDPG
    env = MujocoEnv('BipedalWalker-v2').instantiate(seed=seed)

    # Again, use a helper to create a model
    # But because the model is owned by the reinforcer, it should not be accessed via this variable
    # but through the reinforcer.model property
    model_factory = DeterministicPolicyModelFactory(
        policy_backbone=MLPFactory(input_length=24, hidden_layers=[64, 64], normalization='layer'),
        value_backbone=MLPFactory(input_length=28, hidden_layers=[64, 64], normalization='layer')
    )

    # Reinforcer - an object managing the learning process
    reinforcer = BufferedSingleOffPolicyIterationReinforcer(
        device=device,
        settings=BufferedSingleOffPolicyIterationReinforcerSettings(
            batch_rollout_rounds=4,
            batch_training_rounds=1,
            batch_size=32,
            discount_factor=0.99
        ),
        environment=env,
        algo=DeepDeterministicPolicyGradient(
            model_factory=model_factory,
            tau=0.01,
            max_grad_norm=0.5
        ),
        model=model_factory.instantiate(action_space=env.action_space),
        env_roller=DequeReplayRollerOuNoise(
            environment=env,
            device=device,
            batch_size=32,
            buffer_capacity=100,
            buffer_initial_size=100,
            noise_std_dev=0.2,
            normalize_observations=True,
            discount_factor=0.99
        )
    )

    # Model optimizer
    optimizer = optim.Adam(reinforcer.model.parameters(), lr=2.5e-4, eps=1e-4)

    # Overall information store for training information
    training_info = TrainingInfo(
        metrics=[
            EpisodeRewardMetric('episode_rewards'),  # Calculate average reward from episode
        ],
        callbacks=[FrameTracker(100_000)]  # Track frames processed against the 100k frame limit
    )

    # A bit of training initialization bookkeeping...
    training_info.initialize()
    reinforcer.initialize_training(training_info)
    training_info.on_train_begin()

    # A single epoch with a single batch is enough for a smoke test
    num_epochs = 1

    # Normal handrolled training loop
    for i in range(1, num_epochs + 1):
        epoch_info = EpochInfo(
            training_info=training_info,
            global_epoch_idx=i,
            batches_per_epoch=1,
            optimizer=optimizer
        )

        reinforcer.train_epoch(epoch_info, interactive=False)

    training_info.on_train_end()
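# For reference, the exploration and target-update machinery the parameters above
# configure (Lillicrap et al. 2015, DDPG): the OuNoise roller adds temporally
# correlated Ornstein-Uhlenbeck noise to the deterministic policy's actions,
#   dx_t = theta * (mu - x_t) * dt + sigma * dW_t,   sigma = noise_std_dev (0.2 here)
# and tau (0.01) is the soft target-network update rate,
#   theta_target <- tau * theta + (1 - tau) * theta_target.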
def test_prioritized_dqn_breakout():
    """ Simple 1 iteration of DQN with prioritized replay on breakout """
    device = torch.device('cpu')
    seed = 1001

    # Set random seed in python std lib, numpy and pytorch
    set_seed(seed)

    # Only a single environment for DQN
    env = ClassicAtariEnv('BreakoutNoFrameskip-v4').instantiate(seed=seed)

    # Again, use a helper to create a model
    # But because the model is owned by the reinforcer, it should not be accessed via this variable
    # but through the reinforcer.model property
    model_factory = QModelFactory(
        backbone=NatureCnnFactory(input_width=84, input_height=84, input_channels=4)
    )

    # Reinforcer - an object managing the learning process
    reinforcer = BufferedSingleOffPolicyIterationReinforcer(
        device=device,
        settings=BufferedSingleOffPolicyIterationReinforcerSettings(
            batch_rollout_rounds=4,
            batch_training_rounds=1,
            batch_size=32,
            discount_factor=0.99
        ),
        environment=env,
        algo=DeepQLearning(
            model_factory=model_factory,
            double_dqn=False,
            target_update_frequency=10_000,
            max_grad_norm=0.5
        ),
        model=model_factory.instantiate(action_space=env.action_space),
        env_roller=PrioritizedReplayRollerEpsGreedy(
            environment=env,
            device=device,
            epsilon_schedule=LinearAndConstantSchedule(
                initial_value=1.0, final_value=0.1, end_of_interpolation=0.1
            ),
            batch_size=8,
            buffer_capacity=100,
            priority_epsilon=1.0e-6,
            buffer_initial_size=100,
            frame_stack=4,
            priority_exponent=0.6,
            priority_weight=LinearSchedule(initial_value=0.4, final_value=1.0),
        ),
    )

    # Model optimizer
    optimizer = optim.RMSprop(reinforcer.model.parameters(), lr=2.5e-4, alpha=0.95, momentum=0.95, eps=1e-3)

    # Overall information store for training information
    training_info = TrainingInfo(
        metrics=[
            EpisodeRewardMetric('episode_rewards'),  # Calculate average reward from episode
        ],
        callbacks=[FrameTracker(100_000)]  # Track frames processed against the 100k frame limit
    )

    # A bit of training initialization bookkeeping...
    training_info.initialize()
    reinforcer.initialize_training(training_info)
    training_info.on_train_begin()

    # A single epoch with a single batch is enough for a smoke test
    num_epochs = 1

    # Normal handrolled training loop
    for i in range(1, num_epochs + 1):
        epoch_info = EpochInfo(
            training_info=training_info,
            global_epoch_idx=i,
            batches_per_epoch=1,
            optimizer=optimizer
        )

        reinforcer.train_epoch(epoch_info, interactive=False)

    training_info.on_train_end()
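# For reference, the prioritized-replay quantities the parameters above map onto
# (Schaul et al. 2015): with priorities p_i = |TD error_i| + priority_epsilon,
#   P(i) = p_i^alpha / sum_k p_k^alpha          # alpha = priority_exponent (0.6 here)
#   w_i  = (N * P(i))^(-beta) / max_j w_j       # beta  = priority_weight, annealed 0.4 -> 1.0
# where P(i) is the sampling probability and w_i the importance-sampling weight
# applied to the loss; the parameter-to-symbol mapping is inferred from the names.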
def qbert_ppo():
    device = torch.device('cuda:0')
    seed = 1001

    # Set random seed in python std lib, numpy and pytorch
    set_seed(seed)

    # Create 8 environments evaluated in parallel in subprocesses with all the usual DeepMind wrappers
    # These are just helper functions for that
    vec_env = SubprocVecEnvWrapper(
        ClassicAtariEnv('QbertNoFrameskip-v4'), frame_history=4
    ).instantiate(parallel_envs=8, seed=seed)

    # Again, use a helper to create a model
    # But because the model is owned by the reinforcer, it should not be accessed via this variable
    # but through the reinforcer.model property
    model = StochasticPolicyModelFactory(
        input_block=ImageToTensorFactory(),
        backbone=NatureCnnFactory(input_width=84, input_height=84, input_channels=4)
    ).instantiate(action_space=vec_env.action_space)

    # Schedule for the PPO surrogate objective clip range (not gradient clipping)
    cliprange = LinearSchedule(initial_value=0.1, final_value=0.0)

    # Reinforcer - an object managing the learning process
    reinforcer = OnPolicyIterationReinforcer(
        device=device,
        settings=OnPolicyIterationReinforcerSettings(
            batch_size=256,
            experience_replay=4,
            number_of_steps=128
        ),
        model=model,
        algo=PpoPolicyGradient(
            entropy_coefficient=0.01,
            value_coefficient=0.5,
            max_grad_norm=0.5,
            discount_factor=0.99,
            gae_lambda=0.95,
            cliprange=cliprange
        ),
        env_roller=StepEnvRoller(
            environment=vec_env,
            device=device,
        )
    )

    # Model optimizer
    optimizer = optim.Adam(reinforcer.model.parameters(), lr=2.5e-4, eps=1.0e-5)

    # Overall information store for training information
    training_info = TrainingInfo(
        metrics=[
            EpisodeRewardMetric('episode_rewards'),  # Calculate average reward from episode
        ],
        callbacks=[
            StdoutStreaming(),  # Print live metrics every epoch to standard output
            FrameTracker(1.1e7)  # We need a frame tracker to track the progress of learning
        ]
    )

    # A bit of training initialization bookkeeping...
    training_info.initialize()
    reinforcer.initialize_training(training_info)
    training_info.on_train_begin()

    # Let's make 10 batches per epoch to average metrics nicely
    # Rollout size is 8 environments times 128 steps
    num_epochs = int(1.1e7 / (128 * 8) / 10)

    # Normal handrolled training loop
    for i in range(1, num_epochs + 1):
        epoch_info = EpochInfo(
            training_info=training_info,
            global_epoch_idx=i,
            batches_per_epoch=10,
            optimizer=optimizer
        )

        reinforcer.train_epoch(epoch_info)

    training_info.on_train_end()
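# Hypothetical script entry point - example files like this one most likely end
# this way, though the original ending is not shown here:
if __name__ == '__main__':
    qbert_ppo()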