def train():
    """
    Train PPO1 model for slime volleyball, in MPI multiprocessing. Tested for 96 CPUs.
    """
    rank = MPI.COMM_WORLD.Get_rank()

    if rank == 0:
        logger.configure(folder=LOGDIR)
    else:
        logger.configure(format_strs=[])

    workerseed = SEED + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = make_env(workerseed)

    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    model = PPO1.load(BEST_MODEL_PATH, env=env)

    eval_callback = EvalCallback(env,
                                 best_model_save_path=LOGDIR,
                                 log_path=LOGDIR,
                                 eval_freq=EVAL_FREQ,
                                 n_eval_episodes=EVAL_EPISODES)

    model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

    env.close()
    del env

    if rank == 0:
        model.save(os.path.join(LOGDIR, "final_model"))  # probably never get to this point.
def train(self, tensorboard_log: str) -> None:
    try:
        self.load_model(tensorboard_log=tensorboard_log)
    except Exception:
        self.create_model(tensorboard_log=tensorboard_log)

    # Stop training if reward gets close to zero
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-0.1, verbose=1)
    eval_callback = EvalCallback(self.env, callback_on_new_best=callback_on_best, verbose=1)

    # Save model at regular time intervals
    checkpoint_callback = CheckpointCallback(save_freq=1000, save_path='./model_checkpoints/')

    # Chain callbacks together
    callback = CallbackList([eval_callback, checkpoint_callback])

    # Train model
    self.model.learn(total_timesteps=int(1e10), callback=callback, tb_log_name="run")

    print("Training is finished!")
def run(model_name, iteration, world, stage):
    world_stage = 'SuperMarioBros-{}-{}-v2'.format(world, stage)
    env = gym_super_mario_bros.make(world_stage)
    env = JoypadSpace(env, RIGHT_ONLY)
    env = WarpFrame(env)
    env = FrameStack(env, n_frames=4)
    env = EpisodicLifeEnv(env)
    # env = MaxAndSkipEnv(env)

    # Save a checkpoint every 5000 steps
    checkpoint_callback = CheckpointCallback(save_freq=5000,
                                             save_path='./logs/',
                                             name_prefix=model_name)
    eval_callback = EvalCallback(env,
                                 best_model_save_path='./logs/',
                                 log_path='./logs/',
                                 eval_freq=10000,
                                 deterministic=True,
                                 render=False)

    print("Compiling model...")
    steps = 10000

    if iteration > 0:
        model = DQN.load('models/{}'.format(model_name),
                         env=env,
                         verbose=1,
                         learning_starts=2500,
                         learning_rate=1e-4,
                         exploration_final_eps=0.01,
                         prioritized_replay=True,
                         prioritized_replay_alpha=0.6,
                         train_freq=4,
                         tensorboard_log="./mario_tensorboard/")
    else:
        model = DQN(CnnPolicy,
                    env,
                    verbose=1,
                    learning_starts=2500,
                    learning_rate=1e-4,
                    exploration_final_eps=0.01,
                    prioritized_replay=True,
                    prioritized_replay_alpha=0.6,
                    train_freq=4,
                    tensorboard_log="./mario_tensorboard/")

    print("Training starting...")
    with ProgressBarManager(steps) as progress_callback:
        model.learn(total_timesteps=steps,
                    callback=[progress_callback],  # , eval_callback, checkpoint_callback],
                    tb_log_name=model_name)
    print("Finished training model on env...\n")
    model.save("models/{}".format(model_name))
def test_callbacks(model_class):
    env_id = 'Pendulum-v0'
    if model_class in [ACER, DQN]:
        env_id = 'CartPole-v1'

    allowed_failures = []
    # Number of training timesteps is too short;
    # otherwise, the training would take too long, or would require
    # custom parameters per algorithm
    if model_class in [PPO1, DQN, TRPO]:
        allowed_failures = ['rollout_end']

    # Create RL model
    model = model_class('MlpPolicy', env_id)

    checkpoint_callback = CheckpointCallback(save_freq=500, save_path=LOG_FOLDER)

    # For testing: use the same training env
    eval_env = model.get_env()
    # Stop training if the performance is good enough
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-1200, verbose=1)

    eval_callback = EvalCallback(eval_env,
                                 callback_on_new_best=callback_on_best,
                                 best_model_save_path=LOG_FOLDER,
                                 log_path=LOG_FOLDER,
                                 eval_freq=100)

    # Equivalent to the `checkpoint_callback`,
    # but here in an event-driven manner
    checkpoint_on_event = CheckpointCallback(save_freq=1, save_path=LOG_FOLDER, name_prefix='event')
    event_callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event)

    callback = CallbackList([checkpoint_callback, eval_callback, event_callback])

    model.learn(500, callback=callback)
    model.learn(200, callback=None)
    custom_callback = CustomCallback()
    model.learn(200, callback=custom_callback)
    # Check that every callback hook was executed
    custom_callback.validate(allowed_failures=allowed_failures)

    # Transform callback into a callback list automatically
    custom_callback = CustomCallback()
    model.learn(500, callback=[checkpoint_callback, eval_callback, custom_callback])
    # Check that every callback hook was executed
    custom_callback.validate(allowed_failures=allowed_failures)

    # Automatic wrapping, old way of doing callbacks
    model.learn(200, callback=lambda _locals, _globals: True)

    # Cleanup
    if os.path.exists(LOG_FOLDER):
        shutil.rmtree(LOG_FOLDER)
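# The `CustomCallback` used above is defined elsewhere in the test module.
# A minimal sketch of what such a callback could look like, built on the real
# stable-baselines `BaseCallback` hooks; the `validate` helper here is a
# hypothetical reconstruction, not part of the library API:
from stable_baselines.common.callbacks import BaseCallback

class CustomCallback(BaseCallback):
    def __init__(self, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        # Record which hooks fired so the test can assert on them later
        self.events = set()

    def _on_training_start(self):
        self.events.add('training_start')

    def _on_rollout_start(self):
        self.events.add('rollout_start')

    def _on_step(self):
        self.events.add('step')
        return True  # returning False would stop training early

    def _on_rollout_end(self):
        self.events.add('rollout_end')

    def _on_training_end(self):
        self.events.add('training_end')

    def validate(self, allowed_failures=()):
        expected = {'training_start', 'rollout_start', 'step', 'rollout_end', 'training_end'}
        missing = expected - self.events - set(allowed_failures)
        assert not missing, "callback hooks never triggered: {}".format(missing)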
def run():
    # folders
    log_dir = "./logs/"
    models_path = "./trained_models/"
    best_model_save_path = models_path + "ppo_sokoban_model"

    # hyperparameters
    gamma = 0.99  # Discount factor
    ent_coef = 0.01  # Entropy coefficient for the loss calculation
    n_envs = 4  # number of environments
    # The number of steps to run for each environment per update
    # (i.e. batch size is n_steps * n_env where n_env is the number of
    # environment copies running in parallel)
    n_steps = 20
    learning_rate = 0.00025  # The learning rate, it can be a function
    vf_coef = 0.5  # Value function coefficient for the loss calculation
    max_grad_norm = 0.5  # The maximum value for the gradient clipping
    lam = 0.95  # Factor for trade-off of bias vs variance for Generalized Advantage Estimator
    timesteps = 100
    verbose = 1
    n_measurements = 10  # number of measurements for the graph
    # interval between callbacks to achieve the desired n_measurements
    eval_callback_freq = 20  # timesteps / n_measurements

    # multiprocess environment
    env = make_vec_env('Boxoban-Train-v1', n_envs=n_envs)
    first_env = env.envs[0]
    first_env = Monitor(first_env, log_dir)

    eval_callback = EvalCallback(first_env,
                                 best_model_save_path=best_model_save_path,
                                 log_path=log_dir,
                                 eval_freq=eval_callback_freq,
                                 deterministic=True,
                                 render=False)

    model = PPO2(MlpPolicy, env,
                 gamma=gamma,
                 ent_coef=ent_coef,
                 n_steps=n_steps,
                 learning_rate=learning_rate,
                 vf_coef=vf_coef,
                 max_grad_norm=max_grad_norm,
                 lam=lam,
                 verbose=1)
    model.learn(total_timesteps=timesteps, callback=eval_callback)
    # model.save("trained_models/ppo2_sokoban_model")  # save model to disk

    # Enjoy trained agent
    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        if rewards[0] > 10:
            print("Completed the puzzle")
        time.sleep(0.1)
        env.render("human")
def init_env(env_id):
    if parallel:
        env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])
        reward_env = SubprocVecEnv([make_env(env_id, i) for i in range(1)])
    else:
        env = DummyVecEnv([make_env(env_id, i) for i in range(num_cpu)])
        reward_env = DummyVecEnv([make_env(env_id, i) for i in range(1)])

    if terminate_early:
        callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=0.85, verbose=verbose)
        eval_callback = EvalCallback(reward_env,
                                     callback_on_new_best=callback_on_best,
                                     eval_freq=10_000,
                                     verbose=verbose)
        return env, reward_env, eval_callback
    else:
        return env, reward_env, None
def train(self):
    # Load latest model if available
    try:
        path = os.getcwd()
        os.chdir(os.getcwd() + '/model_checkpoints')
        files = [x for x in os.listdir() if x.endswith(".zip")]
        num = []
        for file in files:
            num.append([int(x) for x in file.split('_') if x.isdigit()][0])
        filename = "rl_model_" + str(max(num)) + "_steps.zip"
        print("Tentative: " + filename)
        self.model = PPO2.load(load_path=filename,
                               env=DummyVecEnv([lambda: self.env]),
                               tensorboard_log='./a2c_rasp_tensorboard/')
        print("Successfully loaded the previous model: " + filename)
        os.chdir(path)
    except Exception:
        # Vector-encode our new environment
        env = DummyVecEnv([lambda: self.env])
        # Create new model
        self.model = PPO2('MlpPolicy', env, verbose=1, tensorboard_log='./a2c_rasp_tensorboard/')
        print("Successfully created new model")

    # Stop training if reward gets close to zero
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-1e-2, verbose=1)
    eval_callback = EvalCallback(self.env, callback_on_new_best=callback_on_best, verbose=1)

    # Save model at regular time intervals
    checkpoint_callback = CheckpointCallback(save_freq=2000, save_path='./model_checkpoints/')

    # Chain callbacks together
    callback = CallbackList([eval_callback, checkpoint_callback])

    # Train model
    episode = 1
    while episode < 10:
        # Update location of red dot
        _ = self.env.square
        if self.env.trainable:
            print("Beginning episode number {}".format(episode))
            self.model.learn(total_timesteps=int(1e10), callback=callback, tb_log_name="run")
            episode += 1

    # Save trained model
    self.model.save("raspberry_agent")
def train_models(env, vecenv):
    seeds = [1, 2, 3]
    for seed in seeds:
        algos = [{'name': 'a2c', 'model': a2c(vecenv, seed)},
                 {'name': 'acktr', 'model': acktr(vecenv, seed)},
                 {'name': 'ddpg', 'model': ddpg(env, seed)},
                 {'name': 'ppo', 'model': ppo(vecenv, seed)},
                 {'name': 'sac', 'model': sac(env, seed)},
                 {'name': 'td3', 'model': td3(env, seed)},
                 {'name': 'trpo', 'model': trpo(env, seed)}]
        for a in algos:
            cb = EarlyStopCallback(reward_threshold=5000, verbose=1)
            early_stop = EvalCallback(env, callback_on_new_best=cb, verbose=1)
            a['model'].learn(total_timesteps=int(1e10), callback=early_stop)
            a['model'].save(f'data/models/{a["name"]}_{seed}')
            tf.reset_default_graph()
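# `EarlyStopCallback` is defined elsewhere in this project; judging by its
# arguments it plays the same role as the built-in
# `StopTrainingOnRewardThreshold`. A minimal stand-in could look like this
# (a sketch under that assumption, not the project's actual class):
from stable_baselines.common.callbacks import BaseCallback

class EarlyStopCallback(BaseCallback):
    def __init__(self, reward_threshold, verbose=0):
        super(EarlyStopCallback, self).__init__(verbose)
        self.reward_threshold = reward_threshold

    def _on_step(self):
        # When used as `callback_on_new_best`, the parent is the EvalCallback,
        # which exposes the best mean reward seen during evaluation.
        continue_training = self.parent.best_mean_reward < self.reward_threshold
        if self.verbose > 0 and not continue_training:
            print("Stopping training: mean reward above {}".format(self.reward_threshold))
        return continue_training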
def create_eval_callback(env_id, save_dir='./logs', eval_freq=1000, n_eval_episodes=10):
    """
    :param env_id: environment id
    :param save_dir: the directory to save the best model
    :param eval_freq: the frequency of the evaluation callback
    :param n_eval_episodes: the number of evaluation episodes per callback
    :return: EvalCallback for training
    """
    eval_env = gym.make(env_id)
    eval_callback = EvalCallback(eval_env,
                                 best_model_save_path=save_dir,
                                 log_path=save_dir,
                                 eval_freq=eval_freq,
                                 n_eval_episodes=n_eval_episodes,
                                 deterministic=False,
                                 render=False)
    return eval_callback
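# A minimal usage sketch for the helper above; PPO2 and `CartPole-v1` are
# illustrative choices, not part of the original snippet:
import gym
from stable_baselines import PPO2

model = PPO2('MlpPolicy', gym.make('CartPole-v1'))
callback = create_eval_callback('CartPole-v1', save_dir='./logs', eval_freq=500)
model.learn(total_timesteps=10000, callback=callback)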
def create_callbacks(self, eval_env: VecEnv) -> List[BaseCallback]:
    callbacks: List[BaseCallback] = list()
    if self.use_eval_callback:
        model_path: str = os.path.join('non_learning_io_logs', self.model_name, "")
        eval_callback = EvalCallback(eval_env,
                                     best_model_save_path=model_path,
                                     log_path=model_path,
                                     eval_freq=2 ** 13,
                                     verbose=0,
                                     n_eval_episodes=32,
                                     deterministic=True,
                                     render=False)
        callbacks.append(eval_callback)
    if self.verbose:
        callbacks.append(
            PbarCallback(
                tqdm(desc="Training Steps Progress", total=self.total_train_steps, file=sys.stdout),
                num_envs=self.n_envs
            )
        )
    return callbacks
def build_eval_callback(
    self,
    eval_freq=10000,
    reward_threshold=900,
    log_path=None,
    eval_episodes=10,
    eval_env=None,
):
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=reward_threshold, verbose=1)
    eval_callback = EvalCallback(
        eval_env=eval_env,
        best_model_save_path=log_path,
        log_path=log_path,
        eval_freq=eval_freq,
        deterministic=True,
        render=False,
        n_eval_episodes=eval_episodes,
        callback_on_new_best=callback_on_best,
        verbose=1,
    )
    self.logger.debug(
        "Eval callback called every {} timesteps: stop training when mean reward is above {} in {} episodes".format(
            eval_freq, reward_threshold, eval_episodes
        )
    )
    return eval_callback
def test_recurrent_eval_callback():
    env_id = 'Pendulum-v0'

    # Create envs
    env = make_vec_env(env_id, n_envs=4)
    eval_env = make_vec_env(env_id, n_envs=1)

    # Create RL model
    model = PPO2('MlpLstmPolicy', env)

    # Stop training if the performance is good enough
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-1200, verbose=1)

    eval_callback = EvalCallback(eval_env,
                                 callback_on_new_best=callback_on_best,
                                 best_model_save_path=LOG_FOLDER,
                                 log_path=LOG_FOLDER,
                                 eval_freq=100)

    model.learn(500, callback=eval_callback)

    # Cleanup
    if os.path.exists(LOG_FOLDER):
        shutil.rmtree(LOG_FOLDER)
def learn(env_name, seed, load_path, save_path, tensorboard_log, total_timesteps, n_cpu):
    save_path = env_name if save_path is None else save_path
    checkpoint_callback = CheckpointCallback(save_freq=2000, save_path=save_path)
    eval_env = make_env(env_name, n_cpu, seed)()
    eval_callback = EvalCallback(eval_env,
                                 best_model_save_path=save_path + '/best',
                                 log_path=tensorboard_log,
                                 eval_freq=1000)
    callback = CallbackList([checkpoint_callback, eval_callback])

    policy = CnnPolicy
    # policy = CnnLstmPolicy
    # policy = CnnLnLstmPolicy
    print(env_name, policy)

    # Run this to enable SubprocVecEnv on Mac OS X:
    # export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES
    # see https://github.com/rtomayko/shotgun/issues/69#issuecomment-338401331
    env = SubprocVecEnv([make_env(env_name, i, seed) for i in range(n_cpu)])

    if load_path is not None:
        model = PPO2.load(load_path, env, verbose=1, tensorboard_log=tensorboard_log)
    else:
        model = PPO2(policy, env, verbose=1, tensorboard_log=tensorboard_log)
    model.learn(total_timesteps=total_timesteps, log_interval=5, callback=callback)

    print('saving model:', save_path + '/latest_model')
    model.save(save_path + '/latest_model')
    env.close()
def train(agent=None):
    weights = {'fr': 0.3, 'fl': 20, 'fk': 20}
    depth, width, move_dist, plan_dist = 3, 3, 3, 3
    mkenv = lambda: Env(depth, width, move_dist, plan_dist,
                        max_steps=20, weights=weights, obstacle_pct=0.1)
    eval_callback = EvalCallback(mkenv(),
                                 best_model_save_path='logs/models',
                                 log_path='logs',
                                 eval_freq=1_000,
                                 deterministic=True,
                                 render=False)
    vecenv = make_vec_env(mkenv, 32, monitor_dir='logs/training')
    if agent:
        agent.set_env(vecenv)
    else:
        hparams = dict(n_steps=64, nminibatches=64, gamma=0.90, learning_rate=2e-5,
                       ent_coef=0.01, cliprange=0.4, noptepochs=25, lam=0.99)
        agent = PPO2('MlpPolicy', vecenv, verbose=True, **hparams)
    agent.learn(1_000_000, callback=eval_callback)
    agent.save('logs/models/final')
    vecenv.close()
    return agent
                    type=int)
parser.add_argument('--save-freq', help='Save the model every n steps (if negative, no checkpoint)',
                    default=-1, type=int)
args = parser.parse_args()

env_id = args.env
n_timesteps = args.n_timesteps
save_path = '{}_{}'.format(args.algo, env_id)

# Instantiate and wrap the environment
env = TimeFeatureWrapper(gym.make(env_id))

# Create the evaluation environment and callbacks
eval_env = DummyVecEnv([lambda: TimeFeatureWrapper(gym.make(env_id))])

callbacks = [EvalCallback(eval_env, best_model_save_path=save_path)]

# Save a checkpoint every n steps
if args.save_freq > 0:
    callbacks.append(CheckpointCallback(save_freq=args.save_freq,
                                        save_path=save_path,
                                        name_prefix='rl_model'))

algo = {
    'sac': SAC,
    'td3': TD3
}[args.algo]

n_actions = env.action_space.shape[0]

# Tuned hyperparameters from https://github.com/araffin/rl-baselines-zoo
hyperparams = {
def train_initial_policy(model_name, algo=ALGO, env_name=ENV_NAME, time_steps=TIME_STEPS):
    """Uses the specified algorithm on the target environment"""
    print("Using algorithm : ", algo.__name__)
    print("Model saved as : ", "data/models/" + algo.__name__ + "_initial_policy_" + env_name + "_.pkl")

    # define the environment here
    env = gym.make(env_name)
    env.seed(SEED)
    if NOISE_VALUE > 0:
        env = NoisyRealEnv(env, noise_value=NOISE_VALUE)

    if MUJOCO_NORMALIZE:
        env = MujocoNormalized(env)

    print('~~ ENV Obs RANGE : ', env.observation_space.low, env.observation_space.high)
    print('~~~ ENV Action RANGE : ', env.action_space.low, env.action_space.high)

    if algo.__name__ == "ACKTR":
        print('Using SubprocVecEnv')
        env = SubprocVecEnv([lambda: env for i in range(8)])
    elif algo.__name__ == "SAC":
        print('Using standard gym environment')
        env = env
    else:
        print('Using Dummy Vec Env')
        env = DummyVecEnv([lambda: env])

    if NORMALIZE:
        env = VecNormalize(env,
                           training=True,
                           norm_obs=True,
                           norm_reward=False,
                           clip_reward=1e6,
                           )

    with open('data/target_policy_params.yaml') as file:
        args = yaml.load(file, Loader=yaml.FullLoader)
    args = args[algo.__name__][PARAMS_ENV]
    print('~~ Loaded args file ~~')

    if algo.__name__ == "SAC":
        print('Initializing SAC with RLBaselinesZoo hyperparameters .. ')
        print('using 256 node architecture as in the paper')

        class CustomPolicy(ffp_sac):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy, self).__init__(*args, **kwargs,
                                                   feature_extraction="mlp",
                                                   layers=[256, 256])

        model = SAC(CustomPolicy, env,
                    verbose=1,
                    tensorboard_log='data/TBlogs/initial_policy_training',
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    ent_coef=args['ent_coef'],
                    learning_starts=args['learning_starts'],
                    learning_rate=args['learning_rate'],
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )
    elif algo.__name__ == "TD3":
        print('Initializing TD3 with RLBaselinesZoo hyperparameters .. ')
        # hyperparameters suggestions from :
        # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/td3/HopperBulletEnv-v0/config.yml
        n_actions = env.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=float(args['noise_std']) * np.ones(n_actions))

        class CustomPolicy2(ffp_td3):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy2, self).__init__(*args, **kwargs,
                                                    feature_extraction="mlp",
                                                    layers=[400, 300])

        model = TD3(CustomPolicy2, env,
                    verbose=1,
                    tensorboard_log='data/TBlogs/initial_policy_training',
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    gamma=args['gamma'],
                    gradient_steps=args['gradient_steps'],
                    learning_rate=args['learning_rate'],
                    learning_starts=args['learning_starts'],
                    action_noise=action_noise,
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )
    elif algo.__name__ == "TRPO":
        print('Initializing TRPO with RLBaselinesZoo hyperparameters .. ')
        # hyperparameters suggestions from :
        # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/sac/HopperBulletEnv-v0/config.yml
        model = TRPO(mlp_standard, env,
                     verbose=1,
                     tensorboard_log='data/TBlogs/initial_policy_training',
                     timesteps_per_batch=args['timesteps_per_batch'],
                     lam=args['lam'],
                     max_kl=args['max_kl'],
                     gamma=args['gamma'],
                     vf_iters=args['vf_iters'],
                     vf_stepsize=args['vf_stepsize'],
                     entcoeff=args['entcoeff'],
                     cg_damping=args['cg_damping'],
                     cg_iters=args['cg_iters'],
                     seed=SEED,
                     )
    elif algo.__name__ == "ACKTR":
        print('Initializing ACKTR')
        model = ACKTR(mlp_standard, env,
                      verbose=1,
                      n_steps=128,
                      ent_coef=0.01,
                      lr_schedule='constant',
                      learning_rate=0.0217,
                      max_grad_norm=0.5,
                      gamma=0.99,
                      vf_coef=0.946,
                      seed=SEED)
    elif algo.__name__ == "PPO2":
        print('Initializing PPO2')
        print('Num envs : ', env.num_envs)
        model = PPO2(mlp_standard, env,
                     n_steps=int(args['n_steps'] / env.num_envs),
                     nminibatches=args['nminibatches'],
                     lam=args['lam'],
                     gamma=args['gamma'],
                     ent_coef=args['ent_coef'],
                     noptepochs=args['noptepochs'],
                     learning_rate=args['learning_rate'],
                     cliprange=args['cliprange'],
                     verbose=1,
                     tensorboard_log='data/TBlogs/initial_policy_training',
                     seed=SEED,
                     )
    else:
        print('No algorithm matched. Using SAC .. ')
        # NOTE: `CustomPolicy` is only defined inside the SAC branch above,
        # so this fallback raises a NameError as written.
        model = SAC(CustomPolicy, env,
                    verbose=1,
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    ent_coef=args['ent_coef'],
                    learning_starts=args['learning_starts'],
                    learning_rate=args['learning_rate'],
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )

    # change model name if using normalization
    if NORMALIZE:
        model_name = model_name.replace('.pkl', 'normalized_.pkl')
    elif MUJOCO_NORMALIZE:
        model_name = model_name.replace('.pkl', 'mujoco_norm_.pkl')

    if SAVE_BEST_FOR_20:
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name,
                    log_interval=10,
                    callback=eval_callback)
        save_the_model()
        model_name = model_name.replace('best_', '')
        model.save(model_name)
    elif SAVE_INTERMEDIATE:
        check_callback = CheckpointCallback(save_freq=SAVE_FREQ,
                                            save_path=model_name[:-4],
                                            name_prefix=ENV_NAME + '_' + str(SEED),
                                            verbose=1,
                                            )
        eval_env = DummyVecEnv([lambda: gym.make(ENV_NAME)])
        eval_env.seed(SEED)
        eval_callback = EvalCallback(eval_env,
                                     n_eval_episodes=10,
                                     eval_freq=SAVE_FREQ,
                                     log_path=model_name[:-4],
                                     deterministic=False,
                                     render=False,
                                     verbose=1)
        callbacks = CallbackList([check_callback, eval_callback])
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name.split('/')[-1],
                    log_interval=10,
                    callback=callbacks)
        model.save(model_name)
        npzfile = np.load(model_name[:-4] + '/evaluations.npz')
        average_rewards = np.mean(npzfile['results'], axis=1)[:, 0]
        with open(model_name[:-4] + "/eval_results.txt", "a") as f:
            for i in range(np.shape(average_rewards)[0]):
                f.write("{}, {}\n".format(npzfile['timesteps'][i], average_rewards[i]))
        evaluate_policy_on_env(env, model, render=False, iters=50)
    else:
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name.split('/')[-1],
                    log_interval=10,
                    )
        model.save(model_name)
        evaluate_policy_on_env(env, model, render=False, iters=50)

    # save the environment params
    if NORMALIZE:
        # env.save(model_name.replace('.pkl', 'stats_.pkl'))
        env.save('data/models/env_stats/' + env_name + '.pkl')

    print('done :: ', model_name)
    exit()
# Separate evaluation env
if SAVE_BEST_FOR_20:
    eval_env = DummyVecEnv([lambda: gym.make(ENV_NAME)])
    if NORMALIZE:
        eval_env = VecNormalize(eval_env,
                                training=True,
                                norm_obs=True,
                                norm_reward=False,
                                clip_reward=1e6,
                                )
    eval_callback = EvalCallback(eval_env,
                                 best_model_save_path=model_name[:-4],
                                 n_eval_episodes=30,
                                 eval_freq=5000,
                                 deterministic=True,
                                 render=False,
                                 verbose=1)

def save_the_model():
    shutil.move(model_name[:-4] + '/best_model.zip', model_name)
    try:
        os.rmdir(model_name[:-4])
        print('Successfully saved the model.')
    except Exception as e:
        print(e)

def evaluate_policy_on_env(env, model,
                    action='store', default="gait2d_td3.h5f")
args = parser.parse_args()

# set to get observation in array
# def _new_step(self, action, project=True, obs_as_dict=False):
#     return super(Arm2DEnv, self).step(action, project=project, obs_as_dict=obs_as_dict)
# Arm2DEnv.step = _new_step

# Load walking environment
env = Gait2DGenAct(args.visualize, integrator_accuracy=3e-2)
eval_env = Gait2DGenAct(integrator_accuracy=3e-2)
# env = Arm2DVecEnv(visualize=True)

callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=1000, verbose=1)
eval_callback = EvalCallback(eval_env, callback_on_new_best=callback_on_best, verbose=1)

n_actions = env.action_space.shape[-1]
param_noise = None
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                            sigma=float(0.1) * np.ones(n_actions),
                                            theta=0.05)
param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.287)

class CustomTD3Policy(FeedForwardPolicy):
    def __init__(self, *args, **kwargs):
        super(CustomTD3Policy, self).__init__(*args,
    return _init

if __name__ == '__main__':
    num_cpu = 15  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(x, y, z, i) for i in range(num_cpu)])
    eval_env = environment(x, y, z, gamma, cutoffpenaltyscalar, rg_prob, turnspc, savepath)

    # Stable Baselines provides the make_vec_env() helper,
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

    # create callbacks to record data and initiate events during training
    callbacklist = CallbackList([TimeLimit(episodetimesteps),
                                 EvalCallback(eval_env,
                                              log_path=savepath,
                                              n_eval_episodes=5,
                                              deterministic=False,
                                              best_model_save_path=savepath)])

    # create model with the Stable Baselines package
    model = A2C(CnnPolicy, env, gamma=gamma, n_steps=updatesteps,
                learning_rate=LR, verbose=1)  # , tensorboard_log=scenario)
    # total timesteps set to a very large number so the program will
    # terminate based on the runtime parameter
    model.learn(total_timesteps=episodetimesteps**50, callback=callbacklist)

    # create learning curve plot
    evaluations = './%s/%s/evaluations.npz' % (storagefolder, scenario)
    data = np.load(evaluations)
    results = data['results']
    y = np.average(results, axis=1)
    timesteps = data['timesteps']
    plt.plot(timesteps, y)
def main():
    """ Prepare for trainings """
    log_dir, model_dir = prepare_dirs()
    model_name = model_dir + '/' + MODEL_NAME
    print(f'model will be saved as {model_name}')
    log_dir = log_dir + '/' + MODEL_NAME

    """ Generate & check environment """
    env_name = ENV_NAME
    env = gym.make(env_name)
    # print(f'Observation space: {env.observation_space}')
    # print(f'Action space: {env.action_space}')
    # env = Monitor(env, log_dir, allow_early_resets=True)
    # check_env(env)

    """ Save config as pickle file """
    config = summarize_config(env)
    save_config(log_dir, config)

    """ Vectorize environment """
    # num_envs = NUM_ENVS
    # env = DummyVecEnv([lambda: env for _ in range(num_envs)])  # For training
    # eval_env = DummyVecEnv([lambda: gym.make(env_name)])  # For evaluation
    eval_env = gym.make(env_name)

    """ Define checkpoint callback """
    checkpoint_callback = CheckpointCallback(save_freq=SAVE_FREQ,
                                             save_path=model_name,
                                             name_prefix=MODEL_NAME)

    """ Use deterministic actions for evaluation callback """
    eval_callback = EvalCallback(eval_env,
                                 best_model_save_path=model_name,
                                 log_path=log_dir,
                                 eval_freq=EVAL_FREQ,
                                 deterministic=True,
                                 render=False,
                                 n_eval_episodes=N_EVAL_EPISODES)

    print(f'Algorithm: {ALGORITHM}\n')

    if not CONTINUAL_LEARNING:
        """ Define model """
        model = define_model(env, log_dir)
    else:
        model = load_model(env, model_dir, log_dir)

    """ Evaluate model before training """
    # mean_reward, std_reward = evaluate_policy(model=model,
    #                                           env=eval_env,
    #                                           n_eval_episodes=N_EVAL_EPISODES)
    # print(f'Before training: mean reward: {mean_reward:.2f} +/- {std_reward:.2f}')

    """ Train model """
    model.learn(total_timesteps=MAX_STEPS, callback=[checkpoint_callback, eval_callback])

    """ Evaluate model after training """
    # mean_reward, std_reward = evaluate_policy(model=model,
    #                                           env=eval_env,
    #                                           n_eval_episodes=N_EVAL_EPISODES)
    # print(f'After training: mean reward: {mean_reward:.2f} +/- {std_reward:.2f}')

    """ Save trained model """
    model.save(model_name)

    """ Test trained model """
    obs = eval_env.reset()
    for i in range(N_EVAL_EPISODES):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = eval_env.step(action)
        eval_env.render()

    env.close()
    eval_env.close()
if __name__ == '__main__':
    num_cpu = ncpu  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(x, y, z, i) for i in range(num_cpu)])
    eval_env = evalenv(x, y, z, gamma, turnspc, policyname)
    env1 = environment(x, y, z, gamma, turnspc, penaltyscalar, policyname)  # env annealrate / numturns * eval_freq

    # Stable Baselines provides the make_vec_env() helper,
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

    # create callbacks to record data and initiate events during training
    callbacklist = CallbackList([TimeLimit(episodetimesteps),
                                 EvalCallback(eval_env, log_path=evpath,
                                              n_eval_episodes=100, eval_freq=50000,
                                              deterministic=False,
                                              best_model_save_path=evpath),
                                 EvalCallback(env1, log_path=savepath,
                                              n_eval_episodes=20, eval_freq=50000,
                                              deterministic=False,
                                              best_model_save_path=savepath)])

    if os.path.exists("%s/best_model.zip" % savepath):
        # Instantiate the agent
        model = A2C(policy, env, gamma=gamma, n_steps=episodetimesteps,
                    learning_rate=LR, verbose=1, n_cpu_tf_sess=num_cpu)
        # Load the trained agent
        model = A2C.load("%s/best_model" % savepath, env=env)
        print('loaded agent')
        save_evals()
        # total timesteps set to a very large number so the program will
        # terminate based on the runtime parameter
        model.learn(total_timesteps=episodetimesteps**50, callback=callbacklist)
    else:
        # create model with the Stable Baselines package
        model = A2C(policy, env, gamma=gamma, n_steps=episodetimesteps,
                    learning_rate=LR, verbose=1, n_cpu_tf_sess=num_cpu)  # , tensorboard_log=scenario)
    :param rank: (int) index of the subprocess
    """
    def _init():
        env = environment(x, y, z, gamma)
        env.seed(seed + rank)
        return env
    set_global_seeds(seed)
    return _init

# points_values = list([[0, LR1], [1000000, LR2]])
# Sched = PiecewiseSchedule(points_values, outside_value=LR2)

if __name__ == '__main__':
    num_cpu = 1  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(x, y, z, i) for i in range(num_cpu)])
    eval_env = environment(x, y, z, gamma)

    # Stable Baselines provides the make_vec_env() helper,
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

    scenario = str(f'RG_t{test}_lr{LR}_gamma{gamma}_batch{batch_size}')

    callbacklist = CallbackList([TimeLimit(episodetimesteps),
                                 EvalCallback(eval_env, log_path=scenario,
                                              deterministic=False)])

    model = A2C(CnnPolicy, env, gamma=gamma, verbose=1)  # , tensorboard_log=scenario)
    model.learn(total_timesteps=episodetimesteps**99, callback=callbacklist)
from stable_baselines import PPO2
from stable_baselines.common.policies import CnnPolicy
from stable_baselines.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
from BeautifulBlueSquare.BlueGymEnv import simpleAvoidance

# Separate evaluation env
eval_env = simpleAvoidance()

# Stop training when the model reaches the reward threshold, 800 * .9 = 720
callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=720, verbose=1)

# Create a callback that will evaluate the model, save the best one,
# and stop training once the reward has reached 720
eval_callback = EvalCallback(eval_env,
                             n_eval_episodes=20,
                             eval_freq=int(800 * 50),
                             callback_on_new_best=callback_on_best,
                             best_model_save_path="model",
                             log_path="model",
                             verbose=1)

# Almost infinite number of timesteps, but the training will stop
# early as soon as the reward threshold is reached
env = simpleAvoidance()
model = PPO2(CnnPolicy, env, gamma=.99, n_steps=256)
model.learn(total_timesteps=int(20e6), callback=eval_callback)
    'ledHSVHigher': np.array([31, 9, 255]),
    'rPiIP': '192.168.0.183',
    'rPiPort': 50000,
    'episodeLength': 100,
    'bullseye': 10
}

env = make_vec_env(RPiLEDEnv, n_envs=1, env_kwargs=envArgsDict)

callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-500, verbose=1)
eval_callback = EvalCallback(env,
                             best_model_save_path='./logs/best',
                             log_path='./logs/',
                             eval_freq=500,
                             deterministic=True,
                             render=False,
                             callback_on_new_best=callback_on_best)

# Added a checkpoint because I lost model data after a crash when the webcam
# shut down because the screen went to sleep :(
checkpoint_callback = CheckpointCallback(save_freq=1000,
                                         save_path='./logs/',
                                         name_prefix='ppo2_model')

cb = CallbackList([checkpoint_callback, eval_callback])

policy_kwargs = {'layers': [128, 128]}

model = PPO2(MlpPolicy, env,
import gym_env
from stable_baselines.common.callbacks import CallbackList, CheckpointCallback, EvalCallback

checkpoint_callback = CheckpointCallback(save_freq=10000, save_path='./tf_model_logs/')

# Separate evaluation env
eval_env = gym_env.PegInEnv(
    "PandaPegIn",
    has_offscreen_renderer=True,
    # has_renderer=True,
    use_camera_obs=False,
    control_freq=100,
)
eval_callback = EvalCallback(eval_env,
                             best_model_save_path='./tf_model_logs/best_model',
                             log_path='./tf_model_logs/best_model_results',
                             eval_freq=10000)

# Create the callback list
callback = CallbackList([checkpoint_callback, eval_callback])

env = gym_env.PegInEnv(
    "PandaPegIn",
    has_offscreen_renderer=True,
    # has_renderer=True,
    use_camera_obs=False,
    control_freq=100,
)

model = PPO1(MlpPolicy, env,
             timesteps_per_actorbatch=2048,
        # env.render()
        delta_returns.append(env.get_attr('final_reward')[0])
    print("naked:", naked_returns)
    print("covered:", covered_returns)
    print("rl:", rl_returns)
    print("delta:", delta_returns)
else:
    # load data
    df_train, df_test, df_rate = load_data(cfg)
    env = DummyVecEnv([lambda: HedgeEnv(df_train, df_rate, cfg)])
    T = env.get_attr('T')[0]

    # save_freq and eval_freq must be ints, so truncate the division
    checkpoint_callback = CheckpointCallback(save_freq=int(cfg.timestep / 10),
                                             save_path=cfg.model_dir)
    eval_callback = EvalCallback(env,
                                 best_model_save_path=cfg.model_dir,
                                 log_path=cfg.log_dir,
                                 eval_freq=int(cfg.timestep / 10),
                                 deterministic=True,
                                 render=False)
    model = DDPG(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=cfg.timestep, callback=[checkpoint_callback, eval_callback])
    cfg_log.dump(cfg.cfg_file)

    obs = env.reset()
    for i in range(T):
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        env.render()
log_dir = f"full/{algo}-{policy}-{tag}" logger.configure(folder=log_dir) env = gym.make("SlimeVolley-v0") env.atari_mode = True env.survival_bonus = True env.__init__() env.seed(seed) eval_env = gym.make("SlimeVolley-v0") eval_env.atari_mode = True eval_env.__init__() eval_env.seed(seed) eval_callback = EvalCallback(eval_env, best_model_save_path=log_dir, log_path=log_dir, eval_freq=eval_freq, n_eval_episodes=eval_episodes) print(f"Beginning training for {algo}-{policy}-{tag}.") params = { 'policy': policyFn, 'train_env': env, 'eval_env': eval_env, 'timesteps': timesteps, 'eval_callback': eval_callback, } opt_params = { 'a2c': { 'gamma': [0.900, 0.999], 'vf_coef': [0.10, 0.40],
logger.configure(folder=LOGDIR)

env = gym.make("SlimeVolley-v0")
env = Monitor(env, LOGDIR, allow_early_resets=True)
env.seed(n)

model = PPO1(BnnPolicy, env,
             timesteps_per_actorbatch=4096,
             clip_param=0.2,
             entcoeff=0.0,
             optim_epochs=10,
             optim_stepsize=3e-4,
             optim_batchsize=64,
             gamma=0.99,
             lam=0.95,
             schedule='linear',
             verbose=2)

eval_callback = EvalCallback(env,
                             best_model_save_path=LOGDIR,
                             log_path=LOGDIR,
                             eval_freq=EVAL_FREQ,
                             n_eval_episodes=EVAL_EPISODES)

model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

model.save(os.path.join(LOGDIR, "final_model"))

env.close()
num_cpu = ncpu  # Number of processes to use
# Create the vectorized environment
env = SubprocVecEnv([make_env(x, y, z, i) for i in range(num_cpu)])
eval_env = evalenv(x, y, z, turnspc, policyname)
env1 = environment(x, y, z, turnspc, scalar, policyname)

# Stable Baselines provides the make_vec_env() helper,
# which does exactly the previous steps for you:
# env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

# create callbacks to record data and initiate events during training
callbacklist = CallbackList([
    TimeLimit(episodetimesteps),
    EvalCallback(eval_env,
                 log_path=evpath,
                 n_eval_episodes=100,
                 eval_freq=50000,
                 deterministic=True,
                 best_model_save_path=evpath),
    EvalCallback(env1,
                 log_path=savepath,
                 n_eval_episodes=20,
                 eval_freq=10000,
                 deterministic=False,
                 best_model_save_path=savepath)
])

if os.path.exists("%s/final_model.zip" % savepath):
    # Instantiate the agent
    model = ACER(policy, env, gamma=gamma, n_steps=episodetimesteps,
print('done')

a_dim = env.action_space.shape[0]
# td3_noise = OrnsteinUhlenbeckActionNoise(np.zeros(a_dim), .9*np.ones(a_dim))
td3_noise = NormalActionNoise(0, SIGMA)
td3_env = DummyVecEnv([lambda: env])
# td3_env = env

checkpoint_on_event = CheckpointCallback(save_freq=1000,
                                         save_path="./logs/model_checkpoints",
                                         name_prefix='rl_model')
event_callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event)
eval_callback = EvalCallback(td3_env,
                             best_model_save_path='./logs/',
                             log_path='./logs/',
                             eval_freq=100,
                             deterministic=True,
                             render=False)
# td3_model.learning_starts = 100
custom_callback = customCallback(verbose=0)
callback = CallbackList([custom_callback, checkpoint_on_event])

td3_model = TD3(Td3MlpPolicy, td3_env,
                gamma=GAMMA,
                learning_rate=LEARNING_RATE,
                buffer_size=BUFFER_SIZE,
                learning_starts=LEARNING_STARTS,
                train_freq=TRAIN_FREQ,