Example No. 1
import gym
gym.logger.set_level(40)
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

from env import GoLeftEnv
from stable_baselines import DQN, PPO2, A2C, ACKTR
from stable_baselines.common.cmd_util import make_vec_env
from stable_baselines.common.evaluation import evaluate_policy
from stable_baselines.common.vec_env import VecVideoRecorder, DummyVecEnv
from stable_baselines.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

env = GoLeftEnv(grid_size=10)
env = make_vec_env(lambda: env, n_envs=1)

callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=0.9,
                                                 verbose=1)
eval_callback = EvalCallback(env,
                             callback_on_new_best=callback_on_best,
                             verbose=1)

model = ACKTR('MlpPolicy', env, verbose=1)
model.learn(int(1e10), callback=eval_callback)

mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

model.save('models/best')

env.close()
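A quick way to reuse the result of this example is to reload the saved agent and roll it out for one episode. The following is a minimal sketch, assuming the same GoLeftEnv from the example's env module and the 'models/best' file saved above:

from env import GoLeftEnv
from stable_baselines import ACKTR

env = GoLeftEnv(grid_size=10)
model = ACKTR.load('models/best')  # agent saved by the script above

obs = env.reset()
done = False
while not done:
    # deterministic=True gives the greedy policy for evaluation
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
env.close()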
Example No. 2
    env = create_env(n_envs)
    # Create test env if needed, do not normalize reward
    eval_env = None
    if args.eval_freq > 0 and not args.optimize_hyperparameters:
        # Account for the number of parallel environments
        args.eval_freq = max(args.eval_freq // n_envs, 1)

        if args.verbose > 0:
            print("Creating test environment")

        save_vec_normalize = SaveVecNormalizeCallback(save_freq=1,
                                                      save_path=params_path)
        eval_callback = EvalCallback(create_env(1, eval_env=True),
                                     callback_on_new_best=save_vec_normalize,
                                     best_model_save_path=save_path,
                                     n_eval_episodes=args.eval_episodes,
                                     log_path=save_path,
                                     eval_freq=args.eval_freq)
        callbacks.append(eval_callback)

    # TODO: check for hyperparameter optimization
    # TODO: check what happens with the eval env when using frame stack
    if 'frame_stack' in hyperparams:
        del hyperparams['frame_stack']

    # Stop env processes to free memory
    if args.optimize_hyperparameters and n_envs > 1:
        env.close()

    # Parse noise string for DDPG and SAC
    if algo_ in ['ddpg', 'sac', 'td3'
Example No. 3
        help='Save the model every n steps (if negative, no checkpoint)',
        default=-1,
        type=int)
    args = parser.parse_args()

    env_id = args.env
    n_timesteps = args.n_timesteps
    save_path = '{}_{}'.format(args.algo, env_id)

    # Instantiate and wrap the environment
    env = TimeFeatureWrapper(gym.make(env_id))

    # Create the evaluation environment and callbacks
    eval_env = DummyVecEnv([lambda: TimeFeatureWrapper(gym.make(env_id))])

    callbacks = [EvalCallback(eval_env, best_model_save_path=save_path)]

    # Save a checkpoint every n steps
    if args.save_freq > 0:
        callbacks.append(
            CheckpointCallback(save_freq=args.save_freq,
                               save_path=save_path,
                               name_prefix='rl_model'))

    algo = {'sac': SAC, 'td3': TD3}[args.algo]

    n_actions = env.action_space.shape[0]

    # Tuned hyperparameters from https://github.com/araffin/rl-baselines-zoo
    hyperparams = {
        'sac':
Example No. 4
from stable_baselines.common.vec_env import VecNormalize
import numpy as np

# multiprocess environment
env = make_vec_env(LearningRocket, n_envs=16)
eval_env = make_vec_env(lambda: LearningRocket(visualize=True), n_envs=1)
#env = VecNormalize(env)
#eval_env = VecNormalize(eval_env)

env = VecNormalize.load("doof_env", env)
eval_env = VecNormalize.load("doof_env", eval_env)

eval_callback = EvalCallback(eval_env,
                             best_model_save_path='Agent007',
                             log_path='./logs/',
                             eval_freq=10000,
                             deterministic=True,
                             render=False,
                             n_eval_episodes=1)

#model = PPO2(MlpPolicy, env, n_steps=1000, nminibatches=32, lam=0.98, gamma=0.999, learning_rate=1e-4,
#                                  noptepochs=4,ent_coef=0.01,verbose=1, tensorboard_log="./rocket_tensorboard/",
#                                  policy_kwargs = dict(layers=[400, 300]))
"""model = PPO2(MlpPolicy, env,verbose=1, tensorboard_log="./rocket_tensorboard/",
                                  policy_kwargs = dict(layers=[400, 300]))"""

#model = PPO2.load("doof", env=env, tensorboard_log="./rocket_tensorboard/")
#model.learning_rate = 2.5e-4
#model.n_steps = 100
#while True:
#    model.learn(total_timesteps=5000000,callback=eval_callback   )
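The snippet above loads normalization statistics from "doof_env" but does not show how they were produced. Below is a hedged sketch of the earlier training run that could have written them, assuming stable-baselines >= 2.10 (where VecNormalize exposes save/load) and the same LearningRocket environment:

from stable_baselines import PPO2
from stable_baselines.common import make_vec_env
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import VecNormalize

# LearningRocket is the project's custom gym environment used above.
env = make_vec_env(LearningRocket, n_envs=16)
env = VecNormalize(env)  # collect running mean/std of observations and returns

model = PPO2(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=1000000)

model.save("doof")    # weights later restored with PPO2.load("doof", ...)
env.save("doof_env")  # statistics later restored with VecNormalize.load("doof_env", ...)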
Example No. 5
    # load environment with config variables
    env_obj = getattr(rl.environments, args.environment)
    env = env_obj(config)

    # multiprocess environment
    env_8 = create_env(args.environment, config=config, n_workers=n_workers)

    # callback for evaluation
    callback_on_best = StopTrainingOnRewardThreshold(
        reward_threshold=max_reward, verbose=1)
    eval_callback = EvalCallback(env,
                                 callback_on_new_best=callback_on_best,
                                 best_model_save_path=specified_path,
                                 log_path=specified_path,
                                 eval_freq=10000,
                                 n_eval_episodes=5,
                                 verbose=1,
                                 deterministic=True,
                                 render=False)

    # train model
    try:
        try:
            model_path = join(specified_path, 'best_model.zip')
            model = PPO2.load(model_path,
                              env=env_8,
                              tensorboard_log=specified_path)
            # model = PPO2('MlpPolicy', env=env_8, tensorboard_log=specified_path, **model_config).load(args.modelpath, env=env_8)
            print("model loaded")
    #Create the evaluation environment
    if eval_environment == True:
        eps_schedule_eval = [eps_schedule[-1]]
        eval_env = DummyVecEnv([make_env(env_id=env_name, rank=0, seed=100, \
            impulsive=impulsive, action_coord=action_coord, obs_type=obs_type, \
            random_obs=random_obs, stochastic=stochastic, mission_type=mission_type, \
            NSTEPS=NSTEPS, NITER=niter_per_cpu/nminibatches, \
            eps_schedule=eps_schedule_eval, lambda_con=lambda_con, \
            Tmax=Tmax, ueq=ueq, tf=tf, amu=1., m0=m0, \
            r0=r0, v0=v0, \
            rTf=rTf, vTf=vTf, \
            sigma_r=sigma_r, sigma_v=sigma_v, \
            sigma_u_rot=sigma_u_rot, sigma_u_norm=sigma_u_norm, \
            MTE=MTE, pr_MTE=pr_MTE)])
        eval_callback = EvalCallback(eval_env, n_eval_episodes=100, \
                                best_model_save_path=out_folder, \
                                log_path=out_folder, eval_freq=40000, \
                                deterministic=True)

    # Create the model
    if algorithm == "PPO":
        if load_model == False:
            model = PPO2(policy=policy,
                         env=env,
                         n_steps=n_steps,
                         nminibatches=nminibatches,
                         gamma=gamma,
                         ent_coef=ent_coef,
                         cliprange_vf=-1,
                         lam=lam,
                         noptepochs=noptepochs,
                         learning_rate=learning_rate,
Example No. 7
def DRL() -> None:
    ### PREPARATION
    # callback for validation
    eval_callback = EvalCallback(val_env,
                                 best_model_save_path=config.val_path,
                                 log_path=config.val_path,
                                 eval_freq=config.val_freq,
                                 deterministic=config.deterministic,
                                 n_eval_episodes=config.val_eps)

    ### SETUP AND TRAIN
    # Setup model
    if config.MODEL_NAME == "A2C":
        model = A2C(config.POLICY,
                    train_env,
                    verbose=1,
                    tensorboard_log=config.tb_path,
                    seed=config.seed)
    elif config.MODEL_NAME == "PPO":
        model = PPO2(config.POLICY,
                     train_env,
                     verbose=1,
                     tensorboard_log=config.tb_path,
                     nminibatches=1,
                     seed=config.seed)
    elif config.MODEL_NAME == "DDPG":
        # the noise objects for DDPG
        n_actions = train_env.action_space.shape[-1]
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=float(0.5) *
                                                    np.ones(n_actions))
        model = DDPG(config.POLICY,
                     train_env,
                     param_noise=param_noise,
                     action_noise=action_noise,
                     verbose=1,
                     tensorboard_log=config.tb_path,
                     seed=config.seed)
        print("DDPG does not provice training output...")

    ###
    # Train Model
    model = model.learn(total_timesteps=config.learn_steps,
                        callback=eval_callback)

    # Load best model after training
    if config.MODEL_NAME == "A2C":
        model = A2C.load(load_path=config.val_path.joinpath("best_model.zip"))
    elif config.MODEL_NAME == "PPO":
        model = PPO2.load(load_path=config.val_path.joinpath("best_model.zip"))
    elif config.MODEL_NAME == "DDPG":
        model = DDPG.load(load_path=config.val_path.joinpath("best_model.zip"))

    ### EVAL MODEL
    # Make prediction in test_env
    test_mean, test_std = evaluate_policy(model=model,
                                          env=test_env,
                                          deterministic=config.deterministic,
                                          n_eval_episodes=config.test_eps,
                                          return_episode_rewards=False)

    print(f"Test Mean:{test_mean}\n"+ \
          f"Test Std:{test_std}")

if __name__ == '__main__':

    num_cpu = 12  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(inputfile, i) for i in range(num_cpu)])
    eval_env = environment(inputfile, gamma)
    # Stable Baselines provides you with make_vec_env() helper
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)
    scenario = str(
        f'{inputfile_s}_t{test}_lr{LR_s}_gamma{gamma_s}_batch{batch_size}')
    callbacklist = CallbackList([
        TimeLimit(episodetimesteps),
        EvalCallback(eval_env, log_path=scenario, n_eval_episodes=5)
    ])

    model = PPO2(MlpPolicy,
                 env,
                 gamma=gamma,
                 n_steps=batch_size,
                 learning_rate=LR,
                 verbose=1)  #, tensorboard_log=scenario)
    model.learn(total_timesteps=episodetimesteps**99, callback=callbacklist)

    filename = './%s/evaluations.npz' % scenario

    data = np.load(filename)
    results = data['results']
    y = np.average(results, axis=1)
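EvalCallback also records the evaluation timesteps in evaluations.npz, so a natural continuation (a sketch, not part of the original snippet) is to plot the learning curve from the arrays loaded above:

import matplotlib.pyplot as plt

timesteps = data['timesteps']  # evaluation points recorded by EvalCallback
plt.plot(timesteps, y)         # mean evaluation reward at each evaluation point
plt.xlabel('Timesteps')
plt.ylabel('Mean evaluation reward')
plt.show()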
Example No. 9
    def __init__(self, algorithm="SAC", load=True, agent_name="Agent001"):
        self.agent_name = agent_name

        #self.env = LearningRocket(visualize=False)
        #self.env = NormalizeActionWrapper(self.env)

        #self.eval_env = LearningRocket(visualize=True)
        #self.eval_env = NormalizeActionWrapper(self.eval_env)

        #self.env = SubprocVecEnv([lambda: LearningRocket(visualize=False) for i in range(4)])
        self.env = make_vec_env(
            LearningRocket, n_envs=16
        )  #[lambda: LearningRocket(visualize=False) for i in range(16)]))
        #self.eval_env = VecNormalize(DummyVecEnv([lambda: LearningRocket(visualize=True) for i in range(1)]))
        self.eval_env = make_vec_env(lambda: LearningRocket(visualize=True),
                                     n_envs=1)
        #self.eval_env = VecNormalize(self.eval_env)
        self.eval_callback = EvalCallback(self.eval_env,
                                          best_model_save_path='Agent007',
                                          log_path='./logs/',
                                          eval_freq=10000,
                                          deterministic=True,
                                          render=False,
                                          n_eval_episodes=1)
        kai_policy = dict(act_fun=tf.nn.tanh, net_arch=[400, 300])
        #check_env(self.env, warn=True)
        """
        if algorithm == "SAC":
            if load is True:
                self.model = SAC.load(agent_name, env=self.env, tensorboard_log="./rocket_tensorboard/")
                #self.model.ent_coef=0.2
            else:
                self.model = SAC('MlpPolicy', self.env, verbose=1, tensorboard_log="./rocket_tensorboard/",ent_coef=5)
            print("Trainer Set for SAC")
        """
        if algorithm == "TD3":
            n_actions = self.env.action_space.shape[-1]
            action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                             sigma=0.1 * np.ones(n_actions))
            if load is True:
                self.model = TD3.load(agent_name,
                                      env=self.env,
                                      tensorboard_log="./rocket_tensorboard/")
                #file = open('replay_buffer', 'rb')
                #self.model.replay_buffer = pickle.load(file)
                #file.close()
            else:
                self.model = TD3(MlpPolicy,
                                 self.env,
                                 action_noise=action_noise,
                                 batch_size=768,
                                 gamma=0.95,
                                 learning_rate=1e-4,
                                 learning_starts=20000,
                                 verbose=1,
                                 tensorboard_log="./rocket_tensorboard/",
                                 policy_kwargs=dict(layers=[400, 300]))
            print("Trainer Set for TD3")
        elif algorithm == "PPO2":
            if load is True:
                self.model = PPO2.load(agent_name,
                                       env=self.env,
                                       tensorboard_log="./rocket_tensorboard/")
                self.eval_env = VecNormalize.load(self.agent_name + "vEnv",
                                                  self.eval_env)
                #self.eval_env.clip_obs = 500
                #self.env = VecNormalize(self.env)
                self.env = VecNormalize.load(self.agent_name + "vEnv",
                                             self.env)
                #self.env.clip_obs = 500
                #self.env.norm_obs = False
                #self.eval_env.norm_obs = False
            else:
                self.model = PPO2(PPOMlpPolicy,
                                  self.env,
                                  n_steps=1024,
                                  nminibatches=32,
                                  lam=0.98,
                                  gamma=0.999,
                                  noptepochs=4,
                                  ent_coef=0.01,
                                  verbose=1,
                                  tensorboard_log="./rocket_tensorboard/",
                                  policy_kwargs=dict(layers=[400, 300]))
                self.eval_env = VecNormalize(self.eval_env)
                self.env = VecNormalize(self.env)
                #self.eval_env.clip_obs = 500
                #self.env.clip_obs = 500
                #self.env.norm_obs=False
                #self.eval_env.norm_obs=False

                print("Trainer set for PPO2. I am speed.")
Example No. 10
def RocketTrainer():
    env = SubprocVecEnv([
        make_env(LearningRocket,
                 r'E:\Tobi\LearningRocket\TestHover\LearningRocketHover.py', i)
        for i in range(72)
    ])

    eval_env = make_vec_env(lambda: LearningRocket(visualize=True), n_envs=1)

    eval_callback = EvalCallback(eval_env,
                                 best_model_save_path='Agent007',
                                 log_path='./logs/',
                                 eval_freq=10000,
                                 deterministic=True,
                                 render=False,
                                 n_eval_episodes=1)

    model = PPO2(MlpPolicy,
                 env,
                 n_steps=1500,
                 nminibatches=144,
                 lam=0.98,
                 gamma=0.999,
                 learning_rate=5e-4,
                 cliprange=0.3,
                 noptepochs=4,
                 ent_coef=0.01,
                 verbose=1,
                 tensorboard_log="./rocket_tensorboard/",
                 policy_kwargs=dict(layers=[400, 300]))

    start = t.time()

    #model = PPO2.load("TestHover", env=env, tensorboard_log="./rocket_tensorboard/")
    model.learn(total_timesteps=10000000, callback=eval_callback)
    model.save("TestHover")
    del model  # remove to demonstrate saving and loading

    duration = t.time() - start

    model = PPO2.load("TestHover", env=eval_env)

    # Enjoy trained agent
    obs = eval_env.reset()
    data = []
    time = []
    actions = []
    alt_reward = []
    mix_reward = []
    temp_reward = []
    valveChange = []
    speedPunishes = []
    total_reward = []
    alt_cumu = []
    mix_cumu = []
    temp_cumu = []
    total_cumu = []
    start = True
    modifiers = [1000, 1000, 200, 1, 200, 2000, 10, 1000, 1500, 1]

    for i in range(10):
        data.append([])
    for i in range(3):
        actions.append([])
    lastValves = [0.15, 0.2, 0.15]

    for i in range(600):
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = eval_env.step(action)
        #Or_obs = eval_env.get_original_obs()

        time.append(i)
        for j in range(10):
            data[j].append(obs[0][j] * modifiers[j])
        data[2][i] -= 100
        for j in range(3):
            actions[j].append(action[0][j])
        offset = abs(data[0][i] - data[1][i])
        #if offset < 10:
        #    alt_reward.append(1-offset/10)
        #else:
        alt_reward.append((offset / 2) / 1000)

        mixError = abs(data[6][i] - 5.5)
        mix_reward.append((mixError / 0.2) / 1000)
        #if mixError > 0.3:
        #    mix_reward[i] += 1

        tempError = abs(data[5][i] - 900)
        temp_reward.append((tempError / 30) / 1000)
        #if tempError > 50:
        #    temp_reward[i] += 1

        total_reward.append(alt_reward[i] + mix_reward[i] + temp_reward[i])

        if start is True:
            alt_cumu.append(alt_reward[i])
            mix_cumu.append(mix_reward[i])
            temp_cumu.append(temp_reward[i])
            total_cumu.append(total_reward[i])
            start = False
        else:
            alt_cumu.append(alt_reward[i] + alt_cumu[i - 1])
            mix_cumu.append(mix_reward[i] + mix_cumu[i - 1])
            temp_cumu.append(temp_reward[i] + temp_cumu[i - 1])
            total_cumu.append(total_reward[i] + total_cumu[i - 1])

    plt.figure(figsize=(11, 8))
    plt.subplot(4, 2, 1)
    plt.xlabel('Time(s)')
    plt.ylabel('Offset (m)')
    plt.plot(time, data[0], label='Z Position')
    plt.plot(time, data[1], label='Z Speed')

    plt.subplot(4, 2, 2)
    plt.xlabel('Time(s)')
    plt.ylabel('Actions')

    plt.plot(time, actions[0], 'b', label='LOX Command')
    plt.plot(time, actions[1], 'r', label='LH2 Command')
    plt.plot(time, actions[2], 'y', label='Mix Command')
    plt.legend(loc='best')

    plt.subplot(4, 2, 3)
    plt.xlabel('Time(s)')
    plt.ylabel('Engine State')
    plt.plot(time, data[5], label='Temp')
    plt.legend(loc='best')

    plt.subplot(4, 2, 5)
    plt.xlabel('Time(s)')
    plt.ylabel('Engine State')
    plt.plot(time, data[4], label='Pressure')
    plt.legend(loc='best')

    plt.subplot(4, 2, 4)
    plt.xlabel('Time(s)')
    plt.ylabel('Mixture')
    plt.plot(time, data[6], label='Mixture')
    plt.legend(loc='best')

    plt.subplot(4, 2, 6)
    plt.xlabel('Time(s)')
    plt.ylabel('Reward values. Valve Error REAL valves')
    plt.plot(time, alt_reward, label='Altitude Error')
    plt.plot(time, mix_reward, label='Mixture Error')
    plt.plot(time, temp_reward, label='Temperature Error')
    plt.plot(time, total_reward, label='Total Reward')

    plt.subplot(4, 2, 8)
    plt.xlabel('Time(s)')
    plt.ylabel('Reward values cumulative')
    plt.plot(time, alt_cumu, label='Altitude Error')
    plt.plot(time, mix_cumu, label='Mixture Error')
    plt.plot(time, temp_cumu, label='Temperature Error')
    plt.plot(time, total_cumu, label='Total Reward')

    plt.subplot(4, 2, 7)
    plt.xlabel('Time(s)')
    plt.ylabel('Thrust in kN')
    plt.plot(time, data[7])

    plt.legend(loc='best')

    print(duration)
    plt.show()
Example No. 11
                # env.render()
            delta_returns.append(env.get_attr('final_reward')[0])
        print("naked:", naked_returns)
        print("covered:", covered_returns)
        print("rl:", rl_returns)
        print("delta:", delta_returns)

    else:
        # load data
        df_train, df_test, df_rate = load_data(cfg)
        env = DummyVecEnv([lambda: HedgeEnv(df_train, df_rate, cfg)])
        T = env.get_attr('T')[0]
        checkpoint_callback = CheckpointCallback(save_freq=cfg.timestep / 10,
                                                 save_path=cfg.model_dir)
        eval_callback = EvalCallback(env,
                                     best_model_save_path=cfg.model_dir,
                                     log_path=cfg.log_dir,
                                     eval_freq=cfg.timestep / 10,
                                     deterministic=True,
                                     render=False)
        model = TD3(MlpPolicy, env, verbose=1)
        model.learn(total_timesteps=cfg.timestep,
                    callback=[checkpoint_callback, eval_callback])
        cfg_log.dump(cfg.cfg_file)

        obs = env.reset()
        for i in range(T):
            action, _states = model.predict(obs)
            obs, rewards, done, info = env.step(action)
            env.render()
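Because the EvalCallback above saves its best checkpoint under cfg.model_dir, a hedged variant of the final rollout would reload that checkpoint instead of using the last in-memory weights (a sketch; the file name is the default best_model.zip written by EvalCallback):

import os

best_model = TD3.load(os.path.join(cfg.model_dir, 'best_model'), env=env)
obs = env.reset()
for i in range(T):
    action, _states = best_model.predict(obs, deterministic=True)
    obs, rewards, done, info = env.step(action)
    env.render()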
Example No. 12
    n_workers = config['main']['n_workers']
    n_checkpoints = n_steps // save_every

    #load environment with config variables
    env_obj = getattr(rl.environments, args.environment)
    env = env_obj(config)

    # multiprocess environment
    env_8 = make_vec_env(lambda: env, n_envs=n_workers)

    # callback for evaluation
    eval_callback = EvalCallback(env,
                                 best_model_save_path=join(
                                     specified_path,
                                     'Bestmodel_{}'.format(args.name)),
                                 log_path=specified_path,
                                 eval_freq=10000,
                                 n_eval_episodes=10,
                                 verbose=1,
                                 deterministic=False,
                                 render=False)

    #train model
    try:
        try:
            model_path = join(specified_path, 'best_model_x.zip')
            model = PPO2.load(model_path,
                              env=env_8,
                              tensorboard_log=specified_path)
            #model = PPO2('MlpPolicy', env=env_8, tensorboard_log=specified_path, **model_config).load(args.modelpath, env=env_8)
            print("model loaded")
    log_dir = f"{algo}-{policy}-{tag}"
    logger.configure(folder=log_dir)

    env = gym.make("SlimeVolley-v0")
    env.atari_mode = True
    env.survival_bonus = True
    env.__init__()
    env.seed(seed)

    eval_env = gym.make("SlimeVolley-v0")
    eval_env.atari_mode = True
    eval_env.__init__()
    eval_env.seed(seed)
    eval_callback = EvalCallback(eval_env,
                                 best_model_save_path=log_dir,
                                 log_path=log_dir,
                                 eval_freq=eval_freq,
                                 n_eval_episodes=eval_episodes)

    print(f"Beginning training for {algo}-{policy}-{tag}.")
    params = {
        'policy': policyFn,
        'train_env': env,
        'eval_env': eval_env,
        'timesteps': timesteps,
        'eval_callback': eval_callback,
    }
    opt_params = {
        'a2c': {
            'gamma': [0.900, 0.999],
            'vf_coef': [0.10, 0.40],
Example No. 14
def run(run_name, existing_model):

    # Create log dir
    log_dir = "./monitor_logs/"
    os.makedirs(log_dir, exist_ok=True)

    print("Setting up environment...")
    env = gym_super_mario_bros.make('SuperMarioBrosRandomStages-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    env = EpisodicLifeEnv(env)

    # Preprocessing
    env = WarpFrame(env)
    env = FrameStack(env, n_frames=hp.FRAME_STACK)

    # Evaluate every kth frame and repeat action
    env = MaxAndSkipEnv(env, skip=hp.FRAME_SKIP)

    # Logs will be saved in log_dir/monitor.csv
    env = Monitor(env, log_dir)

    # Save a checkpoint every 1000 steps
    checkpoint_callback = CheckpointCallback(save_freq=25000,
                                             save_path='./models/',
                                             name_prefix=run_name)

    eval_callback = EvalCallback(env,
                                 best_model_save_path='./models/',
                                 log_path='./models/',
                                 eval_freq=250000,
                                 deterministic=True,
                                 render=False)

    print("Compiling model...")

    if existing_model:
        try:
            model = DQN.load(existing_model,
                             env,
                             tensorboard_log="./mario_tensorboard/")
        except:
            print(f"{existing_model} does not exist!")
            exit(0)
    else:
        model = DQN(
            LnCnnPolicy,
            env,
            batch_size=hp.BATCH_SIZE,  # Optimizable (higher batch sizes ok according to https://arxiv.org/pdf/1803.02811.pdf)
            verbose=1,
            learning_starts=10000,
            learning_rate=hp.LEARNING_RATE,
            exploration_fraction=hp.EXPLORATION_FRACT,
            exploration_initial_eps=1.0,
            exploration_final_eps=0.1,
            prioritized_replay=True,
            prioritized_replay_alpha=hp.P_REPLAY_ALPHA,
            train_freq=hp.TRAINING_FREQ,
            target_network_update_freq=hp.TARGET_UPDATE_FREQ,
            tensorboard_log="./mario_tensorboard/")

    print("Training starting...")
    with ProgressBarManager(hp.TIME_STEPS) as progress_callback:
        model.learn(
            total_timesteps=hp.TIME_STEPS,
            log_interval=1,
            callback=[progress_callback, checkpoint_callback, eval_callback],
            tb_log_name=run_name)

    print("Done! Saving model...")
    model.save("models/{}_final".format(run_name))
Example No. 15
    logger.configure(folder=saver.data_dir)
    env_id = 'gym_docking:docking-v2'
    num_cpu = 10  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

    # env = DummyVecEnv([make_env(env_id, i) for i in range(num_cpu)])
    # [lambda: gym.make("gym_docking:docking-v0")])

    # Stable Baselines provides you with make_vec_env() helper
    # which does exactly the previous steps for you:
    # env = make_vec_env(env, n_envs=num_cpu, seed=0)
    eval_env = gym.make('gym_docking:docking-v2')
    eval_callback = EvalCallback(
        eval_env,
        best_model_save_path='./logs/best_shaping_moving_b_10M_model',
        log_path='./logs/best_shaping_moving_b_10M_results',
        eval_freq=600)

    checkpoint_callback = CheckpointCallback(
        save_freq=int(5e4),
        save_path='./logs/',
        name_prefix='rl_model_621_shaping_moving_b_10M')

    # Create the callback list
    callback = CallbackList([checkpoint_callback, eval_callback])

    lr_sch = LinearSchedule(int(10e6), 1.0e-5, 2.5e-4)

    model = PPO2(
        policy=MlpPolicy,
from env_suite.envs import controlTableLine
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common import make_vec_env
from stable_baselines.common.callbacks import EvalCallback
from stable_baselines import PPO2
import datetime, os

logdir = os.path.join(os.getcwd(), "logs",
                      datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

eval_env = controlTableLine()
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path='/gdrive/My Drive/Code/RL/controlTableLine/best_model/',
    eval_freq=100000,
    deterministic=True,
    render=False)

env = make_vec_env(controlTableLine, n_envs=10)

modelname = 'PPO2_controlTableLine'

model = PPO2('MlpPolicy', env, verbose=1, tensorboard_log=logdir)
model.learn(total_timesteps=5000000, callback=eval_callback)
model.save("../custom_models/" + modelname)
# Separate evaluation env
if SAVE_BEST_FOR_20:
    eval_env = DummyVecEnv([lambda: gym.make(ENV_NAME)])
    if NORMALIZE:
        eval_env = VecNormalize(
            eval_env,
            training=True,
            norm_obs=True,
            norm_reward=False,
            clip_reward=1e6,
        )

    eval_callback = EvalCallback(eval_env,
                                 best_model_save_path=model_name[:-4],
                                 n_eval_episodes=30,
                                 eval_freq=5000,
                                 deterministic=True,
                                 render=False,
                                 verbose=1)

    def save_the_model():
        shutil.move(model_name[:-4] + '/best_model.zip', model_name)
        try:
            os.rmdir(model_name[:-4])
            print('Successfully saved the model.')
        except Exception as e:
            print(e)


def evaluate_policy_on_env(env,
                           model,
Example No. 18
def main(args):

    rank = MPI.COMM_WORLD.Get_rank()

    model_dir = os.path.join(config.MODELDIR, args.env_name)

    if rank == 0:
        try:
            os.makedirs(model_dir)
        except:
            pass
        if args.reset:
            reset_files(model_dir)
        logger.configure(config.LOGDIR)
    else:
        logger.configure(format_strs=[])

    if args.debug:
        logger.set_level(config.DEBUG)
    else:
        time.sleep(5)
        logger.set_level(config.INFO)

    workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)

    logger.info('\nSetting up the selfplay training environment opponents...')
    base_env = get_environment(args.env_name)
    env = selfplay_wrapper(base_env)(opponent_type=args.opponent_type,
                                     verbose=args.verbose)
    env.seed(workerseed)

    CustomPolicy = get_network_arch(args.env_name)

    params = {
        'gamma': args.gamma,
        'timesteps_per_actorbatch': args.timesteps_per_actorbatch,
        'clip_param': args.clip_param,
        'entcoeff': args.entcoeff,
        'optim_epochs': args.optim_epochs,
        'optim_stepsize': args.optim_stepsize,
        'optim_batchsize': args.optim_batchsize,
        'lam': args.lam,
        'adam_epsilon': args.adam_epsilon,
        'schedule': 'linear',
        'verbose': 1,
        'tensorboard_log': config.LOGDIR
    }

    time.sleep(5)  # allow time for the base model to be saved out when the environment is created

    if args.reset or not os.path.exists(
            os.path.join(model_dir, 'best_model.zip')):
        logger.info('\nLoading the base PPO agent to train...')
        model = PPO1.load(os.path.join(model_dir, 'base.zip'), env, **params)
    else:
        logger.info(
            '\nLoading the best_model.zip PPO agent to continue training...')
        model = PPO1.load(os.path.join(model_dir, 'best_model.zip'), env,
                          **params)

    #Callbacks
    logger.info(
        '\nSetting up the selfplay evaluation environment opponents...')
    callback_args = {
        'eval_env': selfplay_wrapper(base_env)(opponent_type=args.opponent_type,
                                               verbose=args.verbose),
        'best_model_save_path': config.TMPMODELDIR,
        'log_path': config.LOGDIR,
        'eval_freq': args.eval_freq,
        'n_eval_episodes': args.n_eval_episodes,
        'deterministic': False,
        'render': True,
        'verbose': 0
    }

    if args.rules:
        logger.info(
            '\nSetting up the evaluation environment against the rules-based agent...'
        )
        # Evaluate against a 'rules' agent as well
        eval_actual_callback = EvalCallback(
            eval_env=selfplay_wrapper(base_env)(opponent_type='rules',
                                                verbose=args.verbose),
            eval_freq=1,
            n_eval_episodes=args.n_eval_episodes,
            deterministic=args.best,
            render=True,
            verbose=0)
        callback_args['callback_on_new_best'] = eval_actual_callback

    # Evaluate the agent against previous versions
    eval_callback = SelfPlayCallback(args.opponent_type, args.threshold,
                                     args.env_name, **callback_args)

    logger.info('\nSetup complete - commencing learning...\n')

    model.learn(total_timesteps=int(1e9),
                callback=[eval_callback],
                reset_num_timesteps=False,
                tb_log_name="tb")

    env.close()
    del env
Example No. 19
def train(supply_distribution: Tuple[dict, list],
          demand_distribution: Tuple[dict, list],
          model_name: str,
          demand: int,
          max_day: int,
          training_timesteps_list: list,
          tblog: str,
          max_age: int = 35,
          obs_method: int = 1,
          doi: int = 4) -> str:
    """
    Train the agent
    First train without evaluation
    Second train with in-training evaluation

    :param demand_distribution: dict of {blood group : prevalence }, list of antigens included of the demand
    :param supply_distribution: dict of {blood group : prevalence }, list of antigens included of the supply
    :param model_name: str: name of the model to be stored
    :param demand: int: number of blood that is supplied / requested
    :param max_day: int: number of days per episode
    :param training_timesteps_list: list: [timesteps without evaluation, timesteps with evaluation, episode decay for the stopping callback]
    :param tblog: str, name of the tensorboard log
    :param max_age: int, max age of the RBCs
    :param obs_method: int, 1 or 2: item requested one-hot-encoded (1) or binary (2)
    :param doi: int, number of days of inventory
    :return: file name: str, name of the model that is stored
    """
    # Initialize parameters
    GAMMA = round(1 - (1 / (35 * demand)), 5)  # 0.993
    state_type = 'custom_category'

    time_string = datetime.now().strftime("%Y_%m_%d_%H_%M")
    file_name = time_string + model_name

    max_reward = max_day * demand * 0.1

    # Create environment
    env = environment.Env(supply_distribution[0],
                          demand_distribution[0],
                          max_age,
                          demand,
                          doi=doi,
                          obs_method=obs_method,
                          state_type=state_type,
                          max_day=max_day,
                          file_name=file_name,
                          verbose=0)
    env = DummyVecEnv([lambda: env])
    model = PPO2(MlpPolicy,
                 env,
                 gamma=GAMMA,
                 verbose=0,
                 tensorboard_log="results/tensorboard_data/" + tblog +
                 "/")  # create model

    # Train the model without evaluation (=faster)
    print('start phase 1, without evaluation')
    model.learn(total_timesteps=training_timesteps_list[0],
                tb_log_name=file_name)
    # TB- run: tensorboard --logdir ./tblog/

    # Export
    model.save('results/model/' + file_name)  # Save for backup

    callback_on_best = StopTrainingOnDecayingRewardThreshold(
        max_reward=max_reward,
        episode_decay=training_timesteps_list[2],
        reward_decay=0.05,
        no_reward_episodes=training_timesteps_list[0],
        verbose=1)

    # Callback for evaluation
    eval_callback = EvalCallback(
        env,  # callback_on_new_best=callback_on_best,
        best_model_save_path='results/model/' + file_name,
        eval_freq=50000,
        verbose=1,
        n_eval_episodes=5)

    # Train the model with eval every 50000 steps
    print('start phase 2 with evaluation')
    model.learn(total_timesteps=training_timesteps_list[1],
                tb_log_name=file_name,
                callback=eval_callback,
                reset_num_timesteps=False
                )  # train the model and run tensorboard 5000000 1500000

    # Export
    model.save('results/model/' + file_name + 'end')  # Save for backup

    # Extract the tensorboard data
    data_extract.extract_tb(file_name)

    return file_name
    # Create the vectorized environment
    #env = environment(x,y,z,0.95, 0.05, savepath, 'MlpPolicy', rg_prob='loadenv')
    env = environment(
        x, y, z, gamma, turnspc, policyname, rg_prob='loadenv'
    )  #SubprocVecEnv([make_env(x,y,z, i) for i in range(num_cpu)])
    #eval_env=environment(x, y, z, gamma, turnspc, savepath, policyname, rg_prob='loadenv')
    # Stable Baselines provides you with make_vec_env() helper
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

    #create callbacks to record data, initiate events during training.
    callbacklist = CallbackList([
        TimeLimit(episodetimesteps),
        EvalCallback(env,
                     log_path=savepath,
                     n_eval_episodes=1,
                     eval_freq=10000,
                     deterministic=det,
                     best_model_save_path=savepath)
    ])

    if (os.path.exists("%s/best_model.zip" % savepath)):
        # Instantiate the agent
        model = ACER(policy,
                     env,
                     gamma=gamma,
                     n_steps=episodetimesteps,
                     learning_rate=LR,
                     buffer_size=10000,
                     verbose=1)
        # Load the trained agent
        model = ACER.load("%s/best_model" % savepath, env=env)
Example No. 21
        return env
    set_global_seeds(seed)
    return _init


if __name__ == '__main__':

    num_cpu = 15 # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(x,y,z, i) for i in range(num_cpu)])
    eval_env=environment(x,y,z,gamma)
    # Stable Baselines provides you with make_vec_env() helper
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)
    scenario = str(
        f'{inputfile_s}_t{test}_lr{LR_s}_gamma{gamma_s}_batch{batch_size}')
    callbacklist = CallbackList([
        TimeLimit(episodetimesteps),
        EvalCallback(eval_env,
                     log_path=scenario,
                     n_eval_episodes=20,
                     deterministic=False,
                     best_model_save_path=scenario)
    ])

    model = A2C(MlpPolicy,
                env,
                gamma=gamma,
                n_steps=batch_size,
                learning_rate=LR,
                verbose=1,
                lr_schedule='constant')  #, tensorboard_log=scenario)
    model.learn(total_timesteps=episodetimesteps**99, callback=callbacklist)

    filename = './%s/evaluations.npz' % scenario

    data = np.load(filename)
    results = data['results']
    y = np.average(results, axis=1)
    timesteps = data['timesteps']
    plt.plot(timesteps, y)
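evaluations.npz also keeps the per-episode results used above, so the spread across the 20 evaluation episodes can be added as a band around the mean curve (a small sketch extending the plot):

y_std = np.std(results, axis=1)  # std over the evaluation episodes at each point
plt.fill_between(timesteps, y - y_std, y + y_std, alpha=0.2)
plt.xlabel('Timesteps')
plt.ylabel('Mean evaluation reward')
plt.show()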
Example No. 22
def objective(params):
    """
    Objective function to be minimized.

    Parameters
    ----------
    * params [list, len(params)=n_hyperparameters]
        Settings of each hyperparameter for a given optimization iteration.
        - Controlled by hyperspaces's hyperdrive function.
        - Order preserved from list passed to hyperdrive's hyperparameters argument.
     """
    config_path = join(path, 'rl', 'config', '{}.yml'.format(args.env_name))
    with open(config_path) as f:
        config = yaml.safe_load(f)
        print('model loaded from path: {}'.format(config_path))
    
    #set the parameters
    prfd, wsag, fr, nria, nrfeq, nrfc = params
    config['environment']['positive_reward_for_divert'] = prfd
    config['environment']['wrong_sup_at_goal'] = wsag
    config['environment']['flooding_reward'] = fr
    config['environment']['neg_reward_ia'] = nria
    config['environment']['negative_reward_for_empty_queue'] = nrfeq
    config['environment']['negative_reward_for_cycle'] = nrfc
    
    print('Current settings for the config: \n\npositive_reward_for_divert \t:\t{}\nwrong_sup_at_goal\t\t:\t{}\n\
flooding_reward\t\t\t:\t{}\nneg_reward_ia\t\t\t:\t{}\nnegative_reward_for_empty_queue\t:\t{}\n\
negative_reward_for_cycle\t:\t{}\n'.format(prfd, wsag, fr, nria, nrfeq, nrfc))
    
    #GET MODEL CONFIG
    model_config = config['models']['PPO2']
    policy = config['main']['policy']
    n_workers = config['main']['n_workers']
    n_steps = config['main']['n_steps']
    n_eval = (n_steps / 8)/10
    
    # load environment with config variables
    env_obj = getattr(rl.environments, args.env_name)
    env = env_obj(config)
    
    # multiprocess environment
    env_8 = make_vec_env(lambda: env, n_envs=n_workers)
    
    #define folder and path
    now = datetime.datetime.now()
    folder ='{}{}{}_{}{}'.format(now.year, str(now.month).zfill(2), str(now.day).zfill(2), str(now.hour).zfill(2), str(now.minute).zfill(2))
    specified_path = join(path, 'rl', 'trained_models', args.env_name, 'hyper-parameter', '{}-{}{}{}{}{}{}'.format(folder, prfd, wsag, fr, nria, nrfeq, nrfc))
    print('Results stored in: {}'.format(specified_path))
    
    # callback for evaluation
    eval_callback = EvalCallback(env, best_model_save_path=specified_path,
                                 log_path=specified_path, eval_freq=n_eval,
                                 n_eval_episodes=5, verbose=0,
                                 deterministic=False, render=False)

    model = PPO2(policy, env=env_8, tensorboard_log=specified_path, **model_config)
    
    #LEARN MODEL
    model.learn(total_timesteps=n_steps, tb_log_name='{}_{}_{}_{}_{}_{}'.format(prfd, wsag, fr, nria, nrfeq, nrfc),
                        callback=eval_callback)
    model_path = join(specified_path, 'model_{}_{}_{}_{}_{}_{}.zip'.format(prfd, wsag, fr, nria, nrfeq, nrfc))
    model.save(model_path)
    
    #test
    best_modelpath = join(specified_path, 'best_model.zip')
    test_model = PPO2.load(best_modelpath, env=DummyVecEnv([lambda: env]))
    
    #run test of the model
    episodes = 10
    results = {}
    results['cycle_count'] = 0
    results['idle_time'] = 0
    for episode in range(episodes):
        # Run an episode
        state = env.reset()
        done = False
        meta_data = []
        while not done:
            action, _ = test_model.predict(state, deterministic=True)
            state, reward, done, _ = env.step(action)
            if done:
                results['cycle_count'] += env.cycle_count
                results['idle_time'] += sum(env.idle_times_operator.values())
    
    return (results['cycle_count'] + results['idle_time']) /episodes
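The docstring says the hyperparameters are driven by hyperspaces' hyperdrive. Purely as an illustration of how objective() is consumed, here is a sketch using scikit-optimize instead; the bounds are hypothetical placeholders, not the author's search space:

from skopt import gp_minimize

# Hypothetical bounds for the six reward settings unpacked inside objective().
space = [(-10.0, 10.0)] * 6
result = gp_minimize(objective, space, n_calls=20)
print('best params:', result.x, 'best score:', result.fun)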
Example No. 23
env = gym.make('RPiLEDEnv-v0',
               resizeCamImagePct=50,
               ledHSVLower=np.array([0, 0, 252]),
               ledHSVHigher=np.array([31, 9, 255]),
               rPiIP='192.168.0.183',
               rPiPort=50000,
               episodeLength=100,
               bullseye=10)

callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-20,
                                                 verbose=1)

eval_callback = EvalCallback(env,
                             best_model_save_path='./logs/best',
                             log_path='./logs/',
                             eval_freq=500,
                             deterministic=True,
                             render=False,
                             callback_on_new_best=callback_on_best)

# Added checkpoint because I lost model data after a crash when the webcam shut down because the screen went to sleep :(
checkpoint_callback = CheckpointCallback(save_freq=1000,
                                         save_path='./logs/',
                                         name_prefix='ppo1_model')

cb = CallbackList([eval_callback, checkpoint_callback])

model = DQN(MlpPolicy,
            env,
            verbose=1,
            double_q=True,
Example No. 24
                                            sigma=float(0.5) *
                                            np.ones(n_actions))

# Select DDPG from the algorithms provided by Stable Baselines
model = DDPG(MlpPolicy,
             env,
             verbose=1,
             param_noise=param_noise,
             action_noise=action_noise)

# Separate evaluation env
eval_env = Manipulator2D()
# Use deterministic actions for evaluation
eval_callback = EvalCallback(eval_env,
                             best_model_save_path='./logs/',
                             log_path='./logs/',
                             eval_freq=500,
                             deterministic=True,
                             render=False)

# Run the simulation and train for 400,000 timesteps.
model.learn(total_timesteps=400000, callback=eval_callback)
model.learn(total_timesteps=400000)

# Save the trained result.
# Exercise: how would you save the policy network that returned the best reward during training, rather than the result after 400,000 timesteps?
# Tip: try using a callback function with the learn function.
model.save("ddpg_manipulator2D")

# Remove the model variable
del model  # remove to demonstrate saving and loading
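One possible answer to the exercise in the comments (a sketch): the EvalCallback above already writes the best-performing policy to './logs/best_model.zip' whenever a new best mean reward is found, so it is enough to pass the callback to learn (as in the first learn call) and reload that file afterwards:

# Restore the best policy found during training instead of the final one.
best_model = DDPG.load("./logs/best_model")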
Example No. 25
    num_cpu = ncpu  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(x, y, z, i) for i in range(num_cpu)])
    eval_env = evalenv(x, y, z, gamma, turnspc, policyname)
    env1 = environment(x, y, z, gamma, turnspc,
                       policyname)  # env anneal rate / numturns*eval_freq
    # Stable Baselines provides you with make_vec_env() helper
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

    #create callbacks to record data, initiate events during training.
    callbacklist = CallbackList([
        TimeLimit(episodetimesteps),
        EvalCallback(eval_env,
                     log_path=evpath,
                     n_eval_episodes=100,
                     eval_freq=50000,
                     deterministic=False,
                     best_model_save_path=evpath),
        EvalCallback(env1,
                     log_path=savepath,
                     n_eval_episodes=20,
                     eval_freq=50000,
                     deterministic=False,
                     best_model_save_path=savepath)
    ])
    if (os.path.exists("%s/best_model.zip" % savepath)):
        # Instantiate the agent
        model = A2C(policy,
                    env,
                    gamma=gamma,
                    n_steps=episodetimesteps,
Example No. 26
from Config import Config
from Result import Result

# Inits Battleship gym environments and config
config = Config(5, [3, 2, 2], True, False, False)
env2 = gym.make('Battleships-v0', config=config)
env3 = gym.make('Battleships-v0', config=config)
env = DummyVecEnv([lambda: env2])
env4 = DummyVecEnv([lambda: env3])
check_env(env2, warn=True)

# Define Callback
# Callback stops training if the maximum mean reward is reached
callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=env2.calculate_threshold(), verbose=1)
# Callback saves the current best model
eval_callback = EvalCallback(env4, callback_on_new_best=callback_on_best, verbose=1, best_model_save_path='./DQN_Models/best/')
checkpoint_callback = CheckpointCallback(save_freq=1e4, save_path='./model_checkpoints/')

# Uncomment to train a fresh model; otherwise an already trained model will be loaded below and trained further
#model = DQN(MlpPolicy, env, verbose=2, tensorboard_log="./logs/progress_tensorboard/")

# Load current best model
model = DQN.load("DQN_Models/dqn_5x5_3_SingleShot.zip", verbose=2, env=env, tensorboard_log="./logs/progress_tensorboard/")

# Train model
model.learn(total_timesteps=1000000, callback=[checkpoint_callback, eval_callback])

#Delete current model and load the best model
del model
model = DQN.load("./DQN_Models/best/best_model.zip", verbose=2, env=env, tensorboard_log="./logs/progress_tensorboard/")
                                                   {self.obs_ph: obs})
        else:
            action, value, neglogp = self.sess.run([self.action, self.value_flat, self.neglogp],
                                                   {self.obs_ph: obs})
        return action, value, self.initial_state, neglogp

    def proba_step(self, obs, state=None, mask=None):
        return self.sess.run(self.policy_proba, {self.obs_ph: obs})

    def value(self, obs, state=None, mask=None):
        return self.sess.run(self.value_flat, {self.obs_ph: obs})
    
    def value_aux(self,obs):
        return self.sess.run(self.value_flat_aux, {self.obs_ph: obs})

if __name__ == "__main__":
    env = VecFrameStack(make_atari_env("PongNoFrameskip-v4",num_env=1,seed=0),4)
    envs = VecFrameStack(make_atari_env("PongNoFrameskip-v4",num_env=5,seed=0),4)

    checkpoint_callback = CheckpointCallback(save_freq=5000, save_path='./checkpoints/',name_prefix='PPO2')
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=25, verbose=1)
    eval_callback = EvalCallback(env, best_model_save_path='./checkpoints/best/',log_path='./board_logs/', \
                                eval_freq=5000,deterministic=True, render=False, callback_on_new_best=callback_on_best)

    # envs = VecFrameStack(make_vec_env(ALP_gym,n_envs=5),4)
    model = PPG(PPG_CNN, envs, verbose=0,gamma=0.99, tensorboard_log="./board_logs")
    # model = PPO2.load("./checkpoints/best/best_model",env = envs, tensorboard_log="./board_logs/")
    # model = PPO2.load("PPO",env = envs, tensorboard_log="./board_logs/")
    model.learn(int(5e6))
    # model.save("PPO_Attention")
    num_cpu = 15  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(x, y, z, i) for i in range(num_cpu)])
    eval_env = environment(x, y, z, gamma, cutoffpenaltyscalar, rg_prob,
                           turnspc, savepath)
    # Stable Baselines provides you with make_vec_env() helper
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

    #create callbacks to record data, initiate events during training.
    callbacklist = CallbackList([
        TimeLimit(episodetimesteps),
        EvalCallback(eval_env,
                     log_path=savepath,
                     n_eval_episodes=5,
                     deterministic=False,
                     best_model_save_path=savepath)
    ])

    #create model with Stable Baselines package.
    model = A2C(CnnPolicy,
                env,
                gamma=gamma,
                n_steps=updatesteps,
                learning_rate=LR,
                verbose=1)  #, tensorboard_log=scenario)
    model.learn(
        total_timesteps=episodetimesteps**50, callback=callbacklist
    )  #total timesteps set to very large number so program will terminate based on runtime parameter)
Example No. 29

if __name__ == '__main__':

    num_cpu = ncpu  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(x,y,z, i) for i in range(num_cpu)])
    eval_env=evalenv(x, y, z, gamma, turnspc, policyname)
    #env1 =environment(x, y, z, gamma, turnspc, policyname)
    # Stable Baselines provides you with make_vec_env() helper
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

    
    #create callbacks to record data, initiate events during training.
    callbacklist = CallbackList([
        TimeLimit(episodetimesteps),
        EvalCallback(eval_env,
                     log_path=evpath,
                     n_eval_episodes=100,
                     eval_freq=50000,
                     deterministic=False,
                     best_model_save_path=evpath),
        EvalCallback(env,
                     log_path=savepath,
                     n_eval_episodes=20,
                     eval_freq=10000,
                     deterministic=False,
                     best_model_save_path=savepath)
    ])
    if (os.path.exists("%s/best_model.zip" % savepath)):
        # Instantiate the agent
        model = ACER(policy, env, gamma=gamma, n_steps=episodetimesteps, learning_rate=LR,  buffer_size=10000,  verbose=1, n_cpu_tf_sess=num_cpu)
        # Load the trained agent
        model = ACER.load("%s/best_model" % savepath, env=env)
        print('loaded agent')
        model.learn(total_timesteps=episodetimesteps**50, callback=callbacklist) #total timesteps set to very large number so program will terminate based on runtime parameter)
        
        
    else:
        #create model with Stable Baselines package.
        model = ACER(policy, env, gamma=gamma, n_steps=episodetimesteps, learning_rate=LR,  buffer_size=10000,  verbose=1, n_cpu_tf_sess=num_cpu)#, tensorboard_log=scenario)
        #model = ACER.load("%s/best_model" % savepath, env)
        model.learn(total_timesteps=episodetimesteps**50, callback=callbacklist) #total timesteps set to very large number so program will terminate based on runtime parameter)
def main():
    """ Prepare for trainings """
    log_dir, model_dir = prepare_dirs()

    model_name = model_dir + '/' + MODEL_NAME
    print(f'model will be saved as {model_name}')

    log_dir = log_dir + '/' + MODEL_NAME
    """ Generate & Check environment """
    env_name = ENV_NAME
    env = gym.make(env_name)
    # print(f'Observation space: {env.observation_space}')
    # print(f'Action space: {env.action_space}')
    # env = Monitor(env, log_dir, allow_early_resets=True)
    # check_env(env)
    """ Save config as pickle file """
    config = summarize_config(env)
    save_config(log_dir, config)
    """ Vectorize environment """
    num_envs = NUM_ENVS
    env = DummyVecEnv([lambda: env for _ in range(num_envs)])  # For training

    eval_env = DummyVecEnv([lambda: gym.make(env_name)])  # For evaluation
    """ Define checkpoint callback """
    checkpoint_callback = CheckpointCallback(save_freq=SAVE_FREQ,
                                             save_path=model_name,
                                             name_prefix=MODEL_NAME)
    """ Use deterministic actions for evaluation callback """
    eval_callback = EvalCallback(eval_env,
                                 best_model_save_path=model_name,
                                 log_path=log_dir,
                                 eval_freq=EVAL_FREQ,
                                 deterministic=True,
                                 render=False,
                                 n_eval_episodes=N_EVAL_EPISODES)

    print(f'Algorithm: {ALGORITHM}\n')

    if not CONTINUAL_LEARNING:
        """ Define model """
        model = define_model(env, log_dir)
    else:
        model = load_model(env, model_dir, log_dir)
    """ Evaluate model before training """
    # mean_reward, std_reward = evaluate_policy(model=model,
    #                                          env=eval_env,
    #                                          n_eval_episodes=N_EVAL_EPISODES)
    # print(f'Before training: mean reward: {mean_reward:.2f} +/- {std_reward:.2f}')
    """ Train model """
    model.learn(total_timesteps=MAX_STEPS,
                callback=[checkpoint_callback, eval_callback])
    """ Evaluate model after training """
    # mean_reward, std_reward = evaluate_policy(model=model,
    #                                          env=eval_env,
    #                                          n_eval_episodes=N_EVAL_EPISODES)
    # print(f'After training: mean reward: {mean_reward:.2f} +/- {std_reward:.2f}')
    """ Save trained model """
    model.save(model_name)
    """ Test trained model """
    obs = eval_env.reset()
    for i in range(N_EVAL_EPISODES):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = eval_env.step(action)
        eval_env.render()

    env.close()
    eval_env.close()
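define_model and load_model are referenced above but not shown. Below is a hypothetical sketch of what they might look like for a PPO2 setup; the algorithm choice and hyperparameters are assumptions, not the original helpers:

from stable_baselines import PPO2


def define_model(env, log_dir):
    # Fresh PPO2 agent logging to TensorBoard under log_dir.
    return PPO2('MlpPolicy', env, verbose=1, tensorboard_log=log_dir)


def load_model(env, model_dir, log_dir):
    # Continue training from the weights saved by a previous run.
    return PPO2.load(model_dir + '/' + MODEL_NAME, env=env, tensorboard_log=log_dir)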