Example #1
def main():
    env = DummyVecEnv([lambda: WeightEnv()])
    env.env_method("seed", 0)
    model = PPO2(MlpPolicy, env, tensorboard_log="/tmp/foo")
    model.learn(total_timesteps=1000000)
    obs = env.reset()
    print('position   velocity   accel      jerk       reward')
    for i in range(2000):
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        if done:
            printit(info[0]['terminal_observation'], rewards[0])
            print("")
        printit(obs[0], rewards[0])
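Example #1 references a WeightEnv and a printit helper that are not shown. Below is a minimal sketch of what they might look like, purely as an assumption to make the snippet self-contained; the real environment's spaces, dynamics, and reward are not known.

import gym
import numpy as np
from gym import spaces
from gym.utils import seeding


class WeightEnv(gym.Env):
    """Hypothetical stand-in: observation = (position, velocity, accel, jerk)."""

    def __init__(self):
        self.observation_space = spaces.Box(low=-10.0, high=10.0, shape=(4,), dtype=np.float32)
        self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32)
        self.state = np.zeros(4, dtype=np.float32)
        self.steps = 0

    def seed(self, seed=None):
        # reached through env.env_method("seed", 0) in main()
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.state = np.zeros(4, dtype=np.float32)
        self.steps = 0
        return self.state

    def step(self, action):
        # toy jerk-controlled integrator; reward penalizes distance from the origin
        jerk = float(np.clip(action[0], -1.0, 1.0))
        accel = self.state[2] + jerk
        vel = self.state[1] + accel
        pos = self.state[0] + vel
        self.state = np.array([pos, vel, accel, jerk], dtype=np.float32)
        self.steps += 1
        return self.state, -abs(pos), self.steps >= 200, {}


def printit(obs, reward):
    # fixed-width dump matching the column header printed in main()
    print(" ".join(f"{v:10.4f}" for v in obs), f"{reward:10.4f}")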
Example #2
def test_agent(agent_step):
    for coef_index in range(len(CLAC_COEFS)):
        mut_coef = CLAC_COEFS[coef_index]

        if (agent_step == 1):
            print(mut_coef, "  ", NUM_TRAINING_STEPS, "  ", ENVIRONMENT_NAME,
                  "  ", FOLDER)

        features = pd.DataFrame()

        mirl_env = gym.make(ENVIRONMENT_NAME)
        mirl_env = DummyVecEnv([lambda: mirl_env])

        mirl_model = CLAC(CLAC_MlpPolicy,
                          mirl_env,
                          mut_inf_coef=mut_coef,
                          coef_schedule=3.3e-4,
                          verbose=1)

        (mirl_model, learning_results) = mirl_model.learn(
            total_timesteps=NUM_TRAINING_STEPS, log_interval=10)
        learning_results['AgentID'] = agent_step
        learning_results.to_pickle(FOLDER + "/results/MIRL_" +
                                   str(mut_coef).replace(".", "p") + "_" +
                                   str(agent_step) + "_0.pkl")

        for resample_step in range(1, NUM_RESAMPLES):
            # Set both environments to the same resampled values
            if (RANDOMIZATION_LEVEL == "Normal"):
                mirl_env.env_method("randomize", 0)
            elif (RANDOMIZATION_LEVEL == "Extreme"):
                mirl_env.env_method("randomize", 1)
            elif (RANDOMIZATION_LEVEL == "Test"):
                mirl_env.env_method("randomize", -1)
            else:
                print("Error resampling unknown value: ", RANDOMIZATION_LEVEL)
                continue

            if (agent_step == 1):
                print(mut_coef, "  ", NUM_TRAINING_STEPS, "  ",
                      ENVIRONMENT_NAME, "  ", FOLDER, " resample step ",
                      resample_step)

            (mirl_model, learning_results) = mirl_model.learn(
                total_timesteps=NUM_TRAINING_STEPS,
                reset_num_timesteps=False,
                log_interval=10)
            learning_results.to_pickle(FOLDER + "/results/MIRL_" +
                                       str(mut_coef).replace(".", "p") + "_" +
                                       str(agent_step) + "_" +
                                       str(resample_step) + ".pkl")

        mirl_model.save(FOLDER + "/models/MIRL_" +
                        str(mut_coef).replace(".", "p") + "_" +
                        str(agent_step) + "_0")

        del mirl_model
        del mirl_env
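The randomize call above relies on DummyVecEnv.env_method forwarding arbitrary method calls to the wrapped environment. Here is a minimal sketch of the env-side contract these CLAC/MIRL examples appear to assume; the method names match the calls above, but the bodies and the feature vector are illustrative guesses.

import gym
import numpy as np


class RandomizableEnv(gym.Wrapper):
    """Hypothetical wrapper exposing the methods reached via env.env_method(...)."""

    def __init__(self, env):
        super().__init__(env)
        # e.g. power, density, friction, gravity as in Example #8
        self.features = np.ones(4)

    def randomize(self, level):
        # assumed convention from the calls above: 0 = normal, 1 = extreme, -1 = test values
        scale = {0: 0.1, 1: 0.5}.get(level, 0.0)
        self.features = np.ones(4) + scale * np.random.randn(4)

    def get_features(self):
        return self.features

    def set_features(self, features):
        self.features = np.asarray(features)


# env = DummyVecEnv([lambda: RandomizableEnv(gym.make(ENVIRONMENT_NAME))])
# env.env_method("randomize", 0)   # forwarded to RandomizableEnv.randomize(0)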
Example #3
def main():
    env = DummyVecEnv([lambda: EngineEnv()])
    #env = VecNormalize(env, norm_obs=True, norm_reward=False, clip_obs=10.)
    #env = VecNormalize(env)
    env.env_method("seed", 0)
    # more value function loss
    model = PPO2(MlpPolicy, env, vf_coef=10.0, tensorboard_log="/tmp/foo")
    #model = A2C(MlpPolicy, env, tensorboard_log="/tmp/foo")
    #model = SAC(MlpPolicy, env, tensorboard_log="/tmp/foo")
    #model = SAC(CustomSACPolicy, env, tensorboard_log="/tmp/foo")
    model.learn(total_timesteps=1000000)
    #model.learn(total_timesteps=400000)
    #model.learn(total_timesteps=500000)
    obs = env.reset()
    print('map        far_err    afr        reward')
    #      1234567890 1234567890 1234567890 1
    for i in range(2000):
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        #if done:
        #    printit(info[0]['terminal_observation'], rewards[0])
        #    print("")
        printit(obs[0], rewards[0])
Example #4
            checkpoint = 'fresh'
    elif args.checkpoint in ["BEST", "best", "Best"]:
        checkpoint = bestperformingcheckpoint(
            os.path.join("./saves/", args.tag))
    else:
        checkpoint = os.path.join("./saves/", args.tag,
                                  args.checkpoint.split('/')[-1])

    if not args.test:
        env = DummyVecEnv([
            lambda: env_generator(ep_len=args.episode_length,
                                  total_sweeps=args.total_sweeps)
        ])
        #env = VecNormalize(env, norm_obs=False, norm_reward=True, training=True)

        env.env_method('set_experiment_tag', indices=[0], tag=args.tag)
        env.env_method('init_HamiltonianGetter', indices=[0], phase='TRAIN')
        env.env_method('set_max_ep_length',
                       indices=[0],
                       max_ep_length=args.episode_length)

        print("Attempting to restore model")
        try:
            print("Loading model")
            model = PPO2.load(checkpoint, env=env, **model_args)
        except Exception as EEE:
            print(EEE)
            print("ERROR restoring model. Starting from scratch")
            print("Initializing model")
            model = PPO2(
                env=env,
Example #5
def validation(checkpoint_name,
               num_hamiltonians=20,
               num_trials=10,
               mode='validation'):
    tf.config.set_soft_device_placement(True)
    with tf.device("/gpu:1"):
        env = DummyVecEnv([
            lambda: env_generator(ep_len=args.episode_length,
                                  total_sweeps=args.total_sweeps)
        ])

        env.env_method('set_experiment_tag', indices=[0], tag=args.tag)
        env.env_method('set_max_ep_length',
                       indices=[0],
                       max_ep_length=args.episode_length)
        if mode == 'test':
            env.env_method("toggle_datadump_on", indices=[0])
        env.env_method('init_HamiltonianGetter',
                       indices=[0],
                       phase='TEST',
                       directory=args.hamiltonian_directory)

        model = PPO2.load(checkpoint_name, env=env, **model_args)

        env = model.get_env()

        env.env_method("init_HamiltonianSuccessRecorder",
                       indices=[0],
                       num_hamiltonians=num_hamiltonians,
                       num_trials=num_trials)
        env.env_method("set_static_Hamiltonian_by_ID", indices=[0], ID=0)

        if args.destructive:
            env.env_method('set_destructive_observation_on', indices=[0])

        obs = env.reset()

        test_ep = -1
        inftime = 0
        envtime = 0
        count = 0

        for ham in range(num_hamiltonians):
            env.env_method("set_static_Hamiltonian_by_ID", indices=[0], ID=ham)
            for trial in range(num_trials):
                test_ep += 1
                state = None
                done = [False for _ in range(env.num_envs)]
                step = -1
                while True:
                    step += 1
                    tick = time.time()
                    action, state = model.predict(obs,
                                                  state=state,
                                                  mask=done,
                                                  deterministic=True)
                    tock = time.time()
                    if step > 3 and step < 35:
                        inftime += tock - tick

                    tick = time.time()
                    obs, reward, d, _ = env.step(action)
                    tock = time.time()
                    if step > 3 and step < 35:
                        envtime += tock - tick
                        count += 1

    #                if test_ep==10000:
    #                    env.env_method("toggle_datadump_off", indices=[0])
                    if d:
                        break
            if mode == 'test':
                env.env_method("hsr_write")
                print(f"Total inference time: {inftime}s")
                print(f"Total environment time: {envtime}s")
                print(f"Total count: {count}")
                print(f"Time per inference call: {inftime/count}")
                print(f"Time time in environment: {envtime/count}")
        if mode == 'validation':  # only log the success probability when running in validation mode
            p = env.env_method('get_hamiltonian_success_probability',
                               indices=[0])[0]
            if args.wandb_project != "disable":
                wandb.log({"Probability of success": p})
            archive = checkpoint_name.replace(
                "saved_model", f"archived_p{p:06.3f}_{uuid4()}")
            print(
                f"Archiving checkpoint. Copying {checkpoint_name} to {archive}"
            )
            shutil.copy(checkpoint_name + ".zip", archive + ".zip")
Example #6
rewards_time_list_fixed_2 = []
avg_rewards_time_list_fixed_2 = []
rewards_bak_list_fixed_2 = []
avg_rewards_bak_list_fixed_2 = []
rewards_bat_list_fixed_2 = []
avg_rewards_bat_list_fixed_2 = []
avg_rewards_energy_list_fixed_2 = []
fixed_2_data = []

s = 1
t_range = 100

set_seed(rand_seed)
obs = env.reset()
for i in range(t_range):
    action = env.env_method('myopic_action_cal')
    obs, rewards, dones, info = env.step(action)
    rewards_list_myopic.append(1 / rewards / s)
    avg_rewards_myopic.append(np.mean(rewards_list_myopic[:]))
    t, bak, bat = env.render()
    rewards_time_list_myopic.append(t / s)
    avg_rewards_time_list_myopic.append(np.mean(rewards_time_list_myopic[:]))
    rewards_bak_list_myopic.append(bak / s)
    avg_rewards_bak_list_myopic.append(np.mean(rewards_bak_list_myopic[:]))
    rewards_bat_list_myopic.append(bat / s)
    avg_rewards_bat_list_myopic.append(np.mean(rewards_bat_list_myopic[:]))
    avg_rewards_energy_list_myopic.append(avg_rewards_bak_list_myopic[-1] +
                                          avg_rewards_bat_list_myopic[-1])
    myopic_data.append([
        avg_rewards_time_list_myopic[-1], avg_rewards_bak_list_myopic[-1],
        avg_rewards_bat_list_myopic[-1]
Example #7
def solve(supply_distribution: Tuple[dict, list],
          demand_distribution: Tuple[dict,
                                     list], model_name: str, export_model: str,
          max_age: int, demand: int, doi: int, n_warm_start_days: int,
          n_days: int, obs_method: int, state_type: str) -> dict:
    """

    :param demand_distribution: Tuple[dict, list] containing a dict with {blood_group : distribution}, list of
    included antigens
    :param supply_distribution: Tuple[dict, list] containing a dict with {blood_group : distribution}, list of
    included antigens
    :param model_name: str, name of the model that is used to store the results
    :param export_model: str, name of hte model that is trained
    :param max_age: int, max age of the RBCs
    :param demand: int, number of demand / supply per day
    :param doi: days of inventory, the number of days the inventory is filled before first supply
    :param n_warm_start_days: int, number of days of warm start
    :param n_days: int, number of days for evaluation
    :param obs_method: int, 1 or 2: item requested one-hot-encoded (1) or binary (2)
    :param state_type: type of state that is used 'custom category'
    :return:
    """
    # Get model ready
    env = environment.Env(supply_distribution[0],
                          demand_distribution[0],
                          max_age,
                          demand,
                          doi,
                          obs_method=obs_method,
                          state_type=state_type,
                          file_name=model_name)
    env = DummyVecEnv([lambda: env])
    model = PPO2.load(export_model, env=env)

    # Run model
    obs = env.reset()

    # Warm start
    print('warm start - started')
    env.env_method('set_days', n_warm_start_days)
    done = False
    while not done:
        action, _states = model.predict(obs, deterministic=True)
        obs_next, rewards, done, info = env.step(action)
        obs = obs_next
    print('warm start - ended')

    # Testing
    print('Testing - started')
    env.env_method('set_days', n_days)
    env.env_method('change_eval_boolean', True)

    done = False
    while not done:
        action, _states = model.predict(obs, deterministic=True)
        obs_next, rewards, done, info = env.step(action)
        obs = obs_next

    results = env.env_method('render_blood_specific')  # get evaluation metrics
    print('Testing - ended')

    return results
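A hedged usage sketch for solve() above; the distributions, file names, and parameter values below are placeholders, not values from the original experiments.

supply = ({"A+": 0.35, "O+": 0.65}, ["A", "B"])   # (distribution dict, antigen list): assumed shape
demand = ({"A+": 0.30, "O+": 0.70}, ["A", "B"])

results = solve(supply_distribution=supply,
                demand_distribution=demand,
                model_name="matching_eval",
                export_model="ppo2_matching_model",   # hypothetical checkpoint name
                max_age=35,
                demand=50,
                doi=3,
                n_warm_start_days=30,
                n_days=365,
                obs_method=1,
                state_type="custom category")
print(results)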
Example #8
def test_agent(agent_step):
    for coef_index in range(len(CLAC_COEFS)):
        mut_coef = CLAC_COEFS[coef_index]
        ent_coef = SAC_COEFS[coef_index]

        if (agent_step == 1):
            print(mut_coef, "  ", ent_coef, "  ", NUM_TRAINING_STEPS, "  ",
                  ENVIRONMENT_NAME, "  ", FOLDER)

        features = pd.DataFrame()

        clac_env = gym.make(ENVIRONMENT_NAME)
        clac_env = DummyVecEnv([lambda: clac_env])
        clac_model = CLAC(CLAC_MlpPolicy,
                          clac_env,
                          mut_inf_coef=mut_coef,
                          verbose=1)

        sac_env = gym.make(ENVIRONMENT_NAME)
        sac_env = DummyVecEnv([lambda: sac_env])

        sac_model = SAC(MlpPolicy, sac_env, ent_coef=ent_coef, verbose=1)

        mirl_env = gym.make(ENVIRONMENT_NAME)
        mirl_env = DummyVecEnv([lambda: mirl_env])

        mirl_model = CLAC(CLAC_MlpPolicy,
                          mirl_env,
                          mut_inf_coef=mut_coef,
                          coef_schedule=3.3e-4,
                          verbose=1)

        (clac_model, learning_results) = clac_model.learn(
            total_timesteps=NUM_TRAINING_STEPS, log_interval=1000)
        learning_results['AgentID'] = agent_step
        learning_results.to_pickle(FOLDER + "/results/CLAC_" +
                                   str(mut_coef).replace(".", "p") + "_" +
                                   str(agent_step) + "_0.pkl")

        (sac_model, learning_results) = sac_model.learn(
            total_timesteps=NUM_TRAINING_STEPS, log_interval=1000)
        learning_results['AgentID'] = agent_step
        learning_results.to_pickle(FOLDER + "/results/SAC_" +
                                   str(ent_coef).replace(".", "p") + "_" +
                                   str(agent_step) + "_0.pkl")

        (mirl_model, learning_results) = mirl_model.learn(
            total_timesteps=NUM_TRAINING_STEPS, log_interval=1000)
        learning_results['AgentID'] = agent_step
        learning_results.to_pickle(FOLDER + "/results/MIRL_" +
                                   str(mut_coef).replace(".", "p") + "_" +
                                   str(agent_step) + "_0.pkl")

        for resample_step in range(1, NUM_RESAMPLES):
            # Set both environments to the same resampled values
            if (RANDOMIZATION_LEVEL == "Normal"):
                clac_env.env_method("randomize", 0)
            elif (RANDOMIZATION_LEVEL == "Random"):
                clac_env.env_method("randomize", 1)
            elif (RANDOMIZATION_LEVEL == "Extreme"):
                clac_env.env_method("randomize", 2)
            elif (RANDOMIZATION_LEVEL == "Test"):
                clac_env.env_method("randomize", -1)
            else:
                print("Error resampling unknown value: ", RANDOMIZATION_LEVEL)
                continue

            env_features = clac_env.env_method("get_features")[0]
            sac_env.env_method("set_features", env_features)
            mirl_env.env_method("set_features", env_features)

            if (agent_step == 1):
                print(env_features)

            Power = env_features[0]
            Density = env_features[1]
            Friction = env_features[2]
            Gravity = env_features[3]

            d = {
                "Mut Coefficient": mut_coef,
                "Ent Coefficient": ent_coef,
                "Resample Step": resample_step,
                "Power": Power,
                "Density": Density,
                "Friction": Friction,
                "Gravity": Gravity
            }
            features = features.append(d, ignore_index=True)

            (clac_model, learning_results) = clac_model.learn(
                total_timesteps=NUM_TRAINING_STEPS,
                reset_num_timesteps=False,
                log_interval=1000)
            learning_results.to_pickle(FOLDER + "/results/CLAC_" +
                                       str(mut_coef).replace(".", "p") + "_" +
                                       str(agent_step) + "_" +
                                       str(resample_step) + ".pkl")

            (sac_model, learning_results) = sac_model.learn(
                total_timesteps=NUM_TRAINING_STEPS,
                reset_num_timesteps=False,
                log_interval=1000)
            learning_results.to_pickle(FOLDER + "/results/SAC_" +
                                       str(ent_coef).replace(".", "p") + "_" +
                                       str(agent_step) + "_" +
                                       str(resample_step) + ".pkl")

            (mirl_model, learning_results) = mirl_model.learn(
                total_timesteps=NUM_TRAINING_STEPS,
                reset_num_timesteps=False,
                log_interval=1000)
            learning_results.to_pickle(FOLDER + "/results/MIRL_" +
                                       str(mut_coef).replace(".", "p") + "_" +
                                       str(agent_step) + "_" +
                                       str(resample_step) + ".pkl")

        clac_model.save(FOLDER + "/models/CLAC_" +
                        str(mut_coef).replace(".", "p") + "_" +
                        str(agent_step) + "_0")
        sac_model.save(FOLDER + "/models/SAC_" +
                       str(ent_coef).replace(".", "p") + "_" +
                       str(agent_step) + "_0")
        mirl_model.save(FOLDER + "/models/MIRL_" +
                        str(mut_coef).replace(".", "p") + "_" +
                        str(agent_step) + "_0")
        features.to_pickle(FOLDER + "/features/features_" + str(agent_step) +
                           "_" + str(mut_coef) + "_" + str(ent_coef) + ".pkl")

        #print(features)

        del sac_model
        del sac_env

        del clac_model
        del clac_env

        del mirl_model
        del mirl_env
Example #9
parser.add_argument("-b","--betainit", help="Initial inverse temperature", default="")
args = parser.parse_args()


silent = True

from train import episode_length
experiment_name=sys.argv[1]
experiment_description="""Reward is the negative of the minimum energy at episode termination, with no episode termination if negative beta encountered"""

beta_init = float(args.betainit)

env = DummyVecEnv([lambda: env_generator(ep_len=episode_length, total_sweeps=episode_length*100, beta_init_function=lambda: beta_init )])
env = VecNormalize(env, norm_obs=False, norm_reward=False, training=False)

env.env_method('set_experiment_tag', indices=[0], tag=args.tag)
env.env_method('set_max_ep_length', indices=[0], max_ep_length=episode_length)
env.env_method("toggle_datadump_on", indices=[0])
#env.env_method('init_HamiltonianGetter', indices=[0], phase='TEST', directory=args.hamiltonian_directory )
env.env_method('init_HamiltonianGetter', indices=[0], phase='WSC', directory=args.hamiltonian_directory )



#Attempting to restore most recently saved model
print("!!!!!!!!!!!!!!!!!!!!!!!")
max_path = mostrecentmodification(os.path.join("./saves", 'WSC'))#args.tag))
print(f"   Attempting to restore model {max_path}")
model = PPO2.load(max_path, env=env, **model_args)
print("!!!!!!!!!!!!!!!!!!!!!!!")

env = model.get_env()
Example #10
            delayPerTask = delayPerTask / len(info[0]['info'])
            ep_instr.append(totalInstr)
            ep_power.append(info[0]['power_consumed'])
            ep_power_per_instr.append(ep_power[-1] / ep_instr[-1])
            ep_delay_per_task.append(delayPerTask)
            wandb.log({
                'ep_instr': ep_instr[-1],
                'ep_power': ep_power[-1],
                'ep_power_per_instr': ep_power_per_instr[-1],
                'ep_delay_per_task': ep_delay_per_task[-1]
            })

        cumInstructions += totalInstr
        avgDelayPerTask += delayPerTask
        if num_test == 1:
            writeOutputFile('out.csv', info[0]['info'])
            env.env_method('graphShow', 'power')
            env.env_method('graphShow', 'temp')

avgPowerPerInstr = avgPowerPerInstr / (num_test - skip)
cumInstructions = cumInstructions / (num_test - skip)
avgDelayPerTask = avgDelayPerTask / (num_test - skip)
print("Mean Instruction Count per episode = \t" + str(cumInstructions))
print("avgPowerPerInstr = \t\t" + str(avgPowerPerInstr))
# print("cumInstructions = " + str(cumInstructions))
print("avgDelayPerTask = \t\t" + str(avgDelayPerTask))
print("skips = \t\t" + str(skip))
wandb.config.mean_ep_instr = np.mean(ep_instr)
wandb.config.mean_power = np.mean(ep_power)
wandb.config.mean_power_per_instr = np.mean(ep_power_per_instr)
wandb.config.mean_delay_per_task = np.mean(ep_delay_per_task)
Example #11
from stable_baselines import PPO2
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
import argparse
import configparser
from datetime import datetime

parser = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)

parser.add_argument("configPath", help="Enter config path")

args = parser.parse_args()

path = args.configPath

env = DummyVecEnv([lambda: TradingEnv(path)])

config = configparser.ConfigParser()
config.read(path)

model = PPO2.load(config['MAIN']['Model'])

obs = env.reset()
for i in range(int(config['MAIN']['TestSteps'])):
    print(i)
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    env.render()

env.env_method("save_results")
Example #12
def test_agent(agent_step):
    now = time.time()
    for coef_index in range(len(CLAC_COEFS)):
        mut_coef = CLAC_COEFS[coef_index]
        ent_coef = SAC_COEFS[coef_index]
        training_timestep = 0

        if(agent_step == 1):
            print(mut_coef,  "  ",  ent_coef, "  ", NUM_TRAINING_STEPS, "  ",  ENVIRONMENT_NAME, "  ", FOLDER)
        
        features = pd.DataFrame()
        
        clac_env = gym.make(ENVIRONMENT_NAME)
        clac_env = DummyVecEnv([lambda: clac_env])
        clac_model = CLAC(CLAC_MlpPolicy, clac_env, mut_inf_coef=mut_coef, verbose=1)

        sac_env = gym.make(ENVIRONMENT_NAME)
        sac_env = DummyVecEnv([lambda: sac_env])

        sac_model = SAC(MlpPolicy, sac_env, ent_coef=ent_coef, verbose=1)

        mirl_env = gym.make(ENVIRONMENT_NAME)
        mirl_env = DummyVecEnv([lambda: mirl_env])

        mirl_model = CLAC(CLAC_MlpPolicy, mirl_env, mut_inf_coef=mut_coef, coef_schedule=3.3e-3, verbose=1)

        (clac_model, learning_results) = clac_model.learn(total_timesteps=NUM_TRAINING_STEPS, log_interval=1000)
        learning_results['AgentID'] = agent_step
        learning_results.to_pickle(FOLDER + "/Training/results/CLAC_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_0.pkl")

        (sac_model, learning_results) = sac_model.learn(total_timesteps=NUM_TRAINING_STEPS, log_interval=1000)
        learning_results['AgentID'] = agent_step
        learning_results.to_pickle(FOLDER +  "/Training/results/SAC_"+ str(ent_coef).replace(".", "p") + "_" + str(agent_step) + "_0.pkl")

        (mirl_model, learning_results) = mirl_model.learn(total_timesteps=NUM_TRAINING_STEPS, log_interval=1000)
        learning_results['AgentID'] = agent_step
        learning_results.to_pickle(FOLDER + "/Training/results/MIRL_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_0.pkl")

        training_timestep += NUM_TRAINING_STEPS

        env_features = clac_env.env_method("get_features")[0]
        sac_env.env_method("set_features", env_features)
        mirl_env.env_method("set_features", env_features)

        #if(agent_step == 0):
        #    print(env_features)

        Power = env_features[0]
        Density = env_features[1]
        Friction = env_features[2]
        Gravity = env_features[3]

        d = {"Mut Coefficient":  mut_coef, "Ent Coefficient":  ent_coef, "Resample Step":resample_step, "Power": Power, "Density": Density, "Friction": Friction, "Gravity": Gravity}
        #d = {"Mut Coefficient":  mut_coef, "Resample Step":resample_step, "Power": Power, "Density": Density, "Friction": Friction, "Gravity": Gravity}
        features = features.append(d, ignore_index = True)

        # Evaluate generalization
        eval_results = eval_model(clac_model, clac_env, "CLAC", mut_coef, NUM_TESTING_STEPS, training_timestep, 0)
        eval_results['AgentID'] = agent_step
        eval_results.to_pickle(FOLDER + "/Generalization/results/CLAC_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_0.pkl")

        eval_results = eval_model(sac_model, sac_env, "SAC", ent_coef, NUM_TESTING_STEPS, training_timestep, 0)
        eval_results['AgentID'] = agent_step
        eval_results.to_pickle(FOLDER + "/Generalization/results/SAC_" + str(ent_coef).replace(".", "p") + "_" + str(agent_step) + "_0.pkl")

        eval_results = eval_model(mirl_model, mirl_env, "MIRL", mut_coef, NUM_TESTING_STEPS, training_timestep, 0)
        eval_results['AgentID'] = agent_step
        eval_results.to_pickle(FOLDER + "/Generalization/results/MIRL_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_0.pkl")

        clac_model.save(FOLDER +  "/Training/models/CLAC_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_0")
        sac_model.save(FOLDER + "/Training/models/SAC_" + str(ent_coef).replace(".", "p") + "_" + str(agent_step) + "_0")
        mirl_model.save(FOLDER + "/Training/models/MIRL_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_0")
        #features.to_pickle(FOLDER +  "/features/features_" + str(agent_step) + "_" + str(mut_coef)  + "_" + str(ent_coef) + ".pkl")
        
        for resample_step in range(1, NUM_RESAMPLES):
            if(agent_step == 1):
                print(mut_coef,  "  ",  ent_coef, "  ", NUM_TRAINING_STEPS, "  ",  ENVIRONMENT_NAME, "  ", FOLDER, " resample step ", resample_step)
            
            (clac_model, learning_results) = clac_model.learn(total_timesteps=NUM_TRAINING_STEPS, reset_num_timesteps=False,  log_interval=1000)
            learning_results.to_pickle(FOLDER + "/Training/results/CLAC_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl")

            (sac_model, learning_results) = sac_model.learn(total_timesteps=NUM_TRAINING_STEPS, reset_num_timesteps=False,  log_interval=1000)
            learning_results.to_pickle(FOLDER +  "/Training/results/SAC_"+ str(ent_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl")

            (mirl_model, learning_results) = mirl_model.learn(total_timesteps=NUM_TRAINING_STEPS, reset_num_timesteps=False,  log_interval=1000)
            learning_results.to_pickle(FOLDER + "/Training/results/MIRL_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl")

            training_timestep += NUM_TRAINING_STEPS

            clac_model.save(FOLDER +  "/Training/models/CLAC_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step))
            sac_model.save(FOLDER + "/Training/models/SAC_" + str(ent_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step))
            mirl_model.save(FOLDER + "/Training/models/MIRL_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step))

        #print(features)

        del sac_model
        del sac_env

        del clac_model
        del clac_env
        
        del mirl_model
        del mirl_env

    later = time.time()
    difference = int(later - now)
    print("Tested Agent Time: ", difference)
Example #13
parser.add_argument("-d","--hamiltonian_directory", help="Hamiltonian directory", default="")
parser.add_argument("--destructive", default=False, action='store_true', help='Whether or not to use destructive observation.')
args = parser.parse_args()


silent = True

from train import episode_length
experiment_name=sys.argv[1]
experiment_description="""Reward is the negative of the minimum energy at episode termination, with no episode termination if negative beta encountered"""


env = DummyVecEnv([lambda: env_generator(ep_len=episode_length, total_sweeps=episode_length*1, beta_init_function=lambda: 0.3)])
env = VecNormalize(env, norm_obs=False, norm_reward=False, training=False)

env.env_method('set_experiment_tag', indices=[0], tag=args.tag)
env.env_method('set_max_ep_length', indices=[0], max_ep_length=episode_length)
env.env_method("toggle_datadump_on", indices=[0])
env.env_method('init_HamiltonianGetter', indices=[0], phase='TEST', directory=args.hamiltonian_directory )



#Attempting to restore most recently saved model
print("!!!!!!!!!!!!!!!!!!!!!!!")
max_path = mostrecentmodification(os.path.join("./saves", args.tag))
print(f"   Attempting to restore model {max_path}")
model = PPO2.load(max_path, env=env, **model_args)
print("!!!!!!!!!!!!!!!!!!!!!!!")

env = model.get_env()
Example #14
class Training:

    def __init__(self):
        self.best_mean_reward = 0
        self.n_steps = 0
        self.stats = {"rewards": []}
        self.i = 0

    def process_end_of_actor_activation(self):
        """ Applies runtime patches to the Stable Baselines source code in order to set the End of Actor Activation
        """
        supported_values = ["tanh", "cbv"]
        if self.args.activation_end_of_actor not in supported_values:
            raise RuntimeError(f"End of Actor Activation {self.args.activation_end_of_actor} not supported")
        if self.args.activation_end_of_actor == "cbv":
            apply_tanh_patch()

    def f_clw_set_interval(self, x):
        """ Sets the interval at which checkpoints are saved
        """
        logging.debug(f"Operation: SET, Key: self.interval, Value: {x}")
        self.interval = x

    def f_clr_get_interval(self):
        """ Gets the interval at which checkpoints are saved
        """
        return self.interval

    def f_clw_set_model(self, x):
        """ Sets the model used for training
        """
        logging.debug(f"Operation: SET, Key: self.model, Value: {x['model_name']}")
        self.model = x['model']
        self.model_name = x['model_name']

    def f_clr_get_model(self):
        """ Gets the model used for training
        """
        logging.debug(f"Operation: GET, Key: self.model, Value: {self.model_name}")
        return self.model

    def f_clr_get_feed_dict(self, model): 
        feed_dict = {model.actions: model.stats_sample['actions']}

        for placeholder in [model.action_train_ph, model.action_target, model.action_adapt_noise, model.action_noise_ph]:
            if placeholder is not None:
                feed_dict[placeholder] = model.stats_sample['actions']

        for placeholder in [model.obs_train, model.obs_target, model.obs_adapt_noise, model.obs_noise]:
            if placeholder is not None:
                feed_dict[placeholder] = model.stats_sample['obs']

        return feed_dict


    def f_cb_check_switch(self): 
        if self.sp_desc.get_is_switch_active() and not self.has_switched_training_mode and (self.n_steps / self.args.n_steps) > self.sp_desc.get_time_perc(): 
            if self.sp_desc.get_is_continuous(): 
                temp = "Continuous"
                for x in self.__envs_training: 
                    x.set_continuous(quadcopter=Quadcopter(T=self.tp_desc.qg_continuous.get_T_episode(), dt_commands=self.tp_desc.qg_continuous.get_dt_command(), dt=self.tp_desc.qg_continuous.get_dt()))
            else: 
                temp = "Episodic"
                for x in self.__envs_training: 
                    x.set_episodic(quadcopter=Quadcopter(T=self.tp_desc.qg_episodic.get_T_episode(), dt_commands=self.tp_desc.qg_episodic.get_dt_command(), dt=self.tp_desc.qg_episodic.get_dt()))
            logging.info(f"QUERY MODE GENERATION SWITCH HAPPENED, now it is {temp}")
            self.has_switched_training_mode = True

    def callback(self, _locals, _globals):
        self._debug_callback(model=_locals['self'], sim_time=self.i)
        self._callback_tf_log()
        if (self.n_steps + 1) % self.f_clr_get_interval() == 0:
            self.f_cb_check_switch()
            self.i += 1
            full_checkpoint_id = int(self.model_desc.get_checkpoint_id())+int(self.i)
            logging.info(f"Checkpoint ID: Internal={self.i}, Full={full_checkpoint_id}, n_timesteps: {self.n_steps}")
            temp=self._save_model_stable_baselines(model=_locals['self'], cp_id=full_checkpoint_id)
            self._save_model_sherlock(temp)

            if self.train_saver is not None: 
                self.train_saver.save(sess=self.model.sess, save_path=f"{self.args.log_dir_tensorboard}/cp", global_step=self.i)
            if(self.args.save_as_tf): 
                path_save_cp = os.path.join(self.args.log_dir_tensorboard, f"cp-{self.i}")
                print(f"Saving Tensorflow Checkpoint in {path_save_cp}")
                self._save_model(path_save_cp)

            evaluation = f_model_2_evaluation(model=_locals['self'], env=self.env_test)
            quadcopter = self.__envs_training[0].quadcopter
            temp_plot_fn = f_iofsw_eval_2_plot(
                evaluation=evaluation, checkpoint_id=full_checkpoint_id,
                iteration_time=0, plots_dir=self.args.plots_dir,
                saturated=quadcopter.saturated, not_saturated=quadcopter.not_saturated)
            self.stats['rewards'].append(evaluation['re'])

        self.n_steps += 1
        # Returning False will stop training early
        return True

    def _debug_callback(self, model, sim_time): 
        if(self.args.debug_is_active): 
            if(self.args.debug_model_describe): 
                print(self._describe_model())
            if(self.args.debug_try_save_all_vars): 
                tf_path = f"{self.args.models_dir}/tf_quadcopter-{self.i}-desc"
                if not os.path.exists(tf_path): os.mkdir(tf_path)
                tf_testname_model = "debug_vars_all.json"
                tf_full_path = tf_path + "/" + tf_testname_model
                res = ""
                for v in tf.get_default_graph().as_graph_def().node: 
                    res += f"{v.name}\n"
                print(f"Trying to save debug data in {tf_full_path}")
                with open(tf_full_path, "w") as f:
                    f.write(res)
            if(self.args.debug_try_save_trainable_vars): 
                tf_path = f"{self.args.models_dir}/tf_quadcopter-{self.i}-desc"
                if not os.path.exists(tf_path): os.mkdir(tf_path)
                tf_testname_model = "debug_vars_trainable.json"
                tf_full_path = tf_path + "/" + tf_testname_model
                res = ""
                for v in tf.trainable_variables(): 
                    res += f"{v.name}\n"
                print(f"Trying to save debug data in {tf_full_path}")
                with open(tf_full_path, "w") as f:
                    f.write(res)
            if(self.args.debug_try_save_graph): 
                tf_path = f"{self.args.models_dir}/tf_quadcopter-{self.i}-desc"
                if not os.path.exists(tf_path): os.mkdir(tf_path)
                tf_testname_model = "debug_graph.json"
                tf_full_path = tf_path + "/" + tf_testname_model
                graph = tf.get_default_graph().as_graph_def()
                json_graph = json_format.MessageToJson(graph)
                print(f"Trying to save debug data in {tf_full_path}")
                with open(tf_full_path, "w") as f: 
                    f.write(json_graph)
            if(self.args.debug_try_save_weights): 
                tf_path = f"{self.args.models_dir}/tf_quadcopter-{self.i}-desc"
                if not os.path.exists(tf_path): os.mkdir(tf_path)
                tf_testname_model = "debug_weights.json"
                tf_full_path = tf_path + "/" + tf_testname_model
                weights = tf.trainable_variables()
                weights_vals = tf.get_default_session().run(weights)
                print(dir(tf.get_default_session().graph))
                print(f"Trying to save debug data in {tf_full_path}")
                with open(tf_full_path, "w") as f: 
                    f.write(str(weights_vals))

            if self.args.debug_show_tensors_active: 
                ops = []
                for e in self.args.debug_show_tensors_list: 
                    temp = getattr(model, e)
                    ops.append(temp)
                values = model.sess.run(ops, feed_dict=f_fwtf_get_feed_dict(model))
                for v in values: 
                    print(f"v.shape = {v.shape}\nv.value={v}\n\n")

    def _save_model(self, export_dir): 
        builder = tf.saved_model.builder.SavedModelBuilder(export_dir) 
        builder.add_meta_graph_and_variables(self.model.sess, [tf.saved_model.tag_constants.TRAINING])
        builder.save()

    def _save_model_stable_baselines(self, model, cp_id): 
        # Evaluate policy training performance
        path = f"{self.args.models_dir}/quadcopter-{cp_id}{self.args.suffix}"
        logging.info(f"SAVING CURRENT MODEL, Model SAVED at {path}")
        model.save(path)
        return path + '.pkl'

    def _save_model_sherlock(self, filename): 
        output_filename = filename + '.sherlock'
        params = get_stable_baseline_file_params(filename)
        print(f"Saving Sherlock Format File {output_filename}")
        with open( output_filename, 'w' ) as file_ : 
            file_.write(architectures.export.get_sherlock_format(model_desc=self.model_desc, params=params))

    def _describe_model(self): 
        res = f"Model.Graph Type={type(self.model.graph)}\nContent={dir(self.model.graph)}\n\n\n"
        res += f"Analysing {len(tf.get_default_graph().as_graph_def().node)} nodes \n"
        res += f"Graph Def = {tf.get_default_graph().as_graph_def()}\n"
        res += f"---------\n"
        for v in tf.get_default_graph().as_graph_def().node: 
            res += f"{v.name}\n"
        res += f"-----------\n"
        return res

    def _get_action_noise(self, noise_dict, n_actions): 
        if noise_dict['name'] == 'OrnsteinUhlenbeck': 
            return OrnsteinUhlenbeckActionNoise(mean=float(noise_dict['mu'])*np.ones(n_actions), sigma=float(noise_dict['sigma']) * np.ones(n_actions))
        else: 
            raise RuntimeError(f"Unrecognized Noise Model {noise_dict['name']}")


    def _args2str(self,a): 
        return f"step={a.step}\n" \
               f"env={a.env}\n" \
               f"verbose={str(a.verbose)}\n" \
               f"save_plots={str(a.save_plots)}\n" \
               f"suffix={a.suffix}\n" \
               f"model={json.dumps(a.model)}\n" \
               f"activation={a.activation}\n" \
               f"action_noise={json.dumps(a.action_noise)}\n" \
               f"n_steps={a.n_steps}\n" \
               f"model_dir={a.models_dir}\n" \
               f"plots_dir={a.plots_dir}\n"

    def _get_plot_rewards(self): 
        fig=plt.figure("Rewards")
        plt.plot(self.stats["rewards"])
        fig.suptitle('Reward')
        plt.xlabel('time')
        plt.ylabel('reward')
        return plt

    def _write_graph_def_for_tb(self, graph_def, LOGDIR): 
        """ TODO: Remove 
        """
        train_writer = tf.summary.FileWriter(LOGDIR)
        train_writer.add_graph(graph_def)
        train_writer.flush()
        train_writer.close()


    @property
    def sb_tb_log_active(self):
        """ Returns if native Stable Baseline Logging is active
        """
        return self.args.logging['tensorflow']['stable_baselines_native']['active']

    @property
    def sb_tb_log_dir(self):
        """ Returns the Stable Baseline TF Log Dir 
        """
        return self.args.log_dir_tensorboard if self.sb_tb_log_active else None


    def f_clr_instantiate_model(self, m): 
        res_model = None
        model_name = m.get_model_name()
        if m.get_actor_feature_extractor_type() == 'standard':
            pk = dict(act_fun=activations[self.args.activation])
        else:
            pk = dict(act_fun=activations[self.args.activation], layers=m.get_actor_feature_extractor_architecture())
        model_params = {
            'policy': MlpPolicy,
            'env': self.env,
            'verbose': int(self.args.verbose),
            'policy_kwargs': pk,
            'tensorboard_log': self.sb_tb_log_dir,
            'full_tensorboard_log': self.sb_tb_log_active
        }
        if m.get_actor_feature_extractor_name() != 'mlp':
            raise NotImplementedError(f"Exporting Policy Type {m.get_actor_feature_extractor_name()} is unsupported at the moment")
        if model_name == 'ddpg':
            algo = DDPG
            model_params['param_noise'] = self.param_noise
            model_params['action_noise'] = self.action_noise
            model_params['render_eval'] = True
            model_params['policy'] = ddpg_policies.MlpPolicy
        elif model_name == 'trpo':
            algo = TRPO
            model_params['policy'] = common.MlpPolicy
        elif model_name == 'ppo':
            algo = PPO2
            model_params['policy'] = common.MlpPolicy
        elif model_name == 'td3':
            algo = TD3
            model_params['policy'] = td3_MlpPolicy
        elif model_name == 'sac':
            algo = SAC
            model_params['policy'] = sac_MlpPolicy
        model = algo(**model_params)
        # Tensorboard #
        tf.io.write_graph(model.graph, self.args.log_dir_tensorboard, "model.pbtxt")
        if self.train_writer is not None: self.train_writer.add_graph(model.graph)
        if self.train_writer is not None: self.train_writer.flush()
        logging.info(f"Instantiated Model Name={res_model}, policy={type(model_params['policy'])}, pk={pk}")
        return {"model": model, "model_name": model_name.upper()}

    def f_clw_instantiate_envs(self): 
        """ Instantiate both the Training and Test Gym Env 
        - They provide the same dynamical model and the same reward 
        """
        temp = 'gym_quadcopter:quadcopter-v' + str(self.env_desc.get_env_id())
        # TODO FIXME: Some models cannot handle multiple envs.
        N = self.env_desc.get_n_envs()
        if N < 1:
            raise RuntimeError(f"NumEnvs must be >= 1, but got NumEnvs={N}")
        logging.info(f"[SETUP] Creating {N} Training Environments - START")

        # Instantiating all the Envs and storing them into a private var 
        self.__envs_training = [f_fwgym_get_env(
            env_id=temp, used_states=self.used_states, instance_index=i,
            query_classes=self.query_classes, query_class=self.query_class,
            params=self.args.training_params
        ) for i in range(N)]

        # Passing references to previously created envs 
        # Bind the loop index per lambda: a bare `lambda: ...[i]` would make every entry return the last env
        self.env = DummyVecEnv([(lambda idx=i: self.__envs_training[idx]) for i in range(N)])
        logging.info(f"[SETUP] Creating {N} Training Environments - DONE")
        logging.info(f"[SETUP] Creating 1 Test Environments - START")
        self.env_test = f_fwgym_get_env(
            env_id=temp, used_states=self.used_states, instance_index=0,
            query_classes=self.query_classes, query_class=self.query_class,
            params=self.args.testing_params
        )
        logging.info(f"[SETUP] Creating 1 Test Environments - DONE")

    def f_clw_args_2_state(self, args): 
        """Initialize internal instance state 
        """
        self.model_desc = ModelDict(model_dict=self.args.model)
        self.env_desc = EnvDict(env_dict=self.args.env)
        self.tp_desc = TrainingParamsDict(tp_dict=self.args.training_params)
        self.sp_desc = SwitchParamsDict(self.tp_desc.get_switch_params()) 
        self.query_classes = self.args.query_classes
        self.query_class = self.args.query_class
        self.used_states = self.args.used_states
        self.train_writer = None
        self.param_noise = None

        self.f_clw_instantiate_envs()
        self.n_actions = self.env.action_space.shape[-1]

        self.action_noise = f_fwgym_get_action_noise(noise_dict=self.args.action_noise, n_actions=self.n_actions)
        self.has_switched_training_mode = False


    def f_fwtfw_init(self): 
        """Initialize TF Environment 
        """
        tfl.set_verbosity(tfl.ERROR)
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'


    def get_global_summary(self): 
        return {"ModelName": self.model_desc.get_model_name(), "Continuous": str(self.tp_desc.get_is_continuous()), "Total_Training_Iterations": self.args.n_steps, "Iterations_Per_Checkpoint": self.args.iterations_checkpoint, "Env" : { "ID" : self.env_desc.get_env_id(), "Num_Envs" : self.env_desc.get_n_envs() }}

    def _add_tf_logs(self):
        """Adds the additional Tensorflow Logs to the standard Stable Baselines ones
        """
        with self.model.graph.as_default():
            # Conditional Logging for Summary
            if self.args.logging['tensorflow']['summary']['active']:
                tf.summary.text('Env Summary', tf.convert_to_tensor(str(self.env)))
            
            # Conditional Logging for the Stable Baselines Tensors specified in the list
            if self.args.logging['tensorflow']['stable_baselines_tensors']['active']:
                for e in self.args.logging['tensorflow']['stable_baselines_tensors']['list']:
                    tf.summary.scalar(f"Custom_SB_Log/{e}", tf.reduce_mean(getattr(self.model, e)))

            # Conditional Logging for the Tensorflow Tensors specified in the list
            if self.args.logging['tensorflow']['tensorflow_tensors']['active']:
                for e in self.args.logging['tensorflow']['tensorflow_tensors']['list']:
                    tf.summary.histogram(f"Custom_TF_Log/{e}", tf.get_default_graph().get_tensor_by_name(e))

            # Conditional Logging for Quadcopter Framework Events
            if self.args.logging['tensorflow']['events']['active']:
                if 'on_step' in self.args.logging['tensorflow']['events']['list']:
                    tf.summary.text(f'EnvStep{self.n_steps}', tf.convert_to_tensor(self.env.env_method('get_on_step_log')))

            # Merge all of the added summaries 
            self.model.summary = tf.summary.merge_all()


    def _callback_tf_log(self):
        with self.model.graph.as_default():
            if self.args.logging['tensorflow']['events']['active']:
                if 'on_step' in self.args.logging['tensorflow']['events']['list']:
                    tf.summary.text('EnvStep', tf.convert_to_tensor(self.env.env_method('get_on_step_log')))
                    self.model.summary = tf.summary.merge_all()

    def run_training(self, args):
        """ Training Function
        """
        # Use standard log just for the initial setup
        # Set the log used during training
        self.args = args
        self.process_end_of_actor_activation()
        self.f_clw_args_2_state(args)
        logging.info(f"Train Arguments\n{self._args2str(self.args)}") 
        logging.info(f"Writing Tensorboard Log to {self.args.log_dir_tensorboard}")
        logging.info(f"Start training at {dt.now().strftime('%Y%m%d_%H%M')}")
        self.f_fwtfw_init()
        if self.model_desc.get_is_load():
            # TODO: Fix this part 
            path = self.model_desc.get_checkpoint_path()
            model_name = self.model_desc.get_model_name()
            logging.info(f"LOADING MODEL at {path}")
            if model_name == "ddpg":
                self.model = DDPG.load(path, self.env)
            elif model_name == "ppo":
                self.model = PPO2.load(path, self.env)
            elif model_name == "trpo":
                self.model = TRPO.load(path, self.env)
            elif model_name == "td3":
                self.model = TD3.load(path, self.env)
            elif model_name == 'sac':
                self.model = SAC.load(path, self.env)
        else:
            # the noise objects for DDPG
            self.f_clw_set_model(self.f_clr_instantiate_model(m=self.model_desc))

            
        self.f_clw_set_interval(self.args.iterations_checkpoint)

        if self.args.save_tf_checkpoint: 
            with self.model.graph.as_default(): 
                self.train_saver = tf.compat.v1.train.Saver()
        else: 
            self.train_saver = None 



        self.i = 0
        # Implemented in 
        # https://github.com/hill-a/stable-baselines/blob/master/stable_baselines/ddpg/ddpg.py#L807

        logging.info(f"GLOBAL SUMMARY: {self.get_global_summary()}")
        self._add_tf_logs()
        self.model.learn(total_timesteps=int(self.args.n_steps), callback=self.callback)
        logging.info(f"Training Finished after {self.n_steps} iterations saving {self.i} intermediate checkpoints")
        logging.info(f"Saving Final Model in Stable Baseline Checkpoint")
        temp=self._save_model_stable_baselines(model=self.model, cp_id="final")

        print(f"Exporting Actor from Final Model in Stable Baseline Checkpoint as Sherlock Format")
        self._save_model_sherlock(temp)

        if self.train_writer is not None: self.train_writer.close()

        plt = self._get_plot_rewards()
        now = dt.now() 
        plt.savefig(f"{self.args.plots_dir}/reward_{now.strftime('%Y%m%d_%H%M%S')}.png")
        return True
Example #15
    learning_results.to_pickle(FOLDER + "/results/CLAC_" +
                               str(mut_coef).replace(".", "p") + "_" +
                               str(agent_step) + "_0.pkl")

    (sac_model,
     learning_results) = sac_model.learn(total_timesteps=NUM_TRAINING_STEPS,
                                         log_interval=1000)
    learning_results['AgentID'] = agent_step
    learning_results.to_pickle(FOLDER + "/results/SAC_" +
                               str(ent_coef).replace(".", "p") + "_" +
                               str(agent_step) + "_0.pkl")

    for resample_step in range(1, NUM_RESAMPLES):
        # Set both environments to the same resampled values
        if (RANDOMIZATION_LEVEL == "Normal"):
            clac_env.env_method("randomize", 0)
        elif (RANDOMIZATION_LEVEL == "Extreme"):
            clac_env.env_method("randomize", 1)
        elif (RANDOMIZATION_LEVEL == "Test"):
            clac_env.env_method("randomize", -1)
        else:
            print("Error resampling unknown value: ", RANDOMIZATION_LEVEL)
            continue

        if (agent_step == 1):
            print(mut_coef, "  ", ent_coef, "  ", NUM_TRAINING_STEPS, "  ",
                  ENVIRONMENT_NAME, "  ", FOLDER, " resample step ",
                  resample_step)

        env_features = clac_env.env_method("get_features")[0]
        sac_env.env_method("set_features", env_features)
Example #16
    x = np.array(x_range)
    y = eval(formula)
    plt.plot(x, y)
    plt.show()


# The algorithms require a vectorized environment to run
env = DummyVecEnv([lambda: ProcessorEnv()])

model = PPO2(MlpPolicy, env, verbose=1, learning_rate=0.00025)
model.learn(total_timesteps=5000)
model.save("ppo2_microprocessor_4")
model = PPO2.load("ppo2_microprocessor_4")

# env = DummyVecEnv([lambda: ProcessorEnv(taskFile='data/example.xlsx')])
env = DummyVecEnv([lambda: ProcessorEnv()])

obs = env.reset()
print("^^^^^^^^^^^^^^^^^^^RESET")
# env.env_method('graphShow')
# for i in range(200):
done = False
while not done:
    env.render()
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    if done:
        print(info[0])
        writeOutputFile('out.csv', info[0]['info'])
        env.env_method('graphShow')
Example #17
    env = Monitor(env, log_dir, allow_early_resets=True)

    return env


if __name__ == '__main__':

    env = DummyVecEnv([
        lambda: env_generator(ep_len=episode_length,
                              total_sweeps=episode_length * 100,
                              beta_init_function=lambda: 1.4 * np.random.rand(
                              ) + 0.2)
    ])
    env = VecNormalize(env, norm_obs=False, norm_reward=False, training=True)

    env.env_method('set_experiment_tag', indices=[0], tag=args.tag)
    #env.env_method('init_HamiltonianGetter', indices=[0], phase='TRAIN')
    env.env_method('init_HamiltonianGetter',
                   indices=[0],
                   phase='WSC',
                   directory=args.hamiltonian_directory)
    env.env_method('set_max_ep_length',
                   indices=[0],
                   max_ep_length=episode_length)

    n_steps = 0
    best_mean_reward = -np.inf

    def callback(_locals, _globals):
        global n_steps, best_mean_reward
        if (n_steps) % 100 == 0:
Example #18
        T = env.get_attr('T')[0]
        # DDPG.load is a classmethod that returns the loaded model; calling it on an instance discards the result
        model = DDPG.load(TEST_MODEL, env=env)
        delta = DeltaHedge()
        for i in range(cfg.test_times):
            # rl
            env.set_attr("b_rl", True)
            obs = env.reset()  # every time, create a new transaction
            naked_returns.append(naked(env))
            covered_returns.append(covered(env))
            for i in range(T):
                action, _states = model.predict(obs)
                obs, rewards, done, info = env.step(action)
                # env.render()
            rl_returns.append(env.get_attr('final_reward')[0])
            env.env_method('restart')  # only trace back to the initial state
            env.set_attr("b_rl", False)
            # delta
            for i in range(T):
                action = delta.make_decision(env)
                obs, rewards, done, info = env.step(action)
                # env.render()
            delta_returns.append(env.get_attr('final_reward')[0])
        print("naked:", naked_returns)
        print("covered:", covered_returns)
        print("rl:", rl_returns)
        print("delta:", delta_returns)

    else:
        # load data
        df_train, df_test, df_rate = load_data(cfg)
Example #19
class PPO2Agent:
    def __init__(self, base_env, subproc=True, envs=64):
        self.base_env = base_env
        self.subproc = subproc
        if subproc:
            envs = multiprocessing.cpu_count() if envs is None else envs
            self.env = SubprocVecEnv([lambda: base_env for _ in range(envs)])
        else:
            self.env = DummyVecEnv([lambda: self.base_env])
        self.model = None

    def load_model(self, path, model_name):
        self.model = PPO2.load(path, self.env)
        self.env.env_method("load_normalization_info", model_name=model_name)

    def save_model(self, path="ppo2_simple_robot"):
        if self.model is None:
            raise AssertionError("Model does not exist- cannot be saved.")
        self.model.save(path)

    def new_model(self, policy=MlpPolicy, gamma=0.99, batch_size=128):
        self.model = PPO2(policy, self.env, verbose=1, gamma=gamma, n_steps=batch_size)

    def learn(self, timesteps, learning_handler, checkpoint_interval=1000, path=None, learning_rate=0.00025,
              curiosity_path=None, batch_size=128):
        curiosity = curiosity_path is not None
        if self.model is None:
            self.new_model(batch_size=batch_size)
        self.model.learning_rate = learning_rate
        if checkpoint_interval is not None:
            for checkpoint in range(int(timesteps / checkpoint_interval)):
                cb = learning_handler.get_learn_callback(checkpoint * checkpoint_interval, curiosity=curiosity,
                                                         subproc=self.subproc, batch_size=batch_size)
                self.model.learn(total_timesteps=checkpoint_interval, callback=cb, reset_num_timesteps=False)
                self.save_model(path)
                if curiosity:
                    self.base_env.curiosity_module.save_forward(curiosity_path)

                matplotlib.use('Agg')
                m = learning_handler.model_storage.get_model(learning_handler.model_name)
                learning_handler.save_plot(m['realtime_data']['plot_path'], real_time=True, curiosity=curiosity)
                learning_handler.save_plot(m['timestep_data']['plot_path'], real_time=False, curiosity=curiosity)
        else:
            cb = learning_handler.get_learn_callback(curiosity=curiosity, subproc=self.subproc)
            self.model.learn(total_timesteps=timesteps, callback=cb, reset_num_timesteps=False)
            self.save_model(path)

    def demo(self, timestep_sleep=0):
        obs = self.base_env.reset()
        while True:
            action, _states = self.model.predict(obs)
            obs, _, done, info = self.base_env.step(action, render=True)
            self.base_env.render()
            time.sleep(timestep_sleep)
            if done:
                obs = self.base_env.reset()

    def validate(self, n_episodes):
        obs = self.base_env.reset()
        ep_histories = None
        for i in range(n_episodes):
            ep_history = []
            while True:
                action, _states = self.model.predict(obs)
                obs, reward, done, info = self.base_env.step(action)
                ep_history.append(info['distance'])
                if done:
                    if ep_histories is None:
                        ep_histories = np.array([ep_history])
                    else:
                        ep_histories = np.concatenate((ep_histories, [ep_history]))
                    obs = self.base_env.reset()
                    break
        return ep_histories
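
A minimal usage sketch for the PPO2Agent wrapper above; SimpleRobotEnv is a hypothetical placeholder environment and the checkpointing/learning handler is omitted:

agent = PPO2Agent(SimpleRobotEnv(), subproc=False)  # wraps the env in a single-copy DummyVecEnv
agent.new_model(gamma=0.99, batch_size=128)         # fresh PPO2 model with the default MlpPolicy
# training normally goes through agent.learn(...) with a learning handler and checkpoint path
agent.save_model("ppo2_simple_robot")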
Example #20
0
def run_matchup(drafter1: str, drafter2: str, battler: str, games: int,
                seed: int, concurrency: int) \
        -> Tuple[Tuple[float, float], Tuple[list, list], Tuple[list, list], List[Tuple], Tuple[list, list], List[float]]:
    """
    Run the match-up between `drafter1` and `drafter2` using `battler` battler
    :param drafter1: drafter to play as first player
    :param drafter2: drafter to play as second player
    :param battler: battler to simulate the matches
    :param games: amount of matches to simulate
    :param seed: seed used to generate the matches
    :param concurrency: amount of matches executed at the same time
    :return: a tuple containing (i) a tuple containing the win rate of the
    first and second players, (ii) a tuple containing the average mana curves
    of the first and second players, (iii) a tuple containing the
    `30 * games` individual draft choices of the first and second players;
    (iv) a tuple of 3-uples containing the card alternatives presented to the
    players at each of the `games` episodes; and (v) a tuple containing the
    `games` decks built by the first and second players.
    """
    # parse the battle agent
    battler = agents.parse_battle_agent(battler)

    # initialize envs
    env = [lambda: LOCMDraftEnv(battle_agents=(battler(), battler())) for _ in range(concurrency)]

    # wrap envs in a vectorized env
    env = DummyVecEnv(env)

    for i in range(concurrency):
        # no overlap between episodes at each process
        current_seed = seed + (games // concurrency) * i
        current_seed -= 1  # resetting the env increases the seed by 1

        # set seed to env
        env.env_method('seed', current_seed, indices=[i])

    # reset the env
    env.reset()

    # initialize first player
    if drafter1.endswith('zip'):
        current_drafter = agents.RLDraftAgent(PPO2.load(drafter1))
        current_drafter.use_history = "history" in drafter1
    else:
        current_drafter = agents.parse_draft_agent(drafter1)()

    current_drafter.seed(seed)
    current_drafter.name = drafter1
    drafter1 = current_drafter

    # initialize second player
    if drafter2.endswith('zip'):
        other_drafter = agents.RLDraftAgent(PPO2.load(drafter2))
        other_drafter.use_history = "history" in drafter2
    else:
        other_drafter = agents.parse_draft_agent(drafter2)()

    other_drafter.seed(seed)
    other_drafter.name = drafter2
    drafter2 = other_drafter

    # initialize metrics
    episodes_so_far = 0
    episode_rewards = [[0.0] for _ in range(env.num_envs)]
    drafter1.mana_curve = [0 for _ in range(13)]
    drafter2.mana_curve = [0 for _ in range(13)]
    drafter1.choices = [[] for _ in range(env.num_envs)]
    drafter2.choices = [[] for _ in range(env.num_envs)]
    drafter1.decks = [[[]] for _ in range(env.num_envs)]
    drafter2.decks = [[[]] for _ in range(env.num_envs)]
    alternatives = [[] for _ in range(env.num_envs)]

    # run the episodes
    while True:
        observations = env.get_attr('state')

        # get the current agent's action for all concurrent envs
        if isinstance(current_drafter, agents.RLDraftAgent):
            all_past_choices = env.get_attr('choices')
            new_observations = []

            for i, observation in enumerate(observations):
                new_observation = encode_state_draft(
                    observation,
                    use_history=current_drafter.use_history,
                    past_choices=all_past_choices[i][observation.current_player.id]
                )

                new_observations.append(new_observation)

            actions = current_drafter.act(new_observations)
        else:
            actions = [current_drafter.act(observation)
                       for observation in observations]

        # log chosen cards into current agent's mana curve
        for i, (action, observation) in enumerate(zip(actions, observations)):
            # get chosen index
            try:
                chosen_index = action.origin
            except AttributeError:
                chosen_index = action

            # save choice
            current_drafter.choices[i].append(chosen_index)

            # get chosen card
            chosen_card = observation.current_player.hand[chosen_index]

            # increase amount of cards chosen with the chosen card's cost
            current_drafter.mana_curve[chosen_card.cost] += 1

            # add chosen card to this episode's deck
            current_drafter.decks[i][-1].append(chosen_card.id)

            # save card alternatives
            if observation.current_player.id == PlayerOrder.FIRST:
                alternatives[i].append(tuple(map(lambda c: c.id, observation.current_player.hand)))

        # perform the action and get the outcome
        _, rewards, dones, _ = env.step(actions)

        if isinstance(current_drafter, agents.RLDraftAgent):
            current_drafter.dones = dones

        # update metrics
        for i in range(env.num_envs):
            episode_rewards[i][-1] += rewards[i]

            if dones[i]:
                episode_rewards[i].append(0.0)
                current_drafter.decks[i].append([])
                other_drafter.decks[i].append([])

                episodes_so_far += 1

        # check exiting condition
        if episodes_so_far >= games:
            break

        # swap drafters
        current_drafter, other_drafter = other_drafter, current_drafter

    # normalize mana curves
    total_choices = sum(drafter1.mana_curve)
    drafter1.mana_curve = [freq / total_choices for freq in drafter1.mana_curve]
    drafter2.mana_curve = [freq / total_choices for freq in drafter2.mana_curve]

    # join all parallel rewards
    all_rewards = [reward for rewards in episode_rewards
                   for reward in rewards[:-1]]

    # join all parallel choices
    drafter1.choices = [c for choices in drafter1.choices for c in choices]
    drafter2.choices = [c for choices in drafter2.choices for c in choices]

    # join all parallel decks
    drafter1.decks = [deck for decks in drafter1.decks for deck in decks if deck]
    drafter2.decks = [deck for decks in drafter2.decks for deck in decks if deck]

    # join all parallel alternatives
    alternatives = [turn for env_alts in alternatives for turn in env_alts]

    # cap any unsolicited data from additional episodes
    all_rewards = all_rewards[:games]
    drafter1.choices = drafter1.choices[:30 * games]
    drafter2.choices = drafter2.choices[:30 * games]
    drafter1.decks = drafter1.decks[:games]
    drafter2.decks = drafter2.decks[:games]
    alternatives = alternatives[:30 * games]

    # convert the list of rewards to the first player's win rate
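    # (assumes episode rewards in [-1, 1], e.g. -1 for a loss and +1 for a win, so mean reward r maps to (r + 1) / 2 as a percentage)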
    win_rate = (mean(all_rewards) + 1) * 50

    return (win_rate, 100 - win_rate), \
        (drafter1.mana_curve, drafter2.mana_curve), \
        (drafter1.choices, drafter2.choices), \
        alternatives, \
        (drafter1.decks, drafter2.decks), \
        all_rewards
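
A minimal sketch of consuming the six-element tuple returned by run_matchup above; the drafter and battler identifiers are placeholders, not names the agent parser is known to accept:

win_rates, mana_curves, choices, alternatives, decks, rewards = run_matchup(
    drafter1='random', drafter2='my-drafter.zip', battler='greedy',
    games=100, seed=42, concurrency=4)
print("first player win rate: %.1f%%" % win_rates[0])
print("second player win rate: %.1f%%" % win_rates[1])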
Example #21
0
def test_agent(agent_step):
    now = time.time()
    for coef_index in range(len(CLAC_COEFS)):

        mut_coef = CLAC_COEFS[coef_index]
        ent_coef = SAC_COEFS[coef_index]
        training_timestep = 0

        clac_env = gym.make(ENVIRONMENT_NAME)
        clac_env = DummyVecEnv([lambda: clac_env])
        clac_model = CLAC(CLAC_MlpPolicy, clac_env, mut_inf_coef=mut_coef, verbose=1)

        sac_env = gym.make(ENVIRONMENT_NAME)
        sac_env = DummyVecEnv([lambda: sac_env])

        sac_model = SAC(MlpPolicy, sac_env, ent_coef=ent_coef, verbose=1)

        mirl_env = gym.make(ENVIRONMENT_NAME)
        mirl_env = DummyVecEnv([lambda: mirl_env])

        mirl_model = CLAC(CLAC_MlpPolicy, mirl_env, mut_inf_coef=mut_coef, coef_schedule=3.3e-3, verbose=1)
        
        for resample_step in range(0, NUM_RESAMPLES):
            features = pd.DataFrame()

            if(agent_step == 1):
                print(mut_coef,  "  ",  ent_coef, "  ", NUM_TRAINING_STEPS, "  ",  ENVIRONMENT_NAME, "  ", FOLDER, " ", resample_step)

            (clac_model, learning_results) = clac_model.learn(total_timesteps=NUM_TRAINING_STEPS, log_interval=1000)
            (sac_model, learning_results) = sac_model.learn(total_timesteps=NUM_TRAINING_STEPS, log_interval=1000)
            (mirl_model, learning_results) = mirl_model.learn(total_timesteps=NUM_TRAINING_STEPS, log_interval=1000)

            # Save models 
            clac_model.save(FOLDER + "/Training/models/CLAC_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step))
            sac_model.save(FOLDER + "/Training/models/SAC_" + str(ent_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step))
            mirl_model.save(FOLDER + "/Training/models/MIRL_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step))

            training_timestep += NUM_TRAINING_STEPS

            # Test Normal 
            eval_results = eval_model(clac_model, clac_env, "CLAC", mut_coef, NUM_TESTING_STEPS, training_timestep, agent_step, resample_step, 0)
            eval_results.to_pickle(FOLDER + "/Training/results/CLAC_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl")

            eval_results = eval_model(sac_model, sac_env, "SAC", ent_coef, NUM_TESTING_STEPS, training_timestep, agent_step, resample_step, 0)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Training/results/SAC_" + str(ent_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl")

            eval_results = eval_model(mirl_model, mirl_env, "MIRL", mut_coef, NUM_TESTING_STEPS, training_timestep, agent_step, resample_step, 0)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Training/results/MIRL_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl")

            # Test generalization 
            eval_results = eval_model(clac_model, clac_env, "CLAC", mut_coef, NUM_TESTING_STEPS, training_timestep, agent_step, resample_step, 1)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Generalization/results/CLAC_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl")

            eval_results = eval_model(sac_model, sac_env, "SAC", ent_coef, NUM_TESTING_STEPS, training_timestep, agent_step, resample_step, 1)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Generalization/results/SAC_" + str(ent_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl")

            eval_results = eval_model(mirl_model, mirl_env, "MIRL", mut_coef, NUM_TESTING_STEPS, training_timestep, agent_step, resample_step, 1)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Generalization/results/MIRL_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl")

            # Test generalization Extreme
            eval_results = eval_model(clac_model, clac_env, "CLAC", mut_coef, NUM_TESTING_STEPS, training_timestep, agent_step, resample_step, 2)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Extreme/results/CLAC_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl")

            eval_results = eval_model(sac_model, sac_env, "SAC", ent_coef, NUM_TESTING_STEPS, training_timestep, agent_step, resample_step, 2)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Extreme/results/SAC_" + str(ent_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl")

            eval_results = eval_model(mirl_model, mirl_env, "MIRL", mut_coef, NUM_TESTING_STEPS, training_timestep, agent_step, resample_step, 2)
            eval_results['AgentID'] = agent_step
            eval_results.to_pickle(FOLDER + "/Extreme/results/MIRL_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl")

            clac_env.env_method("reset_features")
            sac_env.env_method("reset_features")
            mirl_env.env_method("reset_features")
        
        del sac_model
        del sac_env

        del clac_model
        del clac_env
        
        del mirl_model
        del mirl_env

    later = time.time()
    difference = int(later - now)
    print("Tested Agent Time: ", difference)
Example #22
0
def baseline(num_hamiltonians=20, num_trials=10):
    env = DummyVecEnv([
        lambda: env_generator(ep_len=args.episode_length,
                              total_sweeps=args.total_sweeps)
    ])

    env.env_method('set_experiment_tag', indices=[0], tag=args.tag)
    env.env_method('set_max_ep_length',
                   indices=[0],
                   max_ep_length=args.episode_length)
    env.env_method('init_HamiltonianGetter',
                   indices=[0],
                   phase='TEST',
                   directory=args.hamiltonian_directory)

    env.env_method("init_HamiltonianSuccessRecorder",
                   indices=[0],
                   num_hamiltonians=num_hamiltonians,
                   num_trials=num_trials)
    env.env_method("set_static_Hamiltonian_by_ID", indices=[0], ID=0)

    obs = env.reset()

    test_ep = -1
    for ham in range(num_hamiltonians):
        env.env_method("set_static_Hamiltonian_by_ID", indices=[0], ID=ham)
        for trial in range(num_trials):
            test_ep += 1
            state = None
            done = [False for _ in range(env.num_envs)]
            step = -1
            print("beta=", env.env_method('get_current_beta', indices=[0])[0])
            while True:
                step += 1
                obs, reward, d, _ = env.step(np.array([dbetas[step]]))
                if d:
                    break
        env.env_method("hsr_write")
Example #23
0
if PHASE == 'VALUE_ANALYSIS':
    model_args["learning_rate"] = 0.0000000000
    model_args["noptepochs"] = 1

if __name__ == '__main__':

    env = DummyVecEnv([
        lambda: env_generator(ep_len=episode_length,
                              total_sweeps=episode_length * 1,
                              beta_init_function=lambda: 2 * np.random.rand() +
                              0.333)
    ])
    env = VecNormalize(env, norm_obs=False, norm_reward=False, training=True)

    env.env_method('set_experiment_tag', indices=[0], tag=args.tag)
    if PHASE == 'VALUE_ANALYSIS' or PHASE == 'ISING':
        env.env_method('init_HamiltonianGetter',
                       indices=[0],
                       phase=PHASE,
                       directory=args.hamiltonian_directory)
    elif PHASE == 'TRAIN':
        env.env_method('init_HamiltonianGetter', indices=[0], phase='TRAIN')
    env.env_method('set_max_ep_length',
                   indices=[0],
                   max_ep_length=episode_length)

    if args.destructive:
        env.env_method('set_destructive_observation_on', indices=[0])

    n_steps = 0
Example #24
0
class LagrangianCMDPSolver(CMDPSolverBase):
    """
    Class to solve CMDP with Lagrangian method.

    The method we use is based on "Batch Policy Learning under Constraints"
    by Le et al. The constrained MDP is addressed by solving a sequence of
    unconstrained MDPs. In particular, we alternate between a best response (BR)
    algorithm, which solves the unconstrained problem obtained by fixing the
    value of the Lagrange multipliers, and an online optimization algorithm,
    which sets the multipliers based on the performance of the BR.
    """

    # TODO: Estimate the duality gap for stopping
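
    # Schematically, learn() alternates between:
    #   1. BR: train a policy on the unconstrained Lagrangian MDP with the multipliers held fixed
    #   2. online: update the multipliers from the constraint values observed during BR training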

    def __init__(self, env, br_algo, online_algo, br_kwargs=None, online_kwargs=None, _init_setup_model=True,
                 lagrangian_rounds=10, log_training=False,
                 br_uses_vec_env=False, n_envs=1, use_sub_proc_env=True):
        """
        
        Parameters
        ----------
        env: src.envs.CMDP or None
        br_algo: stable baselines algorithm class
            Best response algorithm
        online_algo: src.online
            Online optimization algorithm class
        br_kwargs: dict
            Keyword arguments for best response
        online_kwargs: dict
            Keyword arguments for online opt algorithm
        _init_setup_model: bool
            Whether to set up the br and online upon initialization
        lagrangian_rounds: int
            Number of times we alternate between BR and online
        log_training: bool
            Whether to log episode rewards and constraints during training
        br_uses_vec_env: bool
            Whether the BR algorithm needs a vectorized environment
        n_envs: int
            Number of environments to use (only relevant in the vectorized case)
        use_sub_proc_env: bool
            Whether to use subprocesses for the vectorized env (otherwise a
            dummy vec env is used)
        """
        self.br_algo = br_algo
        self.online_algo = online_algo
            
        self.br_kwargs = {} if br_kwargs is None else br_kwargs
        online_kwargs = {} if online_kwargs is None else online_kwargs
        self.online_kwargs = online_kwargs.copy()
        
        # Initialize placeholders to fill when setting the environment and 
        # the model
        self.br = None
        self.online = None
        self.unconstrainedMDP = None  # The MDP resulting from the Lagrangian of the CMDP
        
        self._env = None
        self.observation_space = None
        self.action_space = None
        self.env_generator = None
        self.lagrangian_rounds = lagrangian_rounds
        self._log_training = log_training
        self.training_rewards = None
        self.training_constraints = None
        
        # Vectorized environment arguments        
        self.br_uses_vec_env = br_uses_vec_env
        self.use_sub_proc_env = use_sub_proc_env
        self.n_envs = n_envs

        self.set_env(env)

        if _init_setup_model:
            self.setup_model()

    def set_unconstrainedMDP(self):
        """
        Set up the unconstrained Lagrangian MDP.

        It can be set up either as a normal environment, a dummy vectorized
        environment or a multiprocessing vectorized environment.
        """
        assert self.online is not None, 'Need a value for Lagrange ' \
                                        'multipliers to initialize the ' \
                                        'unconstrained MDP'

        if self.br_uses_vec_env:
            # The function that generates the Lagrangian environment needs to
            # be outside the class to avoid pickling errors with
            # multiprocessing
            lagrangian_env = partial(get_lagrangian_env,
                                     cenv=None,  # passing _env here is not necessary and slows down serialization a lot
                                     w=self.online.w,
                                     cenv_gen=self.env_generator)
            assert self.env_generator is not None, \
                'Environment generator is necessary for vectorized env'

            # With subprocesses for env
            if self.use_sub_proc_env:
                self.unconstrainedMDP = SubprocVecEnv(
                        [lagrangian_env for _ in range(self.n_envs)])

            # With dummy vec env
            else:
                self.unconstrainedMDP = DummyVecEnv(
                    [lagrangian_env for _ in range(self.n_envs)])
        else:
            lagrangian_env = partial(get_lagrangian_env,
                                     cenv=self._env,
                                     w=self.online.w,
                                     cenv_gen=self.env_generator)
            self.unconstrainedMDP = lagrangian_env()

    def _initialize_online(self):
        if self._env is not None:
            d = self._env.n_constraints + 1
            self.online_kwargs.update({'d': d})
            self.online = self.online_algo(**self.online_kwargs)
        else:
            print('Skipping online initialization since there is no env')

    def update_online(self, keep_multipliers=False):
        """
        Update online optimization algorithm.
        """
        if self.online is not None and keep_multipliers and \
                self._env.n_constraints + 1 == len(self.online.w):
            pass
        else:
            self._initialize_online()

    def setup_model(self):
        """
        Set best response.
        """
        if self.unconstrainedMDP is None:
            self.br = None
        else:
            br_kwargs = self.br_kwargs.copy()
            br_kwargs.update({'env': self.unconstrainedMDP})
            self.br = self.br_algo(**br_kwargs)


    def _setup_learn(self, seed):
        """
        Check the environment and set the global random seed.

        Parameters
        ----------
        seed: int
            The seed value
        """
        if self._env is None:
            raise ValueError("Error: cannot train the model without a valid environment, please set an environment with"
                             "set_env(self, env) method.")
        if seed is not None:
            set_global_seeds(seed)

    def learn(self, total_timesteps, seed=None, log=False):
        """
        Solve the CMDP alternating BR and online algorithm.

        Parameters
        ----------
        total_timesteps: int
            Total number of timesteps the algorithm is run for. Each
            Lagrangian round (i.e. one alternation of BR and online) is run
            for total_timesteps / self.lagrangian_rounds timesteps.
        seed: int or None
            The random seed
        log: Bool
            Print to screen some statistics about the BR training.

        Returns
        -------
        R: float
            Return when evaluating the policy learned by BR in last
            Lagrangian round
        G: np.ndarray
            Constraint when evaluating the policy learned by BR in last
            Lagrangian round
        w: np.ndarray
            Lagrange multipliers
        """

        self._setup_learn(seed)

        if total_timesteps < self.lagrangian_rounds:
            raise ValueError("There should be more time steps than Lagrangian rounds")

        # Number of timesteps per Lagrangian round
        br_time_steps = np.full(self.lagrangian_rounds, int(total_timesteps / self.lagrangian_rounds))
        br_time_steps[-1] += np.mod(total_timesteps, self.lagrangian_rounds)
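        # e.g. total_timesteps=103 with lagrangian_rounds=10 gives
        # br_time_steps = [10, 10, 10, 10, 10, 10, 10, 10, 10, 13] (the remainder goes to the last round)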

        # Alternate between br and online
        for ts in br_time_steps:

            # Reset the monitor that tracks the performance of BR on the
            # unconstrained Lagrangian MDP (constraint violation is also
            # tracked)
            if self.br_uses_vec_env:
                self.unconstrainedMDP.env_method('reset_monitor')
            else:
                self.unconstrainedMDP.reset_monitor()
            self.br._init_num_timesteps()  # Reset exploration schedule

            # Train BR on unconstrained MDP
            if log:
                self.br.learn(ts, log_interval=ts)
            else:
                self.br.learn(ts, log_interval=np.inf)

            # Get training performance
            if self.br_uses_vec_env:
                # Get reward and constraints from all envs
                r_tmp = self.unconstrainedMDP.env_method(
                    'get_episode_rewards')
                g_tmp = self.unconstrainedMDP.env_method(
                    'get_episode_constraints')
                current_rewards = np.concatenate(r_tmp)
                current_constraints = np.concatenate(g_tmp)
            else:
                current_rewards = \
                    self.unconstrainedMDP.get_episode_rewards()
                current_constraints = \
                    self.unconstrainedMDP.get_episode_constraints()

            R = np.mean(current_rewards)
            G = np.mean(current_constraints, axis=0)

            # Log info about training
            if self._log_training:
                if self.training_rewards is None:
                    self.training_rewards = np.copy(current_rewards)
                else:
                    self.training_rewards = np.hstack((
                        self.training_rewards, current_rewards))
                # self.training_rewards.append(list(current_rewards))
                if self.training_constraints is None:
                    self.training_constraints = np.copy(current_constraints)
                else:
                    self.training_constraints = np.vstack((
                        self.training_constraints, current_constraints))

            # Evaluating performance separately may be necessary for
            # off-policy methods, where the deployed policy differs from the
            # one that collects data (in that case, it would make sense to
            # adjust the multipliers according to the optimized policy rather
            # than the exploratory one).
            # R, G = self.evaluate_performance(int(0.2 * ts), min_episodes=5)


            # print('Evaluation r:{}\tEvaluation g {}'.format(R, G))

            # Online algorithm updates multipliers based on BR performance
            self.online.step(-np.append(G, 0))

            # Set new multipliers
            if self.br_uses_vec_env:
                self.unconstrainedMDP.set_attr('lam', self.online.w[:-1])
            else:
                self.unconstrainedMDP.lam = self.online.w[:-1]

        return R, G, self.online.w

    def predict(self, observation, state=None, mask=None, deterministic=True):
        """
        Get the best response action from an observation
        """
        if self.br is not None:
            return self.br.predict(observation, state, mask, deterministic)
        else:
            raise ValueError('Need a valid environment to setup learner and predict its action')

    def action_probability(self, observation, state=None, mask=None, actions=None, logp=False):
        if self.br is not None:
            return self.br.action_probability(observation, state, mask, actions, logp)
        else:
            raise ValueError('Need a valid environment to setup learner and predict its action probabilities')

    def evaluate_performance(self, min_steps, min_episodes):
        """
        Deploy policy learned by BR to evaluate its performance in terms of
        return and constraint violation.

        Parameters
        ----------
        min_steps: int
            Minimum number of steps that we run the environment for
        min_episodes: int
            Minimum number of episodes

        Returns
        -------
        R: float
            Average return across episodes
        G: np.ndarray
            Average constraint value across episodes

        """
        if self.unconstrainedMDP is None:
            raise ValueError('Cannot reset monitor without a valid environment')

        n_episodes = 0
        n_steps = 0
        max_steps = min_steps * 5  # timeout to avoid running indefinitely

        # TODO: If we move to subproc env, we should aim to use the
        #  vectorized env properly here

        if self.br_uses_vec_env:
            # This is equivalent to the non-vectorized case since we operate
            # only on one env. However, we still need to use the vectorized
            # env interface to access the individual attributes and methods.

            # Reset monitor and env
            self.unconstrainedMDP.env_method('reset_monitor')
            obs = self.unconstrainedMDP.env_method('reset', indices=0)[0]

            # Run env
            while (n_episodes < min_episodes or n_steps < min_steps) and not n_steps > max_steps:
                action, _ = self.br.predict(obs, deterministic=True)
                obs, reward, done, info = self.unconstrainedMDP.env_method(
                    'step', action, indices=0)[0]

                if done:
                    n_episodes += 1
                    obs = self.unconstrainedMDP.env_method('reset',
                                                           indices=0)[0]
                n_steps += 1

            # Compute return and constraint
            R = np.mean(self.unconstrainedMDP.env_method(
                'get_episode_rewards', indices=0)[0])
            G = np.mean(self.unconstrainedMDP.env_method(
                'get_episode_constraints', indices=0)[0], axis=0)
        else:
            # Reset monitor and env
            self.unconstrainedMDP.reset_monitor()
            obs = self.unconstrainedMDP.reset()

            # Run env
            while (n_episodes < min_episodes or n_steps < min_steps) and not n_steps > max_steps:
                action, _ = self.br.predict(obs, deterministic=True)
                obs, reward, done, info = self.unconstrainedMDP.step(action)

                if done:
                    n_episodes += 1
                    obs = self.unconstrainedMDP.reset()
                n_steps += 1

            # Compute return and constraint
            R = np.mean(self.unconstrainedMDP.get_episode_rewards())
            G = np.mean(self.unconstrainedMDP.get_episode_constraints(), axis=0)

        return R, G

    def set_env(self, env, keep_multipliers=False, reset_br=False):
        """
        Set a new environment.

        Parameters
        ----------
        env: src.envs.CMDP or, if br_uses_vec_env is True, a callable
            returning one
        keep_multipliers: bool
        reset_br: bool
        """
        # Clean up resources if vectorized env already exists

        if isinstance(self.unconstrainedMDP, (DummyVecEnv, SubprocVecEnv)):
            self.unconstrainedMDP.close()

        # For vectorized environment we need an environment generating
        # function, otherwise we can simply set the env
        if self.br_uses_vec_env:
            if env is not None:
                assert callable(env), 'An environment-generating callable is ' \
                                      'necessary for algorithms requiring a ' \
                                      'vectorized environment'

                # If necessary, this extra copy of the env can be removed.
                # Need to check all the places where _env is accessed and
                # modify them.
                super().set_env(env())
                self.env_generator = env
        else:
            super().set_env(env)
            self.env_generator = None  # Not needed in non-vectorized case

        if self.get_env() is not None:
            self.update_online(keep_multipliers)
            self.set_unconstrainedMDP()
            if reset_br or self.br is None:
                self.setup_model()
            self.br.set_env(self.unconstrainedMDP)

        self.training_rewards = None
        self.training_constraints = None

    def get_env(self):
        return super().get_env()

    def set_multipliers(self, w):
        if self.online is not None:
            if len(w) != len(self.online.w):
                raise ValueError('Multipliers must have the same length. Old ones have length {}, while new ones have '
                                 'length {}'.format(len(self.online.w), len(w)))
            else:
                self.online.w = w
        else:
            warnings.warn('There is no online algorithm to set the multipliers for')

    def get_multipliers(self):
        return self.online.w

    def get_br_params(self):
        return self.br.get_parameters()

    def set_br_params(self, params):
        self.br.load_parameters(params)

    def get_params(self):
        params = self.get_br_params()
        multipliers = self.get_multipliers()
        params.update({'multipliers': multipliers})
        return params

    def set_params(self, params):
        multipliers = params['multipliers']
        self.set_multipliers(multipliers)

        del params['multipliers']
        self.set_br_params(params)

    def get_training_performance(self):
        if not self._log_training:
            warnings.warn('Log training is set to False and no data was logged')

        return self.training_rewards, self.training_constraints

    @property
    def log_training(self):
        return self._log_training

    @log_training.setter
    def log_training(self, value):
        self._log_training = bool(value)
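
A minimal, heavily hedged construction sketch for the solver above. make_cmdp_env and ExponentiatedGradient are assumed placeholders (a src.envs.CMDP factory and a src.online optimizer class, respectively), and DQN merely stands in for any stable-baselines best-response algorithm:

from stable_baselines import DQN

solver = LagrangianCMDPSolver(env=make_cmdp_env(),                  # placeholder CMDP instance
                              br_algo=DQN,                          # any stable-baselines algorithm class
                              online_algo=ExponentiatedGradient,    # placeholder online optimizer class
                              br_kwargs={'policy': 'MlpPolicy', 'verbose': 0},
                              lagrangian_rounds=10,
                              log_training=True)
R, G, w = solver.learn(total_timesteps=100000, seed=0)
print('return:', R, 'constraints:', G, 'multipliers:', w)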