Example #1
from examples.utils.utils import get_policy

tensorboard_folder = './tensorboard/Snake/base/'
model_folder = './models/Snake/base/'
if not os.path.isdir(tensorboard_folder):
    os.makedirs(tensorboard_folder)
if not os.path.isdir(model_folder):
    os.makedirs(model_folder)

policy = ''
model_tag = ''
if len(sys.argv) > 1:
    policy = sys.argv[1]
    model_tag = '_' + sys.argv[1]

env = DummyVecEnv([lambda: BaseEnv(10, 10)])

model = PPO2(get_policy(policy),
             env,
             verbose=0,
             nminibatches=1,
             tensorboard_log=tensorboard_folder)
model.learn(total_timesteps=10000000, tb_log_name='PPO2' + model_tag)

model.save(model_folder + "PPO2" + model_tag)
del model
model = PPO2.load(model_folder + "PPO2" + model_tag)

done = False
states = None
obs = env.reset()
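# The snippet ends here; a minimal rollout sketch showing how the reloaded model is
# typically evaluated (an assumed continuation, not part of the original example):
while not done:
    action, states = model.predict(obs, state=states)
    obs, reward, done, info = env.step(action)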
Example #2
    # Plot cumulative reward
    with open(os.path.join(log_dir, "monitor.csv"), 'rt') as fh:
        firstline = fh.readline()
        assert firstline[0] == '#'
        df = pd.read_csv(fh, index_col=None)['r']
    df.rolling(window=1000).mean().plot()
    plt.show()
    return model


if __name__ == '__main__':
    env = ConnectFourGym(agent2="random")
    log_dir = "ppo/"
    os.makedirs(log_dir, exist_ok=True)

    # Logging progress
    monitor_env = Monitor(env, log_dir, allow_early_resets=True)

    # Create a vectorized environment
    vec_env = DummyVecEnv([lambda: monitor_env])

    # Initialize agent
    model = get_model(vec_env)

    # Train agent
    model = train_model(model)

    env_game = make("connectx")
    env_game.run([agent1, "random"])
    get_win_percentages(agent1=agent1, agent2="random")
Example #3
    # Load the learning parameters from a file.
    param_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                             'param_files')
    if args.param_file is None:
        default_path = os.path.join(param_dir, 'default_params.json')
        with open(default_path) as f:
            params = commentjson.load(f)[args.default_name]
    else:
        param_file = os.path.join(param_dir, args.param_file)
        with open(param_file) as f:
            params = commentjson.load(f)

    # Visualize.
    env_cls = globals()[params['env']]
    env = env_cls(**params['env_options'])
    vec_env = DummyVecEnv([lambda: env])

    # Collect the info keywords.
    if len(args.info_keywords):
        info_keywords = args.info_keywords.split(',')
    else:
        info_keywords = []

    # Report the data over a number of random initializations.
    iters = 5
    for i in range(iters):
        print('Iteration: {}'.format(i))

        # Create a random environment.
        if params['alg'] == 'PPO2':
            model = PPO2(params['policy_type'], vec_env,
Example #4
import gym
import numpy as np

from stable_baselines.sac.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import SAC

import gym_ur5_gripper

import ray
ray.init()

env = gym.make('UR5Gripper-v0')
env.render('human')
env = DummyVecEnv([lambda: env])

model = SAC(MlpPolicy, env, verbose=1)


@ray.remote
def sac_learn():
    model.learn(total_timesteps=500000, log_interval=100)


sac_learn.remote()
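# Note (added): .remote() schedules the task asynchronously and returns an object ref
# immediately; the training runs in a Ray worker on its own copy of `model`, so the
# save below should normally wait on the result, e.g. ray.get(sac_learn.remote()).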
model.save("sac_ur5_gripper")

del model  # remove to demonstrate saving and loading

model = SAC.load("sac_ur5_gripper")
Example #5
from stable_baselines.common.vec_env import DummyVecEnv, SubprocVecEnv

from stable_baselines import DQN

from absl import flags

FLAGS = flags.FLAGS
FLAGS([''])

name = "dqn_mlp_std_simple"
learn_type = 'DQN'
start_value = 0

# create vectorized environment
env = DummyVecEnv([lambda: CustomAgent(learn_type=learn_type)])

model = DQN(MlpPolicy,
            env,
            learning_rate=0.3,
            exploration_fraction=0.2,
            double_q=True,
            verbose=0,
            tensorboard_log="gym_ouput/" + name + "/log/")

model.setup_model()

if start_value > 0:
    try:
        model = DQN.load("gym_ouput/" + name + "/it" + str(start_value + 1), env=env)
        print("\n\nOBS! this is not the latest NN load point\n\n")
Example #6
    x = x[len(x) - len(y):]

    fig = plt.figure(title)
    plt.plot(x, y)
    plt.xlabel('Number of Timesteps')
    plt.ylabel('Rewards')
    plt.title(title + " Learning Curve Smoothed")
    plt.show()


if __name__ == "__main__":
    rospy.init_node('drone_gym')
    env_id = 'Crazyflie-v0'
    log_dir = 'models/hover/empty_world_small/finalVec'

    env = DummyVecEnv([lambda: gym.make(env_id)])
    # Automatically normalize the input features and reward
    env = VecNormalize(env, norm_obs=True, norm_reward=True)

    # # Save best model every n steps and monitors performance
    # save_best_callback = SaveOnBestTrainingRewardCallback(check_freq=5, log_dir=log_dir)
    # # Save model every n steps
    # checkpoint_callback = CheckpointCallback(save_freq=5, save_path='./' + log_dir, name_prefix='ppo2')

    # Train from scratch
    model = PPO2(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=80000)
    # model.learn(total_timesteps=20, callback=[save_best_callback, checkpoint_callback])

    # Don't forget to save the VecNormalize statistics when saving the agent
    model.save(log_dir + "/ppo2_final")
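    # The normalization statistics themselves are not persisted above; a minimal,
    # version-dependent sketch (newer stable-baselines releases expose VecNormalize.save,
    # older ones save_running_average):
    env.save(log_dir + "/vecnormalize.pkl")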
Example #7
def create_test_env(env_id, n_envs=1, is_atari=False,
                    stats_path=None, seed=0,
                    log_dir='', should_render=True, hyperparams=None):
    """
    Create environment for testing a trained agent

    :param env_id: (str)
    :param n_envs: (int) number of processes
    :param is_atari: (bool)
    :param stats_path: (str) path to folder containing saved running averages
    :param seed: (int) Seed for random number generator
    :param log_dir: (str) Where to log rewards
    :param should_render: (bool) For Pybullet env, display the GUI
    :param env_wrapper: (type) A subclass of gym.Wrapper to wrap the original
                        env with
    :param hyperparams: (dict) Additional hyperparams (ex: n_stack)
    :return: (gym.Env)
    """
    # HACK to save logs
    if log_dir is not None:
        os.environ["OPENAI_LOG_FORMAT"] = 'csv'
        os.environ["OPENAI_LOGDIR"] = os.path.abspath(log_dir)
        os.makedirs(log_dir, exist_ok=True)
        logger.configure()

    if hyperparams is None:
        hyperparams = {}

    # Create the environment and wrap it if necessary
    env_wrapper = get_wrapper_class(hyperparams)
    if 'env_wrapper' in hyperparams.keys():
        del hyperparams['env_wrapper']

    if is_atari:
        print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    elif n_envs > 1:
        # start_method='spawn' for thread safety
        env = SubprocVecEnv([make_env(env_id, i, seed, log_dir, wrapper_class=env_wrapper) for i in range(n_envs)])
    # Pybullet envs do not follow the gym.render() interface
    elif "Bullet" in env_id:
        # HACK: force SubprocVecEnv for Bullet env
        env = SubprocVecEnv([make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper)])
    else:
        env = DummyVecEnv([make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper)])

    # Load saved stats for normalizing input and rewards
    # And optionally stack frames
    if stats_path is not None:
        if hyperparams['normalize']:
            print("Loading running average")
            print("with params: {}".format(hyperparams['normalize_kwargs']))
            env = VecNormalize(env, training=False, **hyperparams['normalize_kwargs'])

            if os.path.exists(os.path.join(stats_path, 'vecnormalize.pkl')):
                env = VecNormalize.load(os.path.join(stats_path, 'vecnormalize.pkl'), env)
                # Deactivate training and reward normalization
                env.training = False
                env.norm_reward = False
            else:
                # Legacy:
                env.load_running_average(stats_path)

        n_stack = hyperparams.get('frame_stack', 0)
        if n_stack > 0:
            print("Stacking {} frames".format(n_stack))
            env = VecFrameStack(env, n_stack)
    return env
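
# A possible usage sketch for the helper above (assumed, not part of the original
# snippet; relies on the same zoo utilities imported by the original file):
test_env = create_test_env('CartPole-v1', n_envs=1, is_atari=False,
                           stats_path=None, seed=0, log_dir='logs/enjoy',
                           should_render=False, hyperparams={})
obs = test_env.reset()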
Example #8
    def test_discrete_twozone_engine_with_delay(self):
        """Does the DiscreteTwoZoneEngine with injection delay work as expected?"""

        # Initialize engine
        eng = engines.DiscreteTwoZoneEngine(
            nsteps=101,
            fuel="PRF100",
            rxnmech="llnl_gasoline_surrogate_323.xml",
            mdot=0.1,
            max_minj=5e-5,
            injection_delay=0.0025,
            ename="Isooctane_MBT_DI_50C_Summ.xlsx",
            reward=rw.Reward(negative_reward=-101.0),
        )
        env = DummyVecEnv([lambda: eng])
        variables = eng.observables + eng.internals + eng.histories
        df = pd.DataFrame(
            columns=list(
                dict.fromkeys(
                    variables
                    + eng.action.actions
                    + ["rewards"]
                    + eng.reward.get_rewards()
                )
            )
        )

        # Evaluate a dummy agent that injects at a fixed time
        t0 = time.time()
        done = False
        cnt = 0
        obs = env.reset()
        df.loc[cnt, variables] = [eng.current_state[k] for k in variables]
        df.loc[cnt, eng.action.actions] = 0
        rwd = list(
            eng.reward.compute(eng.current_state, eng.nsteps, False, False).values()
        )
        df.loc[cnt, eng.reward.get_rewards()] = rwd
        df.loc[cnt, ["rewards"]] = [sum(rwd)]

        while not done:
            cnt += 1
            # Agent tries to inject thrice, but is not allowed the second time
            action = (
                [1]
                if (eng.current_state["ca"] == -10)
                or eng.current_state["ca"] == 10
                or eng.current_state["ca"] == 16
                else [0]
            )
            obs, reward, done, info = env.step(action)
            df.loc[cnt, variables] = [info[0]["current_state"][k] for k in variables]
            df.loc[cnt, eng.action.actions] = eng.action.current
            df.loc[cnt, ["rewards"]] = reward
            df.loc[cnt, eng.reward.get_rewards()] = list(info[0]["rewards"].values())

        for rwd in eng.reward.get_rewards() + ["rewards"]:
            df[f"cumulative_{rwd}"] = np.cumsum(df[rwd])

        elapsed = time.time() - t0

        utilities.plot_df(env, df, idx=5, name="DiscreteTwoZone (delay)")

        # Test
        npt.assert_allclose(np.linalg.norm(df.V), 0.002205916821815495)
        npt.assert_allclose(np.linalg.norm(df.p), 35142241.61422163)
        npt.assert_allclose(np.linalg.norm(df["T"]), 20971.07323643)
        npt.assert_allclose(np.linalg.norm(df.rewards), 153.11736491)
        npt.assert_allclose(np.linalg.norm(df.mdot), 0.14142136)
        print(f"Wall time for DiscreteTwoZoneEngine with delay = {elapsed} seconds")
Example #9
    def test_reactor_engine_with_complex_reward(self):
        """Does the ReactorEngine with complex reward work as expected?"""

        # Initialize engine
        reward = rw.Reward(
            names=["work", "nox", "soot"],
            norms=[1.0, 5e-8, 1e-9],
            weights=[0.34, 0.33, 0.33],
            negative_reward=-100.0,
            randomize=False,
        )
        eng = engines.ReactorEngine(
            nsteps=101,
            Tinj=300.0,
            rxnmech="dodecane_lu_nox.cti",
            mdot=0.1,
            max_minj=5e-5,
            ename="Isooctane_MBT_DI_50C_Summ.xlsx",
            reward=reward,
        )
        env = DummyVecEnv([lambda: eng])
        variables = eng.observables + eng.internals + eng.histories
        df = pd.DataFrame(
            columns=list(
                dict.fromkeys(
                    variables
                    + eng.action.actions
                    + ["rewards"]
                    + eng.reward.get_rewards()
                )
            )
        )

        # Evaluate a dummy agent that injects at a fixed time
        t0 = time.time()
        done = False
        cnt = 0
        obs = env.reset()
        df.loc[cnt, variables] = [eng.current_state[k] for k in variables]
        df.loc[cnt, eng.action.actions] = 0
        rwd = list(
            eng.reward.compute(eng.current_state, eng.nsteps, False, False).values()
        )
        df.loc[cnt, eng.reward.get_rewards()] = rwd
        df.loc[cnt, ["rewards"]] = [sum(rwd)]

        while not done:
            cnt += 1
            # Agent tries to inject twice, but is not allowed the second time
            action = (
                [1]
                if (eng.current_state["ca"] == 0) or eng.current_state["ca"] == 2
                else [0]
            )
            obs, reward, done, info = env.step(action)
            df.loc[cnt, variables] = [info[0]["current_state"][k] for k in variables]
            df.loc[cnt, eng.action.actions] = eng.action.current
            df.loc[cnt, ["rewards"]] = reward
            df.loc[cnt, eng.reward.get_rewards()] = list(info[0]["rewards"].values())

        for rwd in eng.reward.get_rewards() + ["rewards"]:
            df[f"cumulative_{rwd}"] = np.cumsum(df[rwd])

        elapsed = time.time() - t0

        utilities.plot_df(env, df, idx=6, name="reactor")

        # Test
        npt.assert_allclose(np.linalg.norm(df.V), 0.002205916821815495)
        npt.assert_allclose(np.linalg.norm(df.p), 34254670.52877185, rtol=1e-5)
        npt.assert_allclose(np.linalg.norm(df["T"]), 18668.46491609, rtol=1e-5)
        npt.assert_allclose(np.linalg.norm(df.rewards), 54.47632708, rtol=1e-5)
        npt.assert_allclose(np.linalg.norm(df.r_work), 53.47224436, rtol=1e-5)
        npt.assert_allclose(np.linalg.norm(df.r_nox), 14.10312665, rtol=1e-5)
        npt.assert_allclose(np.linalg.norm(df.w_work), 3.41695771, rtol=1e-5)
        npt.assert_allclose(np.linalg.norm(df.w_nox), 3.31645895, rtol=1e-5)
        npt.assert_allclose(np.linalg.norm(df.w_soot), 3.31645895, rtol=1e-5)
        npt.assert_allclose(np.linalg.norm(df.mdot), 0.14142135623730953)
        print(f"Wall time for ReactorEngine (complex reward) = {elapsed} seconds")
Example #10
    def single_run(self,
                   folder_path,
                   num_evals,
                   policy_kwargs=None,
                   is_baseline=False,
                   baseline_policy=None):
        # initialize cProfile
        profiler_object = cProfile.Profile()
        profiler_object.enable()

        config = configparser.ConfigParser()
        config.read('gym_config/config.ini')

        rl_time_steps = config.getint('rl', 'time_steps')
        ent_coef = config.getfloat('rl', 'ent_coef')
        n_steps = config.getint('rl', 'n_steps')
        nminibatches = config.getint('rl', 'nminibatches')
        noptepochs = config.getint('rl', 'noptepochs')
        learning_rate = config.getfloat('rl', 'learning_rate')
        time_steps = config.getint('garden', 'time_steps')
        step = config.getint('garden', 'step')
        num_plants_per_type = config.getint('garden', 'num_plants_per_type')
        num_plant_types = config.getint('garden', 'num_plant_types')
        garden_x = config.getint('garden', 'X')
        garden_y = config.getint('garden', 'Y')
        garden_z = 2 * config.getint(
            'garden', 'num_plant_types'
        ) + 1  # Z axis contains a matrix for every plant type plus one for water levels.
        sector_width = config.getint('garden', 'sector_width')
        sector_height = config.getint('garden', 'sector_height')
        action_low = config.getfloat('action', 'low')
        action_high = config.getfloat('action', 'high')
        obs_low = config.getint('obs', 'low')
        obs_high = config.getint('obs', 'high')

        env = gym.make(
            'simalphagarden-v0',
            wrapper_env=SimAlphaGardenWrapper(time_steps,
                                              garden_x,
                                              garden_y,
                                              sector_width,
                                              sector_height,
                                              num_plant_types,
                                              num_plants_per_type,
                                              step=step),
            garden_x=garden_x,
            garden_y=garden_y,
            garden_z=garden_z,
            sector_width=sector_width,
            sector_height=sector_height,
            action_low=action_low,
            action_high=action_high,
            obs_low=obs_low,
            obs_high=obs_high,
        )
        env = DummyVecEnv([lambda: env])
        # TODO: Normalize input features? VecNormalize
        env = VecCheckNan(env, raise_exception=False)

        if is_baseline:
            copyfile('gym_config/config.ini', folder_path + '/config.ini')

            # Evaluate baseline on 50 random environments of same parameters.
            self.evaluate_policy(folder_path,
                                 num_evals,
                                 env,
                                 garden_x,
                                 garden_y,
                                 sector_width,
                                 sector_height,
                                 is_baseline=True,
                                 baseline_policy=baseline_policy,
                                 step=1)

            # Graph evaluations
            self.graph_utils.graph_evaluations(folder_path, garden_x, garden_y,
                                               time_steps, step, num_evals,
                                               num_plant_types)
        else:
            pathlib.Path(folder_path + '/ppo_v2_tensorboard').mkdir(
                parents=True, exist_ok=True)
            # Instantiate the agent
            model = PPO2(CustomCnnPolicy,
                         env,
                         policy_kwargs=policy_kwargs,
                         ent_coef=ent_coef,
                         n_steps=n_steps,
                         nminibatches=nminibatches,
                         noptepochs=noptepochs,
                         learning_rate=learning_rate,
                         verbose=1,
                         tensorboard_log=folder_path + '/ppo_v2_tensorboard/')

            # model = PPO2(MlpPolicy, env, ent_coef=ent_coef, n_steps=n_steps, nminibatches=nminibatches, noptepochs=noptepochs, learning_rate=learning_rate, verbose=1, tensorboard_log=folder_path + '/ppo_v2_tensorboard/')
            # Train the agent
            model.learn(
                total_timesteps=rl_time_steps
            )  # this will crash explaining that the invalid value originated from the env

            model.save(folder_path + '/model')

            copyfile('gym_config/config.ini', folder_path + '/config.ini')

            # Evaluate model on 50 random environments of same parameters.
            self.evaluate_policy(folder_path,
                                 num_evals,
                                 env,
                                 garden_x,
                                 garden_y,
                                 sector_width,
                                 sector_height,
                                 is_baseline=False)

            # Graph evaluations
            # self.graph_utils.graph_evaluations(folder_path, garden_x, garden_y, time_steps, step, num_evals, num_plant_types)

        profiler_object.disable()

        # dump the profiler stats
        s = io.StringIO()
        ps = pstats.Stats(profiler_object, stream=s).sort_stats('cumulative')
        pathlib.Path(folder_path + '/Timings').mkdir(parents=True,
                                                     exist_ok=True)
        ps.dump_stats(folder_path + '/Timings/dump.txt')

        # convert to human readable format and close the output file when done
        with open(folder_path + '/Timings/time.txt', 'w') as out_stream:
            ps = pstats.Stats(folder_path + '/Timings/dump.txt',
                              stream=out_stream)
            ps.strip_dirs().sort_stats('cumulative').print_stats()
Example #11
def main():
    """ Prepare for trainings """
    log_dir, model_dir = prepare_dirs()

    model_name = model_dir + '/' + MODEL_NAME
    print(f'model will be saved as {model_name}')

    log_dir = log_dir + '/' + MODEL_NAME
    """ Generate & Check environment """
    env_name = ENV_NAME
    env = gym.make(env_name)
    # print(f'Observation space: {env.observation_space}')
    # print(f'Action space: {env.action_space}')
    # env = Monitor(env, log_dir, allow_early_resets=True)
    # check_env(env)
    """ Save config as pickle file """
    config = summarize_config(env)
    save_config(log_dir, config)
    """ Vectorize environment """
    num_envs = NUM_ENVS
    # NOTE: each vectorized worker needs its own environment instance; reusing the
    # single `env` created above in every lambda would share one environment.
    env = DummyVecEnv([lambda: gym.make(env_name) for _ in range(num_envs)])  # For training

    eval_env = DummyVecEnv([lambda: gym.make(env_name)])  # For evaluation
    """ Define checkpoint callback """
    checkpoint_callback = CheckpointCallback(save_freq=SAVE_FREQ,
                                             save_path=model_name,
                                             name_prefix=MODEL_NAME)
    """ Use deterministic actions for evaluation callback """
    eval_callback = EvalCallback(eval_env,
                                 best_model_save_path=model_name,
                                 log_path=log_dir,
                                 eval_freq=EVAL_FREQ,
                                 deterministic=True,
                                 render=False,
                                 n_eval_episodes=N_EVAL_EPISODES)

    print(f'Algorithm: {ALGORITHM}\n')

    if not CONTINUAL_LEARNING:
        """ Define model """
        model = define_model(env, log_dir)
    else:
        model = load_model(env, model_dir, log_dir)
    """ Evaluate model before training """
    # mean_reward, std_reward = evaluate_policy(model=model,
    #                                          env=eval_env,
    #                                          n_eval_episodes=N_EVAL_EPISODES)
    # print(f'Before training: mean reward: {mean_reward:.2f} +/- {std_reward:.2f}')
    """ Train model """
    model.learn(total_timesteps=MAX_STEPS,
                callback=[checkpoint_callback, eval_callback])
    """ Evaluate model after training """
    # mean_reward, std_reward = evaluate_policy(model=model,
    #                                          env=eval_env,
    #                                          n_eval_episodes=N_EVAL_EPISODES)
    # print(f'After training: mean reward: {mean_reward:.2f} +/- {std_reward:.2f}')
    """ Save trained model """
    model.save(model_name)
    """ Test trained model """
    obs = eval_env.reset()
    for i in range(N_EVAL_EPISODES):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = eval_env.step(action)
        eval_env.render()

    env.close()
    eval_env.close()
Example #12
def load_model(algorithm, gym_env_id):
    global best_mean_reward

    model = None

    multiprocess = False
    num_cpu = 4  # Number of processes to use in multiprocess env

    env = None

    if multiprocess:
        env = SubprocVecEnv([make_env(gym_env_id, i) for i in range(num_cpu)])
    else:
        gym_env = gym.make(gym_env_id)
        monitor_file_path = log_dir + current_time_string + "-monitor.csv"
        env = Monitor(gym_env, monitor_file_path, allow_early_resets=True)

        # vectorized environments allow to easily multiprocess training
        # we demonstrate its usefulness in the next examples
        env = DummyVecEnv([
            lambda: env
        ])  # The algorithms require a vectorized environment to run

    existing_pickle_files = utils.get_files_with_pattern(
        pickle_dir, r'(.*)' + algorithm + "-best-model.pkl")

    # Sort files in reverse alphabetical order, so that models with newer dates are chosen first.
    existing_pickle_files.sort(reverse=True)

    for file_name in existing_pickle_files:
        search = re.search(r'(.*)' + algorithm + "-best-model.pkl", file_name)

        if search:
            if algorithm == 'deepq':
                model = DQN.load(file_name,
                                 env=env,
                                 verbose=verbose_level,
                                 tensorboard_log=tensorboard_dir)
            elif algorithm == 'ppo2':
                model = PPO2.load(file_name,
                                  env=env,
                                  verbose=verbose_level,
                                  tensorboard_log=tensorboard_dir)
            else:
                raise Exception(
                    "Algorithm not supported: {}".format(algorithm))

            logger.info(
                "Loading existing pickle file '{}' for environment {} with algorithm {} and policy '{}'."
                .format(file_name, gym_env_id, algorithm, model.policy))

            logger.info(
                "Searching for previous best mean reward of algorithm '{}'...".
                format(algorithm))

            best_mean_reward = get_best_mean_reward_from_results()

            if best_mean_reward != -np.inf:
                logger.info("Found previous best mean reward: {}".format(
                    best_mean_reward))
            else:
                logger.info(
                    "Could not find previous best mean reward. Starting with: {}"
                    .format(best_mean_reward))

            return model

    logger.info(
        "No pickle was found for environment {}. Creating new model with algorithm {} and policy 'MlpPolicy'..."
        .format(gym_env_id, algorithm))

    if algorithm == 'deepq':
        model = DQN(policy='MlpPolicy',
                    env=env,
                    verbose=verbose_level,
                    tensorboard_log=tensorboard_dir)
    if algorithm == 'ppo2':
        model = PPO2(policy='MlpPolicy',
                     env=env,
                     verbose=verbose_level,
                     tensorboard_log=tensorboard_dir)

    return model
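
# A possible usage sketch (assumed; log_dir, pickle_dir, tensorboard_dir, etc. are
# module-level globals in the original file):
model = load_model('ppo2', 'CartPole-v1')
model.learn(total_timesteps=100000)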
Example #13
env_name = "AirSimNH-v0"
if env_name in gym.envs.registry.env_specs:
    del gym.envs.registry.env_specs[env_name]

# register environment
gym.register(id=env_name,
             entry_point=envs.AirSimSimplifiedActionMetaRLEnv.AirSimEnv)

from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, LstmPolicy, register_policy
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines import PPO2

env = gym.make("AirSimNH-v0")
# Vectorized environments allow to easily multiprocess training
# we demonstrate its usefulness in the next examples
env = DummyVecEnv([lambda: env
                   ])  # The algorithms require a vectorized environment to run
env = VecNormalize(env, norm_obs=True, norm_reward=True)

# Meta RL basically uses an LSTM policy
model = PPO2(MlpLstmPolicy,
             env,
             nminibatches=1,
             verbose=1,
             tensorboard_log="./ppo_tensorboard/")
#model = PPO2.load("save_models/ppo_lidar_simplified_fixed_99", env=env, verbose=1, tensorboard_log="./ppo_tensorboard/")

# Train the agent
for i in range(0, 100):
    model.learn(total_timesteps=10000, tb_log_name="lidar_metarl_")
    # Save trained model
    model.save("ppo_lidar_metarl_" + str(i))
Example #14
    n_timesteps = args.n_timesteps
else:
    n_timesteps = int(hyperparams['n_timesteps'])
del hyperparams['n_timesteps']

normalize = False
normalize_kwargs = {}
if 'normalize' in hyperparams.keys():
    normalize = hyperparams['normalize']
    if isinstance(normalize, str):
        normalize_kwargs = eval(normalize)
        normalize = True
    del hyperparams['normalize']

if not args.teleop:
    env = DummyVecEnv([make_env(args.seed, vae=vae, teleop=args.teleop)])
else:
    env = make_env(args.seed,
                   vae=vae,
                   teleop=args.teleop,
                   n_stack=hyperparams.get('frame_stack', 1))()

if normalize:
    if hyperparams.get('normalize', False) and args.algo in ['ddpg']:
        print("WARNING: normalization not supported yet for DDPG")
    else:
        print("Normalizing input and return")
        env = VecNormalize(env, **normalize_kwargs)

# Optional Frame-stacking
n_stack = 1
Example #15
def train(algo,
          df,
          model_name,
          uniqueId,
          lr=None,
          gamma=None,
          noBacktest=1,
          cutoff_date=None,
          commission=0,
          addTA='N'):
    before = np.zeros(noBacktest)
    after = np.zeros(noBacktest)
    backtest = np.zeros(noBacktest)
    train_dates = np.empty(noBacktest, dtype="datetime64[s]")
    start_test_dates = np.empty(noBacktest, dtype="datetime64[s]")
    end_test_dates = np.empty(noBacktest, dtype="datetime64[s]")
    # print(str(df.columns.tolist()))

    dates = np.unique(df.date)
    logfile = "./log/"
    print("noBacktest", noBacktest)

    # noBacktest == 1: use the cutoff date to split train/test
    cutoff_date = np.datetime64(cutoff_date)
    print("cutoff_date", cutoff_date)

    if noBacktest == 1:
        a = np.where(dates <= cutoff_date)[0]
        b = np.where(dates > cutoff_date)[0]
        s = []
        s.append((a, b))

    else:
        # ref https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html
        splits = TimeSeriesSplit(n_splits=noBacktest)
        s = splits.split(dates)

    loop = 0

    for train_date_index, test_date_index in s:
        print("loop", loop)
        train = df[df.date.isin(dates[train_date_index])]
        test = df[df.date.isin(dates[test_date_index])]
        runtimeId = uniqueId + "_" + str(loop)
        train_dates[loop] = max(train.date)
        start_test_dates[loop] = min(test.date)
        end_test_dates[loop] = max(test.date)

        n_actions = 1
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.1 * np.ones(n_actions))
        global env

        title = runtimeId + "_Train lr=" + \
            str(lr) + ", cliprange=" + str(cliprange) + ", commission=" + str(commission)
        env = DummyVecEnv([
            lambda: StockEnvPlayer(train,
                                   logfile + runtimeId + ".csv",
                                   title,
                                   seed=seed,
                                   commission=commission,
                                   addTA=addTA)
        ])

        # Automatically normalize the input features
        env = VecNormalize(env, norm_obs=True, norm_reward=False, clip_obs=10.)

        model = algo(
            MlpPolicy,
            env,
            seed=seed,
            gamma=g,
            n_steps=128,
            ent_coef=0.01,
            learning_rate=lr,
            vf_coef=0.5,
            max_grad_norm=0.5,
            lam=0.95,
            nminibatches=4,
            noptepochs=4,
            cliprange=cliprange,
            cliprange_vf=None,  # tensorboard_log="./tensorlog",
            _init_setup_model=True,
            policy_kwargs=None,
            full_tensorboard_log=False,
        )

        # Random Agent, before training
        print("\n*** Agent before learning ***")
        steps = len(np.unique(train.date))
        before[loop] = evaluate(model, num_steps=steps)

        model.learn(total_timesteps=round(steps))

        print("\n*** Evaluate the trained agent ***")
        after[loop] = evaluate(model, num_steps=steps)

        print("\n*** Run agent on unseen data ***")
        title = runtimeId + "_Test lr=" + \
            str(lr) + ", cliprange=" + str(cliprange) + ", commission=" + str(commission)
        env = DummyVecEnv([
            lambda: StockEnvPlayer(test,
                                   logfile + runtimeId + ".csv",
                                   title,
                                   seed=seed,
                                   commission=commission,
                                   addTA=addTA)
        ])
        env = VecNormalize(env, norm_obs=True, norm_reward=False, clip_obs=10.)
        steps = len(np.unique(test.date))
        backtest[loop] = evaluate(model, num_steps=steps)

        del model
        env.close()

        loop += 1

    # display result on screen
    for i in range(noBacktest):
        print("\ntrain_dates:", min(df.date), train_dates[i])
        print("test_dates:", start_test_dates[i], end_test_dates[i])
        print(
            "backtest {} : SUM reward : before | after | backtest : {: 8.2f} | {: 8.2f} | {: 8.2f}"
            .format(i, before[i], after[i], backtest[i]))

    return pd.DataFrame({
        "Model": uniqueId,
        "addTA": addTA,
        "Columns": str(df.columns.tolist()),
        "commission": commission,
        "Seed": seed,
        "cliprange": cliprange,
        "learningRate": lr,
        "gamma": g,
        "backtest  # ": np.arange(noBacktest),
        "StartTrainDate": min(train.date),
        "EndTrainDate": train_dates,
        "before": before,
        "after": after,
        "testDate": end_test_dates,
        "Sum Reward@roadTest": backtest
    })
Example #16
    def test_equilibrate_engine(self):
        """Does the EquilibrateEngine work as expected?"""

        # Initialize engine
        eng = engines.EquilibrateEngine(
            nsteps=101,
            Tinj=300.0,
            rxnmech="dodecane_lu_nox.cti",
            mdot=0.1,
            max_minj=5e-5,
            ename="Isooctane_MBT_DI_50C_Summ.xlsx",
            reward=rw.Reward(negative_reward=-0.05),
        )
        env = DummyVecEnv([lambda: eng])
        variables = eng.observables + eng.internals + eng.histories
        df = pd.DataFrame(
            columns=list(
                dict.fromkeys(
                    variables
                    + eng.action.actions
                    + ["rewards"]
                    + eng.reward.get_rewards()
                )
            )
        )

        # Evaluate a dummy agent that injects at a fixed time
        t0 = time.time()
        done = False
        cnt = 0
        obs = env.reset()
        df.loc[cnt, variables] = [eng.current_state[k] for k in variables]
        df.loc[cnt, eng.action.actions] = 0
        rwd = list(
            eng.reward.compute(eng.current_state, eng.nsteps, False, False).values()
        )
        df.loc[cnt, eng.reward.get_rewards()] = rwd
        df.loc[cnt, ["rewards"]] = [sum(rwd)]

        while not done:
            cnt += 1
            # Agent tries to inject twice, but is not allowed the second time
            action = (
                [1]
                if (eng.current_state["ca"] == -10) or eng.current_state["ca"] == 10
                else [0]
            )
            obs, reward, done, info = env.step(action)
            df.loc[cnt, variables] = [info[0]["current_state"][k] for k in variables]
            df.loc[cnt, eng.action.actions] = eng.action.current
            df.loc[cnt, ["rewards"]] = reward
            df.loc[cnt, eng.reward.get_rewards()] = list(info[0]["rewards"].values())

        for rwd in eng.reward.get_rewards() + ["rewards"]:
            df[f"cumulative_{rwd}"] = np.cumsum(df[rwd])

        elapsed = time.time() - t0

        utilities.plot_df(env, df, idx=4, name="EQ")

        # Test
        npt.assert_allclose(np.linalg.norm(df.V), 0.002205916821815495)
        npt.assert_allclose(np.linalg.norm(df.p), 35436062.48197973)
        npt.assert_allclose(np.linalg.norm(df["T"]), 12491.93935531)
        npt.assert_allclose(np.linalg.norm(df.rewards), 118.62610333)
        npt.assert_allclose(np.linalg.norm(df.mdot), 0.14142136)
        print(f"Wall time for EquilibrateEngine = {elapsed} seconds")
Example #17
    img = cv2.cvtColor(img, cv2.COLOR_RGBA2RGB)
    img = cv2.resize(img, dsize=(224, 224))
    img = np.rollaxis(img, 2, 0)
    return np.resize(img, new_shape=(1, 3, 224, 224))


# def predict(img):
#     with tf.device('/device:gpu:1'):
#         ans = roadDetection.predict(np.array([img]))
#     return ans

# random.seed(42)

env = gym.make("CarRacing-v0")
env.seed(42)
env = DummyVecEnv([lambda: env])  # DummyVecEnv expects a list of env-constructing callables
obs = env.reset()

# fileName = "50timesteps/carracing_episode_1200.pth"
dones = False

numEpisodes = 250000
inc = 1200
n_latent_var = 64

lr = 0.002
betas = (0.9, 0.999)
gamma = 0.9  # discount factor
K_epochs = 4  # update policy for K epochs
eps_clip = 0.2  # clip parameter for PPO
max_timesteps = 3000
Example #18
def run_model_stablebaseline(flow_params, args, model_params=None):
    """Run the model for num_steps if provided.

    Parameters
    ----------
    flow_params :
        Flow related parameters from config.
    args:
        Training arguments from parser.

    Returns
    -------
    stable_baselines.*
        the trained model
    """
    constructor = env_constructor(params=flow_params, version=0)()
    # The algorithms require a vectorized environment to run
    env = DummyVecEnv([lambda: constructor])

    if model_params is None:
        if args.policy == 0:
            policy = MlpPolicy
        elif args.policy == 1:
            policy = LnMlpPolicy
        else:
            warnings.warn("Invalid policy type! Policy set to MlpPolicy.")
            policy = MlpPolicy
        dueling = None if args.dueling else dict(dueling=False)

        train_model = DQN(
            policy=policy,
            env=env,
            gamma=args.gamma,
            learning_rate=args.learning_rate,
            buffer_size=args.buffer_size,
            exploration_fraction=args.exploration_fraction,
            exploration_final_eps=args.exploration_final_eps,
            exploration_initial_eps=args.exploration_initial_eps,
            train_freq=args.train_freq,
            batch_size=args.batch_size,
            double_q=args.double_q,
            learning_starts=args.learning_starts,
            target_network_update_freq=args.target_network_update_freq,
            prioritized_replay=args.prioritized_replay,
            prioritized_replay_alpha=args.prioritized_replay_alpha,
            prioritized_replay_beta0=args.prioritized_replay_beta0,
            prioritized_replay_beta_iters=args.prioritized_replay_beta_iters,
            prioritized_replay_eps=args.prioritized_replay_eps,
            param_noise=args.param_noise,
            policy_kwargs=dueling,
            verbose=args.verbose,
            tensorboard_log=args.tensorboard_log,
            full_tensorboard_log=args.full_tensorboard_log)
    else:
        train_model = DQN(
            policy=model_params["policy"],
            env=env,
            gamma=model_params["gamma"],
            learning_rate=model_params["learning_rate"],
            buffer_size=model_params["buffer_size"],
            exploration_fraction=model_params["exploration_fraction"],
            exploration_final_eps=model_params["exploration_final_eps"],
            exploration_initial_eps=model_params["exploration_initial_eps"],
            train_freq=model_params["train_freq"],
            batch_size=model_params["batch_size"],
            double_q=model_params["double_q"],
            learning_starts=model_params["learning_starts"],
            target_network_update_freq=model_params[
                "target_network_update_freq"],
            prioritized_replay=model_params["prioritized_replay"],
            prioritized_replay_alpha=model_params["prioritized_replay_alpha"],
            prioritized_replay_beta0=model_params["prioritized_replay_beta0"],
            prioritized_replay_beta_iters=model_params[
                "prioritized_replay_beta_iters"],
            prioritized_replay_eps=model_params["prioritized_replay_eps"],
            param_noise=model_params["param_noise"],
            policy_kwargs=model_params["policy_kwargs"],
            verbose=model_params["verbose"],
            tensorboard_log=model_params["tensorboard_log"],
            full_tensorboard_log=model_params["full_tensorboard_log"])

    train_model.learn(total_timesteps=args.num_steps)

    return train_model
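
# For reference, a self-contained minimal sketch of the same DQN-on-DummyVecEnv pattern
# on a toy gym env (an illustration added here, not part of the original Flow example):
import gym
from stable_baselines import DQN
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.deepq.policies import MlpPolicy

toy_env = DummyVecEnv([lambda: gym.make('CartPole-v1')])
toy_model = DQN(policy=MlpPolicy, env=toy_env, verbose=0)
toy_model.learn(total_timesteps=1000)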
Example #19
def objective(params):
    """
    Objective function to be minimized.

    Parameters
    ----------
    * params [list, len(params)=n_hyperparameters]
        Settings of each hyperparameter for a given optimization iteration.
        - Controlled by hyperspaces's hyperdrive function.
        - Order preserved from list passed to hyperdrive's hyperparameters argument.
     """
    config_path = join(path, 'rl', 'config', '{}.yml'.format(env_name))
    with open(config_path) as f:
        config = yaml.safe_load(f)
        print('model loaded from path: {}'.format(config_path))

    #set the parameters
    itrf, ccrf, opr, dr = params
    config['environment']['idle_time_reward_factor'] = itrf
    config['environment']['cycle_count_reward_factor'] = ccrf
    config['environment']['output_priming_reward'] = opr / 100
    config['environment']['delivery_reward'] = dr

    print(
        'Current settings for the config: \n\nidle_time_reward_factor \t:\t{}\ncycle_count_reward_factor\t\t:\t{}\n\
output_priming_reward\t\t\t:\t{}\ndelivery_reward\t\t\t:\t{}\n'.format(
            itrf, ccrf, opr / 100, dr))

    #GET MODEL CONFIG
    model_config = config['models']['PPO2']
    policy = config['main']['policy']
    n_workers = config['main']['n_workers']
    n_steps = config['main']['n_steps']
    n_eval = (n_steps / 8) / 10

    # load environment with config variables
    env_obj = getattr(rl.environments, env_name)
    env = env_obj(config)

    # multiprocess environment
    env_8 = make_vec_env(lambda: env, n_envs=n_workers)

    #define folder and path
    now = datetime.datetime.now()
    folder = '{}{}{}_{}{}'.format(now.year,
                                  str(now.month).zfill(2),
                                  str(now.day).zfill(2),
                                  str(now.hour).zfill(2),
                                  str(now.minute).zfill(2))
    specified_path = join(path, 'rl', 'trained_models', env_name,
                          'hyper-parameter',
                          '{}-{}{}{}{}'.format(folder, itrf, ccrf, opr, dr))
    print('Results stored in: {}'.format(specified_path))

    # callback for evaluation
    eval_callback = EvalCallback(env,
                                 best_model_save_path=specified_path,
                                 log_path=specified_path,
                                 eval_freq=n_eval,
                                 n_eval_episodes=5,
                                 verbose=0,
                                 deterministic=False,
                                 render=False)

    model = PPO2(policy,
                 env=env_8,
                 tensorboard_log=specified_path,
                 **model_config)

    #LEARN MODEL
    model.learn(total_timesteps=n_steps,
                tb_log_name='{}_{}_{}_{}'.format(itrf, ccrf, opr, dr),
                callback=eval_callback)
    model_path = join(specified_path,
                      'model_{}_{}_{}_{}.zip'.format(itrf, ccrf, opr, dr))
    model.save(model_path)

    #test
    best_modelpath = join(specified_path, 'best_model.zip')
    test_model = PPO2.load(best_modelpath, env=DummyVecEnv([lambda: env]))

    #run test of the model
    episodes = 10
    results = {}
    results['cycle_count'] = 0
    results['idle_time'] = 0
    for episode in range(episodes):
        # Run an episode
        state = env.reset()
        done = False
        meta_data = []
        while not done:
            action, _ = test_model.predict(state, deterministic=True)
            state, reward, done, _ = env.step(action)
            if done:
                results['cycle_count'] += env.cycle_count
                results['idle_time'] += sum(env.idle_times_operator.values())

    return (results['cycle_count'] + results['idle_time']) / episodes
Example #20
class StableBaselinesTradingStrategy(TradingStrategy):
    """A trading strategy capable of self tuning, training, and evaluating with stable-baselines.

    Arguments:
        environment: An instance of a trading environment for the agent to trade within.
        model: The RL model to create the agent with.
            Defaults to DQN.
        policy: The RL policy to train the agent's model with.
            Defaults to 'MlpPolicy'.
        model_kwargs: Any additional keyword arguments to adjust the model.
        kwargs: Optional keyword arguments to adjust the strategy.
    """
    def __init__(self,
                 environment: TradingEnvironment,
                 model: BaseRLModel = DQN,
                 policy: Union[str, BasePolicy] = 'MlpPolicy',
                 model_kwargs: any = {},
                 **kwargs):
        self._model = model
        self._model_kwargs = model_kwargs

        self.environment = environment
        self._agent = self._model(policy, self._environment,
                                  **self._model_kwargs)

    @property
    def environment(self) -> 'TradingEnvironment':
        """A `TradingEnvironment` instance for the agent to trade within."""
        return self._environment

    @environment.setter
    def environment(self, environment: 'TradingEnvironment'):
        self._environment = DummyVecEnv([lambda: environment])

    def restore_agent(self, path: str):
        """Deserialize the strategy's learning agent from a file.

        Arguments:
            path: The `str` path of the file the agent specification is stored in.
        """
        self._agent = self._model.load(path, self._environment,
                                       self._model_kwargs)

    def save_agent(self, path: str):
        """Serialize the learning agent to a file for restoring later.

        Arguments:
            path: The `str` path of the file to store the agent specification in.
        """
        self._agent.save(path)

    def tune(self,
             steps: int = None,
             episodes: int = None,
             callback: Callable[[pd.DataFrame], bool] = None) -> pd.DataFrame:
        raise NotImplementedError

    def run(
        self,
        steps: int = None,
        episodes: int = None,
        episode_callback: Callable[[pd.DataFrame],
                                   bool] = None) -> pd.DataFrame:
        if steps is None and episodes is None:
            raise ValueError(
                'You must set the number of `steps` or `episodes` to run the strategy.'
            )

        steps_completed = 0
        episodes_completed = 0
        average_reward = 0

        obs, state, dones = self._environment.reset(), None, [False]

        performance = {}

        while (steps is not None and
               (steps == 0 or steps_completed < steps)) or (
                   episodes is not None and episodes_completed < episodes):
            actions, state = self._agent.predict(obs, state=state, mask=dones)
            obs, rewards, dones, info = self._environment.step(actions)

            steps_completed += 1
            average_reward -= average_reward / steps_completed
            average_reward += rewards[0] / (steps_completed + 1)

            exchange_performance = info[0].get('exchange').performance
            performance = exchange_performance if len(
                exchange_performance) > 0 else performance

            if dones[0]:
                if episode_callback is not None and not episode_callback(
                        performance):
                    break

                episodes_completed += 1
                obs = self._environment.reset()

        print("Finished running strategy.")
        print("Total episodes: {} ({} timesteps).".format(
            episodes_completed, steps_completed))
        print("Average reward: {}.".format(average_reward))

        return performance
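
# A possible usage sketch (assumed; `environment` stands for a TradingEnvironment
# instance from the surrounding project):
# strategy = StableBaselinesTradingStrategy(environment=environment,
#                                           model=DQN,
#                                           policy='MlpPolicy',
#                                           model_kwargs={'verbose': 1})
# performance = strategy.run(episodes=1)
# strategy.save_agent('agents/dqn_strategy')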
Example #21
    set_global_seeds(seed)
    return _init


if __name__ == "__main__":
    worker_id = 10
    num_env = 2
    env_id = "/home/jim/projects/unity_ray/basic_env_linux/basic_env_linux"
    env = UnityEnv(env_id, worker_id=worker_id, use_visual=False)
    # Create log dir
    time_int = int(time.time())
    log_dir = "stable_results/basic_env_{}/".format(time_int)
    os.makedirs(log_dir, exist_ok=True)

    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])  # The algorithms require a vectorized environment to run
    #env = SubprocVecEnv([make_env(env_id, log_dir, i+worker_id) for i in range(num_env)])

    model = TRPO(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=20000)
    model.save(log_dir+"model")

    #evaluate agent
    episodes = 100
    ep_r = []
    ep_l = []
    for e in range(episodes):
        obs = env.reset()
        total_r = 0.
        total_l = 0.
        while True:
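            # The snippet is truncated here; a typical evaluation body (assumed) would be:
            # action, _states = model.predict(obs)
            # obs, reward, done, _ = env.step(action)
            # total_r += reward[0]
            # total_l += 1
            # if done[0]:
            #     ep_r.append(total_r)
            #     ep_l.append(total_l)
            #     break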
Example #22
    def environment(self, environment: 'TradingEnvironment'):
        self._environment = DummyVecEnv([lambda: environment])
Example #23
        register(id='simglucose-' + patient_id + '-v0',
                 entry_point='simglucose.envs:T1DSimEnv',
                 kwargs={'patient_name': env_id})

        env = gym.make('simglucose-' + patient_id + '-v0')
        env.seed(seed)
        print(env_id)
        return env

    set_global_seeds(seed)
    return _init


if __name__ == "__main__":
    env = DummyVecEnv([
        make_env('adult#0{}'.format(str(i).zfill(2)), i) for i in range(1, 11)
    ])
    # model = SAC(LnMlpPolicy, env, verbose=1)
    model = ACKTR(MlpLstmPolicy, env, verbose=1)
    model.learn(total_timesteps=256000)
    model.save("ACKTR_MlpLSTM_adult_def_reward")

# for i,p in enumerate(child_options):
#     patient_id = p.split('#')[0] + str(i + 1)

#     register(
#         id='simglucose-' + patient_id + '-v0',
#         entry_point='simglucose.envs:T1DSimEnv',
#         kwargs={'patient_name': p}
#     )
Example #24
    parser.add_argument(
        '--save-freq',
        help='Save the model every n steps (if negative, no checkpoint)',
        default=-1,
        type=int)
    args = parser.parse_args()

    env_id = args.env
    n_timesteps = args.n_timesteps
    save_path = '{}_{}'.format(args.algo, env_id)

    # Instantiate and wrap the environment
    env = TimeFeatureWrapper(gym.make(env_id))

    # Create the evaluation environment and callbacks
    eval_env = DummyVecEnv([lambda: TimeFeatureWrapper(gym.make(env_id))])

    callbacks = [EvalCallback(eval_env, best_model_save_path=save_path)]

    # Save a checkpoint every n steps
    if args.save_freq > 0:
        callbacks.append(
            CheckpointCallback(save_freq=args.save_freq,
                               save_path=save_path,
                               name_prefix='rl_model'))

    algo = {'sac': SAC, 'td3': TD3}[args.algo]

    n_actions = env.action_space.shape[0]

    # Tuned hyperparameters from https://github.com/araffin/rl-baselines-zoo
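    # The snippet is truncated here; a typical continuation (assumed) would add action
    # noise and train, e.g.:
    # action_noise = NormalActionNoise(mean=np.zeros(n_actions),
    #                                  sigma=0.1 * np.ones(n_actions))
    # model = algo('MlpPolicy', env, action_noise=action_noise, verbose=1)
    # model.learn(int(n_timesteps), callback=callbacks)
    # model.save(save_path)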
Example #25
    def proba_step(self, obs, state=None, mask=None):
        return self.sess.run(self.policy_proba, {self.obs_ph: obs})

    def value(self, obs, state=None, mask=None):
        return self.sess.run(self._value, {self.obs_ph: obs})


if __name__ == '__main__':

    rospy.init_node('segbot_collision_avoid',
                    anonymous=True,
                    log_level=rospy.WARN)

    # Create the Gym environment
    env = gym.make('HallwayCollision-v0')
    env = DummyVecEnv([lambda: env])

    rospy.loginfo("Gym environment done")

    # Set the logging system
    rospack = rospkg.RosPack()
    pkg_path = rospack.get_path('haresh_segbot_hallway')
    # outdir = pkg_path + '/training_results'
    # env = wrappers.Monitor(env, outdir, force=True)
    # rospy.loginfo("Monitor Wrapper started")

    # Loads parameters from the ROS param server
    # Parameters are stored in a yaml file inside the config directory
    # They are loaded at runtime by the launch file

    env.reset()
Example #26
    with open(
            join(specified_path, 'Bestmodel_{}'.format(args.name),
                 'config.yml'), 'r') as f:
        config = yaml.safe_load(f)

    config_env = config['environment']
    amount_of_actions = config_env['amount_output']
    stop = amount_of_actions + 1
    #load environment with config variables
    env_obj = getattr(rl.environments, args.environment)
    env = env_obj(config)

    modelpath = join(specified_path, 'Bestmodel_{}'.format(args.name),
                     'best_model.zip')
    model = PPO2.load(modelpath, env=DummyVecEnv([lambda: env]))
    print(args.render)
    for episode in range(10):
        # Run an episode
        state = env.reset()
        size = state.shape[0]
        done = False
        meta_data = []
        while not done:
            action, _ = model.predict(state)
            logging.debug(model.action_probability(state))

            if args.render:
                ## monitoring
                r = 4
                state_n = state
Example #27
    def objective(trial):

        kwargs = hyperparams.copy()

        trial.model_class = None

        kwargs.update(sample_ppo2_params(trial))

        def callback(_locals, _globals):
            """
            Callback for monitoring learning progress.
            :param _locals: (dict)
            :param _globals: (dict)
            :return: (bool) If False: stop training
            """
            self_ = _locals['self']
            trial = self_.trial

            # Initialize variables
            if not hasattr(self_, 'is_pruned'):
                self_.is_pruned = False
                self_.last_mean_test_reward = -np.inf
                self_.last_time_evaluated = 0
                self_.eval_idx = 0

            if (self_.num_timesteps - self_.last_time_evaluated) < evaluate_interval:
                return True

            self_.last_time_evaluated = self_.num_timesteps

            # Evaluate the trained agent on the test env
            rewards = []
            n_steps_done, reward_sum = 0, 0.0

            # Sync the obs rms if using vecnormalize
            # NOTE: this does not cover all the possible cases
            if isinstance(self_.test_env, VecNormalize):
                self_.test_env.obs_rms = deepcopy(self_.env.obs_rms)
                self_.test_env.ret_rms = deepcopy(self_.env.ret_rms)
                # Do not normalize reward
                self_.test_env.norm_reward = False

            obs = self_.test_env.reset()
            while n_steps_done < n_test_steps:
                # Use default value for deterministic
                action, _ = self_.predict(obs)
                obs, reward, done, _ = self_.test_env.step(action)
                reward_sum += reward
                n_steps_done += 1

                if done:
                    rewards.append(reward_sum)
                    reward_sum = 0.0
                    obs = self_.test_env.reset()
            rewards.append(reward_sum)
            mean_reward = np.mean(rewards)
            summary = tf.Summary(value=[tf.Summary.Value(tag='evaluation', simple_value=mean_reward)])
            _locals['writer'].add_summary(summary, self_.num_timesteps)
            self_.last_mean_test_reward = mean_reward
            self_.eval_idx += 1

            # report best or report current ?
            # report num_timesteps or elapsed time ?
            trial.report(-1 * mean_reward, self_.eval_idx)
            # Prune trial if need
            if trial.should_prune(self_.eval_idx):
                self_.is_pruned = True
                return False

            return True
        commands = [[1,0],[2,0],[3,0]]
        env = SubprocVecEnv([lambda:  e.AidaBulletEnv(commands,
                                                  render  = False, 
                                                  on_rack = False,
                                                  default_reward     = 2,
                                                  height_weight      = 5,
                                                  orientation_weight = 3,
                                                  direction_weight   = 2,
                                                  speed_weight       = 5
                                                  )
                        for i in range(32)])
        if(kwargs['normalize']):
            env = VecNormalize(env, clip_obs=1000.0, clip_reward=1000.0, gamma=kwargs['gamma'])

        model = PPO2(MlpPolicy, 
                 env, 
                 vf_coef         = 0.5,
                 max_grad_norm   = 0.5,
                 cliprange_vf    = -1,
                 verbose         = 0,
                 n_steps = kwargs['n_steps'],
                 nminibatches = kwargs['nminibatches'],
                 gamma = kwargs['gamma'],
                 learning_rate = kwargs['learning_rate'],
                 ent_coef = kwargs['ent_coef'],
                 cliprange = kwargs['cliprange'],
                 noptepochs = kwargs['noptepochs'],
                 lam = kwargs['lam'],
                 policy_kwargs   = dict(layers=[100,100]),
                 tensorboard_log = "./optimisation/logOPTI"
               )
        model.test_env = DummyVecEnv([lambda:  e.AidaBulletEnv(commands,
                                                  render  = False, 
                                                  on_rack = False,
                                                  default_reward     = 2,
                                                  height_weight      = 5,
                                                  orientation_weight = 3,
                                                  direction_weight   = 2,
                                                  speed_weight       = 2
                                                  )
                        ])
        if(kwargs['normalize']):
            model.test_env = VecNormalize(model.test_env, clip_obs=1000.0, clip_reward=1000.0, gamma=kwargs['gamma'],training=False, norm_reward=False)

        model.trial = trial
       
        try:
            model.learn(n_timesteps, callback=callback)
            # Free memory
            model.env.close()
            model.test_env.close()
        except AssertionError:
            # Sometimes, random hyperparams can generate NaN
            # Free memory
            model.env.close()
            model.test_env.close()
            raise
        is_pruned = False
        cost = np.inf
        if hasattr(model, 'is_pruned'):
            is_pruned = model.is_pruned
            cost = -1 * model.last_mean_test_reward
        try:
            os.mkdir("./optimisation/resultats/"+str(trial.number))
        except FileExistsError:
            print("Directory already exists")
            
        if kwargs['normalize']:
            try:
                os.mkdir("./optimisation/resultats/"+str(trial.number)+"/normalizeData")
            except FileExistsError:
                print("Directory already exists")

        model.save("./optimisation/resultats/"+str(trial.number)+"/"+str(trial.number))    
        if kwargs['normalize']:
            model.env.save_running_average("./optimisation/resultats/"+str(trial.number) +"/normalizeData")
        

        del model.env, model.test_env
        del model

        if is_pruned:
            try:
                # Optuna >= 0.19.0
                raise optuna.exceptions.TrialPruned()
            except AttributeError:
                raise optuna.structs.TrialPruned()

        return cost
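
    # A possible driver sketch (assumed; in the original project this objective is
    # handed to an Optuna study elsewhere):
    # study = optuna.create_study(pruner=optuna.pruners.MedianPruner())
    # study.optimize(objective, n_trials=20)
    # print(study.best_params)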
Example #28
    LOGPATH = os.path.join(LOGDIR, LOGNAME + ".csv")
    MODELPATH = os.path.join(DIR, LOGNAME + "_ckpt")
    MODELPATH2 = os.path.join(DIR, "e2enavreptrainenv_latest_PPO_ckpt")
    if not os.path.exists(DIR):
        os.makedirs(DIR)
    if not os.path.exists(LOGDIR):
        os.makedirs(LOGDIR)

    MILLION = 1000000
    TRAIN_STEPS = args.n
    if TRAIN_STEPS is None:
        TRAIN_STEPS = 60 * MILLION

    N_ENVS = 6
    if args.debug:
        env = DummyVecEnv([lambda: E2ENavRepEnv(silent=True, scenario='train')]*N_ENVS)
    else:
        env = SubprocVecEnv([lambda: E2ENavRepEnv(silent=True, scenario='train')]*N_ENVS,
                            start_method='spawn')
    eval_env = E2ENavRepEnv(silent=True, scenario='train')
    def test_env_fn():  # noqa
        return E2ENavRepEnv(silent=True, scenario='test')
    cb = NavrepEvalCallback(eval_env, test_env_fn=test_env_fn,
                            logpath=LOGPATH, savepath=MODELPATH, verbose=1)
    model = PPO2(CustomPolicy, env, verbose=0)
    model.learn(total_timesteps=TRAIN_STEPS+1, callback=cb)
    obs = env.reset()

    model.save(MODELPATH)
    model.save(MODELPATH2)
    print("Model '{}' saved".format(MODELPATH))
Example #29
import numpy as np
import matplotlib.pyplot as plt
from stable_baselines.common import set_global_seeds
from stable_baselines.bench import Monitor
from stable_baselines.results_plotter import load_results, ts2xy
import json
best_mean_reward, n_steps = -np.inf, 0
best_eval_mean_reward = -np.inf
seed = 500 
log_dir = "logs/mujoco/Hopper_skipq_"+str(seed)+ "/"
os.makedirs(log_dir, exist_ok=True)
log_data = {'dt':[],'eval':[],'train':[],'timesteps':[]}

f = open(log_dir+"eval.txt", "w")
set_global_seeds(seed)
test_env = DummyVecEnv([lambda: gym.make("Hopper-v2")])
max_eval_timesteps = 5000
# Automatically normalize the input features
# test_env = VecNormalize(test_env, norm_obs=True, norm_reward=False,
#                         clip_obs=10.)



def callback(_locals, _globals):
    """
    Callback called at each step (for DQN and others) or after n steps (see ACER or PPO2)
    :param _locals: (dict)
    :param _globals: (dict)
    """

Example #30
#     'gamma': params['gamma'],
#     'learning_rate': params['learning_rate'],
#     'ent_coef': params['ent_coef'],
#     'cliprange': params['cliprange'],
#     'noptepochs': int(params['noptepochs']),
#     'lam': params['lam'],
# }

# model = PPO2.load('./agents/ppo2_' + reward_strategy + '_' + str(curr_idx) + '_6' +  '.pkl', env=test_env)

while True:
    test_df = df_init[16400:]
    test_env = DummyVecEnv([
        lambda: BitcoinTradingEnv(test_df,
                                  reward_func=reward_strategy,
                                  forecast_len=int(params['forecast_len']),
                                  confidence_interval=params[
                                      'confidence_interval'])
    ])
    print('after while')
    df_init = pd.read_csv('binance.csv')
    # df = df.drop(['Symbol'], axis=1)
    df_init = df_init.sort_values(['Date'])
    df_init = add_indicators(df_init.reset_index())

    test_len = int(len(df_init) * 0.021)
    train_len = int(len(df_init)) - test_len

    test_df = df_init[16385:]
    test_env = DummyVecEnv([
        lambda: BitcoinTradingEnv(test_df,