import os
import sys

from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv

from examples.utils.utils import get_policy
# BaseEnv: project-specific Snake environment (its import is not shown in the original snippet)

tensorboard_folder = './tensorboard/Snake/base/'
model_folder = './models/Snake/base/'
if not os.path.isdir(tensorboard_folder):
    os.makedirs(tensorboard_folder)
if not os.path.isdir(model_folder):
    os.makedirs(model_folder)

policy = ''
model_tag = ''
if len(sys.argv) > 1:
    policy = sys.argv[1]
    model_tag = '_' + sys.argv[1]

env = DummyVecEnv([lambda: BaseEnv(10, 10)])

model = PPO2(get_policy(policy), env, verbose=0, nminibatches=1,
             tensorboard_log=tensorboard_folder)
model.learn(total_timesteps=10000000, tb_log_name='PPO2' + model_tag)
model.save(model_folder + "PPO2" + model_tag)

del model
model = PPO2.load(model_folder + "PPO2" + model_tag)

done = False
states = None
obs = env.reset()
    # Plot cumulative reward
    with open(os.path.join(log_dir, "monitor.csv"), 'rt') as fh:
        firstline = fh.readline()
        assert firstline[0] == '#'
        df = pd.read_csv(fh, index_col=None)['r']
    df.rolling(window=1000).mean().plot()
    plt.show()

    return model


if __name__ == '__main__':
    env = ConnectFourGym(agent2="random")

    log_dir = "ppo/"
    os.makedirs(log_dir, exist_ok=True)

    # Logging progress
    monitor_env = Monitor(env, log_dir, allow_early_resets=True)

    # Create a vectorized environment
    vec_env = DummyVecEnv([lambda: monitor_env])

    # Initialize agent
    model = get_model(vec_env)

    # Train agent
    model = train_model(model)

    env_game = make("connectx")
    env_game.run([agent1, "random"])

    get_win_percentages(agent1=agent1, agent2="random")
# Load the learning parameters from a file.
param_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'param_files')
if args.param_file is None:
    default_path = os.path.join(param_dir, 'default_params.json')
    with open(default_path) as f:
        params = commentjson.load(f)[args.default_name]
else:
    param_file = os.path.join(param_dir, args.param_file)
    with open(param_file) as f:
        params = commentjson.load(f)

# Visualize.
env_cls = globals()[params['env']]
env = env_cls(**params['env_options'])
vec_env = DummyVecEnv([lambda: env])

# Collect the info keywords.
if len(args.info_keywords):
    info_keywords = args.info_keywords.split(',')
else:
    info_keywords = []

# Report the data over a number of random initializations.
iters = 5
for i in range(iters):
    print('Iteration: {}'.format(i))

    # Create a random environment.
    if params['alg'] == 'PPO2':
        model = PPO2(params['policy_type'], vec_env,
import gym
import numpy as np
from stable_baselines.sac.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import SAC
import gym_ur5_gripper
import ray

ray.init()

env = gym.make('UR5Gripper-v0')
env.render('human')
env = DummyVecEnv([lambda: env])

model = SAC(MlpPolicy, env, verbose=1)


@ray.remote  # applied without parentheses since no remote options are passed
def sac_learn():
    model.learn(total_timesteps=500000, log_interval=100)


sac_learn.remote()  # asynchronous: this returns a future immediately

model.save("sac_ur5_gripper")

del model  # remove to demonstrate saving and loading

model = SAC.load("sac_ur5_gripper")
from stable_baselines.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines import DQN
from stable_baselines.deepq.policies import MlpPolicy  # DQN needs its own MlpPolicy
from absl import flags

FLAGS = flags.FLAGS
FLAGS([''])

name = "dqn_mlp_std_simple"
learn_type = 'DQN'
start_value = 0

# create vectorized environment
env = DummyVecEnv([lambda: CustomAgent(learn_type=learn_type)])

model = DQN(MlpPolicy, env,
            learning_rate=0.3,
            exploration_fraction=0.2,
            double_q=True,
            verbose=0,
            tensorboard_log="gym_ouput/" + name + "/log/")
model.setup_model()

if start_value > 0:
    try:
        # DQN.load is a classmethod that returns a new model, so re-assign the result
        model = DQN.load("gym_ouput/" + name + "/it" + str(start_value + 1), env=env)
        print("\n\nOBS! this is not the latest NN load point\n\n")
    x = x[len(x) - len(y):]

    fig = plt.figure(title)
    plt.plot(x, y)
    plt.xlabel('Number of Timesteps')
    plt.ylabel('Rewards')
    plt.title(title + " Learning Curve Smoothed")
    plt.show()


if __name__ == "__main__":
    rospy.init_node('drone_gym')

    env_id = 'Crazyflie-v0'
    log_dir = 'models/hover/empty_world_small/finalVec'
    env = DummyVecEnv([lambda: gym.make(env_id)])
    # Automatically normalize the input features and reward
    env = VecNormalize(env, norm_obs=True, norm_reward=True)

    # # Save best model every n steps and monitors performance
    # save_best_callback = SaveOnBestTrainingRewardCallback(check_freq=5, log_dir=log_dir)
    # # Save model every n steps
    # checkpoint_callback = CheckpointCallback(save_freq=5, save_path='./' + log_dir, name_prefix='ppo2')

    # Train from scratch
    model = PPO2(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=80000)
    # model.learn(total_timesteps=20, callback=[save_best_callback, checkpoint_callback])

    # Don't forget to save the VecNormalize statistics when saving the agent
    model.save(log_dir + "/ppo2_final")
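    # The comment above says to save the VecNormalize statistics as well, but the snippet
    # stops short. A minimal sketch, assuming the legacy stable-baselines 2 VecNormalize API
    # (save_running_average / load_running_average, which other examples in this collection use):
    env.save_running_average(log_dir)
    # When evaluating later, rebuild the wrapper and reload the running averages:
    #   env = VecNormalize(DummyVecEnv([lambda: gym.make(env_id)]), training=False)
    #   env.load_running_average(log_dir)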
def create_test_env(env_id, n_envs=1, is_atari=False,
                    stats_path=None, seed=0,
                    log_dir='', should_render=True, hyperparams=None):
    """
    Create environment for testing a trained agent

    :param env_id: (str)
    :param n_envs: (int) number of processes
    :param is_atari: (bool)
    :param stats_path: (str) path to folder containing saved running averages
    :param seed: (int) Seed for random number generator
    :param log_dir: (str) Where to log rewards
    :param should_render: (bool) For Pybullet env, display the GUI
    :param hyperparams: (dict) Additional hyperparams (ex: n_stack, env_wrapper)
    :return: (gym.Env)
    """
    # HACK to save logs
    if log_dir is not None:
        os.environ["OPENAI_LOG_FORMAT"] = 'csv'
        os.environ["OPENAI_LOGDIR"] = os.path.abspath(log_dir)
        os.makedirs(log_dir, exist_ok=True)
        logger.configure()

    if hyperparams is None:
        hyperparams = {}

    # Create the environment and wrap it if necessary
    env_wrapper = get_wrapper_class(hyperparams)
    if 'env_wrapper' in hyperparams.keys():
        del hyperparams['env_wrapper']

    if is_atari:
        print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    elif n_envs > 1:
        # start_method = 'spawn' for thread safe
        env = SubprocVecEnv([make_env(env_id, i, seed, log_dir, wrapper_class=env_wrapper)
                             for i in range(n_envs)])
    # Pybullet envs do not follow the gym.render() interface
    elif "Bullet" in env_id:
        # HACK: force SubprocVecEnv for Bullet env
        env = SubprocVecEnv([make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper)])
    else:
        env = DummyVecEnv([make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper)])

    # Load saved stats for normalizing input and rewards
    # And optionally stack frames
    if stats_path is not None:
        if hyperparams['normalize']:
            print("Loading running average")
            print("with params: {}".format(hyperparams['normalize_kwargs']))
            env = VecNormalize(env, training=False, **hyperparams['normalize_kwargs'])

            if os.path.exists(os.path.join(stats_path, 'vecnormalize.pkl')):
                env = VecNormalize.load(os.path.join(stats_path, 'vecnormalize.pkl'), env)
                # Deactivate training and reward normalization
                env.training = False
                env.norm_reward = False
            else:
                # Legacy:
                env.load_running_average(stats_path)

        n_stack = hyperparams.get('frame_stack', 0)
        if n_stack > 0:
            print("Stacking {} frames".format(n_stack))
            env = VecFrameStack(env, n_stack)
    return env
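
if __name__ == '__main__':
    # Minimal usage sketch of create_test_env. The env_id and log_dir are hypothetical;
    # hyperparams may additionally carry 'normalize', 'normalize_kwargs' and 'frame_stack'
    # keys, which are handled in the function above.
    test_env = create_test_env("CartPole-v1", n_envs=1, is_atari=False, stats_path=None,
                               seed=0, log_dir="logs/", should_render=False, hyperparams={})
    obs = test_env.reset()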
    def test_discrete_twozone_engine_with_delay(self):
        """Does the DiscreteTwoZoneEngine with injection delay work as expected?"""

        # Initialize engine
        eng = engines.DiscreteTwoZoneEngine(
            nsteps=101,
            fuel="PRF100",
            rxnmech="llnl_gasoline_surrogate_323.xml",
            mdot=0.1,
            max_minj=5e-5,
            injection_delay=0.0025,
            ename="Isooctane_MBT_DI_50C_Summ.xlsx",
            reward=rw.Reward(negative_reward=-101.0),
        )
        env = DummyVecEnv([lambda: eng])
        variables = eng.observables + eng.internals + eng.histories
        df = pd.DataFrame(
            columns=list(
                dict.fromkeys(
                    variables
                    + eng.action.actions
                    + ["rewards"]
                    + eng.reward.get_rewards()
                )
            )
        )

        # Evaluate a dummy agent that injects at a fixed time
        t0 = time.time()
        done = False
        cnt = 0
        obs = env.reset()
        df.loc[cnt, variables] = [eng.current_state[k] for k in variables]
        df.loc[cnt, eng.action.actions] = 0
        rwd = list(
            eng.reward.compute(eng.current_state, eng.nsteps, False, False).values()
        )
        df.loc[cnt, eng.reward.get_rewards()] = rwd
        df.loc[cnt, ["rewards"]] = [sum(rwd)]
        while not done:
            cnt += 1
            # Agent tries to inject thrice, but is not allowed the second time
            action = (
                [1]
                if (eng.current_state["ca"] == -10)
                or eng.current_state["ca"] == 10
                or eng.current_state["ca"] == 16
                else [0]
            )
            obs, reward, done, info = env.step(action)
            df.loc[cnt, variables] = [info[0]["current_state"][k] for k in variables]
            df.loc[cnt, eng.action.actions] = eng.action.current
            df.loc[cnt, ["rewards"]] = reward
            df.loc[cnt, eng.reward.get_rewards()] = list(info[0]["rewards"].values())

        for rwd in eng.reward.get_rewards() + ["rewards"]:
            df[f"cumulative_{rwd}"] = np.cumsum(df[rwd])
        elapsed = time.time() - t0

        utilities.plot_df(env, df, idx=5, name="DiscreteTwoZone (delay)")

        # Test
        npt.assert_allclose(np.linalg.norm(df.V), 0.002205916821815495)
        npt.assert_allclose(np.linalg.norm(df.p), 35142241.61422163)
        npt.assert_allclose(np.linalg.norm(df["T"]), 20971.07323643)
        npt.assert_allclose(np.linalg.norm(df.rewards), 153.11736491)
        npt.assert_allclose(np.linalg.norm(df.mdot), 0.14142136)
        print(f"Wall time for DiscreteTwoZoneEngine with delay = {elapsed} seconds")
    def test_reactor_engine_with_complex_reward(self):
        """Does the ReactorEngine with complex reward work as expected?"""

        # Initialize engine
        reward = rw.Reward(
            names=["work", "nox", "soot"],
            norms=[1.0, 5e-8, 1e-9],
            weights=[0.34, 0.33, 0.33],
            negative_reward=-100.0,
            randomize=False,
        )
        eng = engines.ReactorEngine(
            nsteps=101,
            Tinj=300.0,
            rxnmech="dodecane_lu_nox.cti",
            mdot=0.1,
            max_minj=5e-5,
            ename="Isooctane_MBT_DI_50C_Summ.xlsx",
            reward=reward,
        )
        env = DummyVecEnv([lambda: eng])
        variables = eng.observables + eng.internals + eng.histories
        df = pd.DataFrame(
            columns=list(
                dict.fromkeys(
                    variables
                    + eng.action.actions
                    + ["rewards"]
                    + eng.reward.get_rewards()
                )
            )
        )

        # Evaluate a dummy agent that injects at a fixed time
        t0 = time.time()
        done = False
        cnt = 0
        obs = env.reset()
        df.loc[cnt, variables] = [eng.current_state[k] for k in variables]
        df.loc[cnt, eng.action.actions] = 0
        rwd = list(
            eng.reward.compute(eng.current_state, eng.nsteps, False, False).values()
        )
        df.loc[cnt, eng.reward.get_rewards()] = rwd
        df.loc[cnt, ["rewards"]] = [sum(rwd)]
        while not done:
            cnt += 1
            # Agent tries to inject twice, but is not allowed the second time
            action = (
                [1]
                if (eng.current_state["ca"] == 0) or eng.current_state["ca"] == 2
                else [0]
            )
            obs, reward, done, info = env.step(action)
            df.loc[cnt, variables] = [info[0]["current_state"][k] for k in variables]
            df.loc[cnt, eng.action.actions] = eng.action.current
            df.loc[cnt, ["rewards"]] = reward
            df.loc[cnt, eng.reward.get_rewards()] = list(info[0]["rewards"].values())

        for rwd in eng.reward.get_rewards() + ["rewards"]:
            df[f"cumulative_{rwd}"] = np.cumsum(df[rwd])
        elapsed = time.time() - t0

        utilities.plot_df(env, df, idx=6, name="reactor")

        # Test
        npt.assert_allclose(np.linalg.norm(df.V), 0.002205916821815495)
        npt.assert_allclose(np.linalg.norm(df.p), 34254670.52877185, rtol=1e-5)
        npt.assert_allclose(np.linalg.norm(df["T"]), 18668.46491609, rtol=1e-5)
        npt.assert_allclose(np.linalg.norm(df.rewards), 54.47632708, rtol=1e-5)
        npt.assert_allclose(np.linalg.norm(df.r_work), 53.47224436, rtol=1e-5)
        npt.assert_allclose(np.linalg.norm(df.r_nox), 14.10312665, rtol=1e-5)
        npt.assert_allclose(np.linalg.norm(df.w_work), 3.41695771, rtol=1e-5)
        npt.assert_allclose(np.linalg.norm(df.w_nox), 3.31645895, rtol=1e-5)
        npt.assert_allclose(np.linalg.norm(df.w_soot), 3.31645895, rtol=1e-5)
        npt.assert_allclose(np.linalg.norm(df.mdot), 0.14142135623730953)
        print(f"Wall time for ReactorEngine (complex reward) = {elapsed} seconds")
    def single_run(self, folder_path, num_evals, policy_kwargs=None,
                   is_baseline=False, baseline_policy=None):
        # initialize cProfile
        profiler_object = cProfile.Profile()
        profiler_object.enable()

        config = configparser.ConfigParser()
        config.read('gym_config/config.ini')

        rl_time_steps = config.getint('rl', 'time_steps')
        ent_coef = config.getfloat('rl', 'ent_coef')
        n_steps = config.getint('rl', 'n_steps')
        nminibatches = config.getint('rl', 'nminibatches')
        noptepochs = config.getint('rl', 'noptepochs')
        learning_rate = config.getfloat('rl', 'learning_rate')
        time_steps = config.getint('garden', 'time_steps')
        step = config.getint('garden', 'step')
        num_plants_per_type = config.getint('garden', 'num_plants_per_type')
        num_plant_types = config.getint('garden', 'num_plant_types')
        garden_x = config.getint('garden', 'X')
        garden_y = config.getint('garden', 'Y')
        # Z axis contains a matrix for every plant type plus one for water levels.
        garden_z = 2 * config.getint('garden', 'num_plant_types') + 1
        sector_width = config.getint('garden', 'sector_width')
        sector_height = config.getint('garden', 'sector_height')
        action_low = config.getfloat('action', 'low')
        action_high = config.getfloat('action', 'high')
        obs_low = config.getint('obs', 'low')
        obs_high = config.getint('obs', 'high')

        env = gym.make(
            'simalphagarden-v0',
            wrapper_env=SimAlphaGardenWrapper(time_steps, garden_x, garden_y,
                                              sector_width, sector_height,
                                              num_plant_types, num_plants_per_type,
                                              step=step),
            garden_x=garden_x,
            garden_y=garden_y,
            garden_z=garden_z,
            sector_width=sector_width,
            sector_height=sector_height,
            action_low=action_low,
            action_high=action_high,
            obs_low=obs_low,
            obs_high=obs_high,
        )
        env = DummyVecEnv([lambda: env])
        # TODO: Normalize input features? VecNormalize
        env = VecCheckNan(env, raise_exception=False)

        if is_baseline:
            copyfile('gym_config/config.ini', folder_path + '/config.ini')

            # Evaluate baseline on 50 random environments of same parameters.
            self.evaluate_policy(folder_path, num_evals, env, garden_x, garden_y,
                                 sector_width, sector_height, is_baseline=True,
                                 baseline_policy=baseline_policy, step=1)

            # Graph evaluations
            self.graph_utils.graph_evaluations(folder_path, garden_x, garden_y,
                                               time_steps, step, num_evals,
                                               num_plant_types)
        else:
            pathlib.Path(folder_path + '/ppo_v2_tensorboard').mkdir(
                parents=True, exist_ok=True)

            # Instantiate the agent
            model = PPO2(CustomCnnPolicy, env, policy_kwargs=policy_kwargs,
                         ent_coef=ent_coef, n_steps=n_steps,
                         nminibatches=nminibatches, noptepochs=noptepochs,
                         learning_rate=learning_rate, verbose=1,
                         tensorboard_log=folder_path + '/ppo_v2_tensorboard/')
            # model = PPO2(MlpPolicy, env, ent_coef=ent_coef, n_steps=n_steps, nminibatches=nminibatches, noptepochs=noptepochs, learning_rate=learning_rate, verbose=1, tensorboard_log=folder_path + '/ppo_v2_tensorboard/')

            # Train the agent
            # this will crash explaining that the invalid value originated from the env
            model.learn(total_timesteps=rl_time_steps)
            model.save(folder_path + '/model')

            copyfile('gym_config/config.ini', folder_path + '/config.ini')

            # Evaluate model on 50 random environments of same parameters.
            self.evaluate_policy(folder_path, num_evals, env, garden_x, garden_y,
                                 sector_width, sector_height, is_baseline=False)

            # Graph evaluations
            # self.graph_utils.graph_evaluations(folder_path, garden_x, garden_y, time_steps, step, num_evals, num_plant_types)

        profiler_object.disable()

        # dump the profiler stats
        s = io.StringIO()
        ps = pstats.Stats(profiler_object, stream=s).sort_stats('cumulative')
        pathlib.Path(folder_path + '/Timings').mkdir(parents=True, exist_ok=True)
        ps.dump_stats(folder_path + '/Timings/dump.txt')

        # convert to human readable format
        out_stream = open(folder_path + '/Timings/time.txt', 'w')
        ps = pstats.Stats(folder_path + '/Timings/dump.txt', stream=out_stream)
        ps.strip_dirs().sort_stats('cumulative').print_stats()
def main():
    """ Prepare for trainings """
    log_dir, model_dir = prepare_dirs()

    model_name = model_dir + '/' + MODEL_NAME
    print(f'model will be saved as {model_name}')

    log_dir = log_dir + '/' + MODEL_NAME

    """ Generate & Check environment """
    env_name = ENV_NAME
    env = gym.make(env_name)
    # print(f'Observation space: {env.observation_space}')
    # print(f'Action space: {env.action_space}')
    # env = Monitor(env, log_dir, allow_early_resets=True)
    # check_env(env)

    """ Save config as pickle file """
    config = summarize_config(env)
    save_config(log_dir, config)

    """ Vectorize environment """
    num_envs = NUM_ENVS
    # Note: every lambda below returns the same `env` instance; use a fresh
    # gym.make(env_name) per worker if independent copies are intended.
    env = DummyVecEnv([lambda: env for _ in range(num_envs)])  # For training
    eval_env = DummyVecEnv([lambda: gym.make(env_name)])  # For evaluation

    """ Define checkpoint callback """
    checkpoint_callback = CheckpointCallback(save_freq=SAVE_FREQ,
                                             save_path=model_name,
                                             name_prefix=MODEL_NAME)

    """ Use deterministic actions for evaluation callback """
    eval_callback = EvalCallback(eval_env,
                                 best_model_save_path=model_name,
                                 log_path=log_dir,
                                 eval_freq=EVAL_FREQ,
                                 deterministic=True,
                                 render=False,
                                 n_eval_episodes=N_EVAL_EPISODES)

    print(f'Algorithm: {ALGORITHM}\n')

    if not CONTINUAL_LEARNING:
        """ Define model """
        model = define_model(env, log_dir)
    else:
        model = load_model(env, model_dir, log_dir)

    """ Evaluate model before training """
    # mean_reward, std_reward = evaluate_policy(model=model,
    #                                           env=eval_env,
    #                                           n_eval_episodes=N_EVAL_EPISODES)
    # print(f'Before training: mean reward: {mean_reward:.2f} +/- {std_reward:.2f}')

    """ Train model """
    model.learn(total_timesteps=MAX_STEPS,
                callback=[checkpoint_callback, eval_callback])

    """ Evaluate model after training """
    # mean_reward, std_reward = evaluate_policy(model=model,
    #                                           env=eval_env,
    #                                           n_eval_episodes=N_EVAL_EPISODES)
    # print(f'After training: mean reward: {mean_reward:.2f} +/- {std_reward:.2f}')

    """ Save trained model """
    model.save(model_name)

    """ Test trained model """
    obs = eval_env.reset()
    for i in range(N_EVAL_EPISODES):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = eval_env.step(action)
        eval_env.render()

    env.close()
    eval_env.close()
def load_model(algorithm, gym_env_id):
    global best_mean_reward

    model = None
    multiprocess = False
    num_cpu = 4  # Number of processes to use in multiprocess env
    env = None
    if multiprocess:
        env = SubprocVecEnv([make_env(gym_env_id, i) for i in range(num_cpu)])
    else:
        gym_env = gym.make(gym_env_id)
        monitor_file_path = log_dir + current_time_string + "-monitor.csv"
        env = Monitor(gym_env, monitor_file_path, allow_early_resets=True)
        # vectorized environments allow to easily multiprocess training
        # we demonstrate its usefulness in the next examples
        env = DummyVecEnv([lambda: env])  # The algorithms require a vectorized environment to run

    existing_pickle_files = utils.get_files_with_pattern(
        pickle_dir, r'(.*)' + algorithm + "-best-model.pkl")
    # Sort files in reverse alphabetical order, so that models with newer dates are chosen first.
    existing_pickle_files.sort(reverse=True)
    for file_name in existing_pickle_files:
        search = re.search(r'(.*)' + algorithm + "-best-model.pkl", file_name)
        if search:
            if algorithm == 'deepq':
                model = DQN.load(file_name, env=env, verbose=verbose_level,
                                 tensorboard_log=tensorboard_dir)
            elif algorithm == 'ppo2':
                model = PPO2.load(file_name, env=env, verbose=verbose_level,
                                  tensorboard_log=tensorboard_dir)
            else:
                raise Exception("Algorithm not supported: {}".format(algorithm))

            logger.info(
                "Loading existing pickle file '{}' for environment {} with algorithm {} and policy '{}'."
                .format(file_name, gym_env_id, algorithm, model.policy))
            logger.info(
                "Searching for previous best mean reward of algorithm '{}'...".format(algorithm))
            best_mean_reward = get_best_mean_reward_from_results()
            if best_mean_reward != -np.inf:
                logger.info("Found previous best mean reward: {}".format(best_mean_reward))
            else:
                logger.info(
                    "Could not find previous best mean reward. Starting with: {}".format(best_mean_reward))
            return model

    logger.info(
        "No pickle was found for environment {}. Creating new model with algorithm {} and policy 'MlpPolicy'..."
        .format(gym_env_id, algorithm))
    if algorithm == 'deepq':
        model = DQN(policy='MlpPolicy', env=env, verbose=verbose_level,
                    tensorboard_log=tensorboard_dir)
    if algorithm == 'ppo2':
        model = PPO2(policy='MlpPolicy', env=env, verbose=verbose_level,
                     tensorboard_log=tensorboard_dir)
    return model
env_name = "AirSimNH-v0" if env_name in gym.envs.registry.env_specs: del gym.envs.registry.env_specs[env_name] # register environment gym.register(id=env_name, entry_point=envs.AirSimSimplifiedActionMetaRLEnv.AirSimEnv) from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, LstmPolicy, register_policy from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize from stable_baselines import PPO2 env = gym.make("AirSimNH-v0") # Vectorized environments allow to easily multiprocess training # we demonstrate its usefulness in the next examples env = DummyVecEnv([lambda: env ]) # The algorithms require a vectorized environment to run env = VecNormalize(env, norm_obs=True, norm_reward=True) # Meta RL basically uses an LSTM policy model = PPO2(MlpLstmPolicy, env, nminibatches=1, verbose=1, tensorboard_log="./ppo_tensorboard/") #model = PPO2.load("save_models/ppo_lidar_simplified_fixed_99", env=env, verbose=1, tensorboard_log="./ppo_tensorboard/") # Train the agent for i in range(0, 100): model.learn(total_timesteps=10000, tb_log_name="lidar_metarl_") # Save trained model model.save("ppo_lidar_metarl_" + str(i))
    n_timesteps = args.n_timesteps
else:
    n_timesteps = int(hyperparams['n_timesteps'])
del hyperparams['n_timesteps']

normalize = False
normalize_kwargs = {}
if 'normalize' in hyperparams.keys():
    normalize = hyperparams['normalize']
    if isinstance(normalize, str):
        normalize_kwargs = eval(normalize)
        normalize = True
    del hyperparams['normalize']

if not args.teleop:
    env = DummyVecEnv([make_env(args.seed, vae=vae, teleop=args.teleop)])
else:
    env = make_env(args.seed, vae=vae, teleop=args.teleop,
                   n_stack=hyperparams.get('frame_stack', 1))()

if normalize:
    if hyperparams.get('normalize', False) and args.algo in ['ddpg']:
        print("WARNING: normalization not supported yet for DDPG")
    else:
        print("Normalizing input and return")
        env = VecNormalize(env, **normalize_kwargs)

# Optional Frame-stacking
n_stack = 1
def train(algo, df, model_name, uniqueId, lr=None, gamma=None, noBacktest=1,
          cutoff_date=None, commission=0, addTA='N'):
    before = np.zeros(noBacktest)
    after = np.zeros(noBacktest)
    backtest = np.zeros(noBacktest)
    train_dates = np.empty(noBacktest, dtype="datetime64[s]")
    start_test_dates = np.empty(noBacktest, dtype="datetime64[s]")
    end_test_dates = np.empty(noBacktest, dtype="datetime64[s]")

    # print(str(df.columns.tolist()))
    dates = np.unique(df.date)
    logfile = "./log/"
    print("noBacktest", noBacktest)

    # backtest=1 uses cut off date to split train/test
    cutoff_date = np.datetime64(cutoff_date)
    print("cutoff_date", cutoff_date)
    if noBacktest == 1:
        a = np.where(dates <= cutoff_date)[0]
        b = np.where(dates > cutoff_date)[0]
        s = []
        s.append((a, b))
    else:
        # ref https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html
        splits = TimeSeriesSplit(n_splits=noBacktest)
        s = splits.split(dates)

    loop = 0
    for train_date_index, test_date_index in s:
        print("loop", loop)
        train = df[df.date.isin(dates[train_date_index])]
        test = df[df.date.isin(dates[test_date_index])]
        runtimeId = uniqueId + "_" + str(loop)

        train_dates[loop] = max(train.date)
        start_test_dates[loop] = min(test.date)
        end_test_dates[loop] = max(test.date)

        n_actions = 1
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.1 * np.ones(n_actions))

        global env
        title = runtimeId + "_Train lr=" + \
            str(lr) + ", cliprange=" + str(cliprange) + ", commission=" + str(commission)
        env = DummyVecEnv([
            lambda: StockEnvPlayer(train, logfile + runtimeId + ".csv", title,
                                   seed=seed, commission=commission, addTA=addTA)
        ])
        # Automatically normalize the input features
        env = VecNormalize(env, norm_obs=True, norm_reward=False, clip_obs=10.)

        model = algo(
            MlpPolicy,
            env,
            seed=seed,
            gamma=g,
            n_steps=128,
            ent_coef=0.01,
            learning_rate=lr,
            vf_coef=0.5,
            max_grad_norm=0.5,
            lam=0.95,
            nminibatches=4,
            noptepochs=4,
            cliprange=cliprange,
            cliprange_vf=None,
            # tensorboard_log="./tensorlog",
            _init_setup_model=True,
            policy_kwargs=None,
            full_tensorboard_log=False,
        )

        # Random Agent, before training
        print("\n*** Agent before learning ***")
        steps = len(np.unique(train.date))
        before[loop] = evaluate(model, num_steps=steps)

        model.learn(total_timesteps=round(steps))

        print("\n*** Evaluate the trained agent ***")
        after[loop] = evaluate(model, num_steps=steps)

        print("\n*** Run agent on unseen data ***")
        title = runtimeId + "_Test lr=" + \
            str(lr) + ", cliprange=" + str(cliprange) + ", commission=" + str(commission)
        env = DummyVecEnv([
            lambda: StockEnvPlayer(test, logfile + runtimeId + ".csv", title,
                                   seed=seed, commission=commission, addTA=addTA)
        ])
        env = VecNormalize(env, norm_obs=True, norm_reward=False, clip_obs=10.)
        steps = len(np.unique(test.date))
        backtest[loop] = evaluate(model, num_steps=steps)

        del model
        env.close()
        loop += 1

    # display result on screen
    for i in range(noBacktest):
        print("\ntrain_dates:", min(df.date), train_dates[i])
        print("test_dates:", start_test_dates[i], end_test_dates[i])
        print("backtest {} : SUM reward : before | after | backtest : {: 8.2f} | {: 8.2f} | {: 8.2f}"
              .format(i, before[i], after[i], backtest[i]))

    return pd.DataFrame({
        "Model": uniqueId,
        "addTA": addTA,
        "Columns": str(df.columns.tolist()),
        "commission": commission,
        "Seed": seed,
        "cliprange": cliprange,
        "learningRate": lr,
        "gamma": g,
        "backtest # ": np.arange(noBacktest),
        "StartTrainDate": min(train.date),
        "EndTrainDate": train_dates,
        "before": before,
        "after": after,
        "testDate": end_test_dates,
        "Sum Reward@roadTest": backtest
    })
    def test_equilibrate_engine(self):
        """Does the EquilibrateEngine work as expected?"""

        # Initialize engine
        eng = engines.EquilibrateEngine(
            nsteps=101,
            Tinj=300.0,
            rxnmech="dodecane_lu_nox.cti",
            mdot=0.1,
            max_minj=5e-5,
            ename="Isooctane_MBT_DI_50C_Summ.xlsx",
            reward=rw.Reward(negative_reward=-0.05),
        )
        env = DummyVecEnv([lambda: eng])
        variables = eng.observables + eng.internals + eng.histories
        df = pd.DataFrame(
            columns=list(
                dict.fromkeys(
                    variables
                    + eng.action.actions
                    + ["rewards"]
                    + eng.reward.get_rewards()
                )
            )
        )

        # Evaluate a dummy agent that injects at a fixed time
        t0 = time.time()
        done = False
        cnt = 0
        obs = env.reset()
        df.loc[cnt, variables] = [eng.current_state[k] for k in variables]
        df.loc[cnt, eng.action.actions] = 0
        rwd = list(
            eng.reward.compute(eng.current_state, eng.nsteps, False, False).values()
        )
        df.loc[cnt, eng.reward.get_rewards()] = rwd
        df.loc[cnt, ["rewards"]] = [sum(rwd)]
        while not done:
            cnt += 1
            # Agent tries to inject twice, but is not allowed the second time
            action = (
                [1]
                if (eng.current_state["ca"] == -10) or eng.current_state["ca"] == 10
                else [0]
            )
            obs, reward, done, info = env.step(action)
            df.loc[cnt, variables] = [info[0]["current_state"][k] for k in variables]
            df.loc[cnt, eng.action.actions] = eng.action.current
            df.loc[cnt, ["rewards"]] = reward
            df.loc[cnt, eng.reward.get_rewards()] = list(info[0]["rewards"].values())

        for rwd in eng.reward.get_rewards() + ["rewards"]:
            df[f"cumulative_{rwd}"] = np.cumsum(df[rwd])
        elapsed = time.time() - t0

        utilities.plot_df(env, df, idx=4, name="EQ")

        # Test
        npt.assert_allclose(np.linalg.norm(df.V), 0.002205916821815495)
        npt.assert_allclose(np.linalg.norm(df.p), 35436062.48197973)
        npt.assert_allclose(np.linalg.norm(df["T"]), 12491.93935531)
        npt.assert_allclose(np.linalg.norm(df.rewards), 118.62610333)
        npt.assert_allclose(np.linalg.norm(df.mdot), 0.14142136)
        print(f"Wall time for EquilibrateEngine = {elapsed} seconds")
    img = cv2.cvtColor(img, cv2.COLOR_RGBA2RGB)
    img = cv2.resize(img, dsize=(224, 224))
    img = np.rollaxis(img, 2, 0)
    return np.resize(img, new_shape=(1, 3, 224, 224))


# def predict(img):
#     with tf.device('/device:gpu:1'):
#         ans = roadDetection.predict(np.array([img]))
#     return ans


# random.seed(42)
env = gym.make("CarRacing-v0")
env.seed(42)
# DummyVecEnv expects a list of callables that build the env, not env instances
env = DummyVecEnv([lambda: env])
obs = env.reset()

# fileName = "50timesteps/carracing_episode_1200.pth"
dones = False
numEpisodes = 250000
inc = 1200
n_latent_var = 64
lr = 0.002
betas = (0.9, 0.999)
gamma = 0.9        # discount factor
K_epochs = 4       # update policy for K epochs
eps_clip = 0.2     # clip parameter for PPO
max_timesteps = 3000
def run_model_stablebaseline(flow_params, args, model_params=None):
    """Run the model for num_steps if provided.

    Parameters
    ----------
    flow_params :
        Flow related parameters from config.
    args :
        Training arguments from parser.

    Returns
    -------
    stable_baselines.*
        the trained model
    """
    constructor = env_constructor(params=flow_params, version=0)()
    # The algorithms require a vectorized environment to run
    env = DummyVecEnv([lambda: constructor])

    if model_params is None:
        if args.policy == 0:
            policy = MlpPolicy
        elif args.policy == 1:
            policy = LnMlpPolicy
        else:
            warnings.warn("Invalid policy type! Policy set to MlpPolicy.")
            policy = MlpPolicy
        dueling = None if args.dueling else dict(dueling=False)
        train_model = DQN(
            policy=policy,
            env=env,
            gamma=args.gamma,
            learning_rate=args.learning_rate,
            buffer_size=args.buffer_size,
            exploration_fraction=args.exploration_fraction,
            exploration_final_eps=args.exploration_final_eps,
            exploration_initial_eps=args.exploration_initial_eps,
            train_freq=args.train_freq,
            batch_size=args.batch_size,
            double_q=args.double_q,
            learning_starts=args.learning_starts,
            target_network_update_freq=args.target_network_update_freq,
            prioritized_replay=args.prioritized_replay,
            prioritized_replay_alpha=args.prioritized_replay_alpha,
            prioritized_replay_beta0=args.prioritized_replay_beta0,
            prioritized_replay_beta_iters=args.prioritized_replay_beta_iters,
            prioritized_replay_eps=args.prioritized_replay_eps,
            param_noise=args.param_noise,
            policy_kwargs=dueling,
            verbose=args.verbose,
            tensorboard_log=args.tensorboard_log,
            full_tensorboard_log=args.full_tensorboard_log)
    else:
        train_model = DQN(
            policy=model_params["policy"],
            env=env,
            gamma=model_params["gamma"],
            learning_rate=model_params["learning_rate"],
            buffer_size=model_params["buffer_size"],
            exploration_fraction=model_params["exploration_fraction"],
            exploration_final_eps=model_params["exploration_final_eps"],
            exploration_initial_eps=model_params["exploration_initial_eps"],
            train_freq=model_params["train_freq"],
            batch_size=model_params["batch_size"],
            double_q=model_params["double_q"],
            learning_starts=model_params["learning_starts"],
            target_network_update_freq=model_params["target_network_update_freq"],
            prioritized_replay=model_params["prioritized_replay"],
            prioritized_replay_alpha=model_params["prioritized_replay_alpha"],
            prioritized_replay_beta0=model_params["prioritized_replay_beta0"],
            prioritized_replay_beta_iters=model_params["prioritized_replay_beta_iters"],
            prioritized_replay_eps=model_params["prioritized_replay_eps"],
            param_noise=model_params["param_noise"],
            policy_kwargs=model_params["policy_kwargs"],
            verbose=model_params["verbose"],
            tensorboard_log=model_params["tensorboard_log"],
            full_tensorboard_log=model_params["full_tensorboard_log"])

    train_model.learn(total_timesteps=args.num_steps)
    return train_model
def objective(params):
    """
    Objective function to be minimized.

    Parameters
    ----------
    * params [list, len(params)=n_hyperparameters]
        Settings of each hyperparameter for a given optimization iteration.
        - Controlled by hyperspaces's hyperdrive function.
        - Order preserved from list passed to hyperdrive's hyperparameters argument.
    """
    config_path = join(path, 'rl', 'config', '{}.yml'.format(env_name))
    with open(config_path) as f:
        config = yaml.safe_load(f)
    print('model loaded from path: {}'.format(config_path))

    # set the parameters
    itrf, ccrf, opr, dr = params
    config['environment']['idle_time_reward_factor'] = itrf
    config['environment']['cycle_count_reward_factor'] = ccrf
    config['environment']['output_priming_reward'] = opr / 100
    config['environment']['delivery_reward'] = dr
    print(
        'Current settings for the config: \n\nidle_time_reward_factor \t:\t{}\ncycle_count_reward_factor\t\t:\t{}\n\
output_priming_reward\t\t\t:\t{}\ndelivery_reward\t\t\t:\t{}\n'.format(
            itrf, ccrf, opr / 100, dr))

    # GET MODEL CONFIG
    model_config = config['models']['PPO2']
    policy = config['main']['policy']
    n_workers = config['main']['n_workers']
    n_steps = config['main']['n_steps']
    n_eval = (n_steps / 8) / 10

    # load environment with config variables
    env_obj = getattr(rl.environments, env_name)
    env = env_obj(config)

    # multiprocess environment
    env_8 = make_vec_env(lambda: env, n_envs=n_workers)

    # define folder and path
    now = datetime.datetime.now()
    folder = '{}{}{}_{}{}'.format(now.year, str(now.month).zfill(2), str(now.day).zfill(2),
                                  str(now.hour).zfill(2), str(now.minute).zfill(2))
    specified_path = join(path, 'rl', 'trained_models', env_name, 'hyper-parameter',
                          '{}-{}{}{}{}'.format(folder, itrf, ccrf, opr, dr))
    print('Results stored in: {}'.format(specified_path))

    # callback for evaluation
    eval_callback = EvalCallback(env,
                                 best_model_save_path=specified_path,
                                 log_path=specified_path,
                                 eval_freq=n_eval,
                                 n_eval_episodes=5,
                                 verbose=0,
                                 deterministic=False,
                                 render=False)

    model = PPO2(policy, env=env_8, tensorboard_log=specified_path, **model_config)

    # LEARN MODEL
    model.learn(total_timesteps=n_steps,
                tb_log_name='{}_{}_{}_{}'.format(itrf, ccrf, opr, dr),
                callback=eval_callback)
    model_path = join(specified_path, 'model_{}_{}_{}_{}.zip'.format(itrf, ccrf, opr, dr))
    model.save(model_path)

    # test
    best_modelpath = join(specified_path, 'best_model.zip')
    test_model = PPO2.load(best_modelpath, env=DummyVecEnv([lambda: env]))

    # run test of the model
    episodes = 10
    results = {}
    results['cycle_count'] = 0
    results['idle_time'] = 0
    for episode in range(episodes):
        # Run an episode
        state = env.reset()
        done = False
        meta_data = []
        while not done:
            action, _ = test_model.predict(state, deterministic=True)
            state, reward, done, _ = env.step(action)
            if done:
                results['cycle_count'] += env.cycle_count
                results['idle_time'] += sum(env.idle_times_operator.values())

    return (results['cycle_count'] + results['idle_time']) / episodes
class StableBaselinesTradingStrategy(TradingStrategy):
    """A trading strategy capable of self-tuning, training, and evaluating with stable-baselines.

    Arguments:
        environment: An instance of a trading environment for the agent to trade within.
        model: The RL model to create the agent with. Defaults to DQN.
        policy: The RL policy to train the agent's model with. Defaults to 'MlpPolicy'.
        model_kwargs: Any additional keyword arguments to adjust the model.
        kwargs: Optional keyword arguments to adjust the strategy.
    """

    def __init__(self,
                 environment: TradingEnvironment,
                 model: BaseRLModel = DQN,
                 policy: Union[str, BasePolicy] = 'MlpPolicy',
                 model_kwargs: any = {},
                 **kwargs):
        self._model = model
        self._model_kwargs = model_kwargs

        self.environment = environment
        self._agent = self._model(policy, self._environment, **self._model_kwargs)

    @property
    def environment(self) -> 'TradingEnvironment':
        """A `TradingEnvironment` instance for the agent to trade within."""
        return self._environment

    @environment.setter
    def environment(self, environment: 'TradingEnvironment'):
        self._environment = DummyVecEnv([lambda: environment])

    def restore_agent(self, path: str):
        """Deserialize the strategy's learning agent from a file.

        Arguments:
            path: The `str` path of the file the agent specification is stored in.
        """
        self._agent = self._model.load(path, self._environment, self._model_kwargs)

    def save_agent(self, path: str):
        """Serialize the learning agent to a file for restoring later.

        Arguments:
            path: The `str` path of the file to store the agent specification in.
        """
        self._agent.save(path)

    def tune(self,
             steps: int = None,
             episodes: int = None,
             callback: Callable[[pd.DataFrame], bool] = None) -> pd.DataFrame:
        raise NotImplementedError

    def run(self,
            steps: int = None,
            episodes: int = None,
            episode_callback: Callable[[pd.DataFrame], bool] = None) -> pd.DataFrame:
        if steps is None and episodes is None:
            raise ValueError(
                'You must set the number of `steps` or `episodes` to run the strategy.')

        steps_completed = 0
        episodes_completed = 0
        average_reward = 0

        obs, state, dones = self._environment.reset(), None, [False]
        performance = {}

        while (steps is not None and (steps == 0 or steps_completed < steps)) or (
                episodes is not None and episodes_completed < episodes):
            actions, state = self._agent.predict(obs, state=state, mask=dones)
            obs, rewards, dones, info = self._environment.step(actions)

            steps_completed += 1
            average_reward -= average_reward / steps_completed
            average_reward += rewards[0] / (steps_completed + 1)

            exchange_performance = info[0].get('exchange').performance
            performance = exchange_performance if len(exchange_performance) > 0 else performance

            if dones[0]:
                if episode_callback is not None and not episode_callback(performance):
                    break

                episodes_completed += 1
                obs = self._environment.reset()

        print("Finished running strategy.")
        print("Total episodes: {} ({} timesteps).".format(episodes_completed, steps_completed))
        print("Average reward: {}.".format(average_reward))

        return performance
    set_global_seeds(seed)
    return _init


if __name__ == "__main__":
    worker_id = 10
    num_env = 2
    env_id = "/home/jim/projects/unity_ray/basic_env_linux/basic_env_linux"
    env = UnityEnv(env_id, worker_id=worker_id, use_visual=False)

    # Create log dir
    time_int = int(time.time())
    log_dir = "stable_results/basic_env_{}/".format(time_int)
    os.makedirs(log_dir, exist_ok=True)
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])  # The algorithms require a vectorized environment to run
    # env = SubprocVecEnv([make_env(env_id, log_dir, i+worker_id) for i in range(num_env)])

    model = TRPO(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=20000)
    model.save(log_dir + "model")

    # evaluate agent
    episodes = 100
    ep_r = []
    ep_l = []
    for e in range(episodes):
        obs = env.reset()
        total_r = 0.
        total_l = 0.
        while True:
    def environment(self, environment: 'TradingEnvironment'):
        self._environment = DummyVecEnv([lambda: environment])
        register(id='simglucose-' + patient_id + '-v0',
                 entry_point='simglucose.envs:T1DSimEnv',
                 kwargs={'patient_name': env_id})
        env = gym.make('simglucose-' + patient_id + '-v0')
        env.seed(seed)
        print(env_id)
        return env

    set_global_seeds(seed)
    return _init


if __name__ == "__main__":
    env = DummyVecEnv([
        make_env('adult#0{}'.format(str(i).zfill(2)), i) for i in range(1, 11)
    ])
    # model = SAC(LnMlpPolicy, env, verbose=1)
    model = ACKTR(MlpLstmPolicy, env, verbose=1)
    model.learn(total_timesteps=256000)
    model.save("ACKTR_MlpLSTM_adult_def_reward")

    # for i, p in enumerate(child_options):
    #     patient_id = p.split('#')[0] + str(i + 1)
    #     register(
    #         id='simglucose-' + patient_id + '-v0',
    #         entry_point='simglucose.envs:T1DSimEnv',
    #         kwargs={'patient_name': p}
    #     )
parser.add_argument('--save-freq',
                    help='Save the model every n steps (if negative, no checkpoint)',
                    default=-1, type=int)
args = parser.parse_args()

env_id = args.env
n_timesteps = args.n_timesteps
save_path = '{}_{}'.format(args.algo, env_id)

# Instantiate and wrap the environment
env = TimeFeatureWrapper(gym.make(env_id))

# Create the evaluation environment and callbacks
eval_env = DummyVecEnv([lambda: TimeFeatureWrapper(gym.make(env_id))])

callbacks = [EvalCallback(eval_env, best_model_save_path=save_path)]

# Save a checkpoint every n steps
if args.save_freq > 0:
    callbacks.append(CheckpointCallback(save_freq=args.save_freq,
                                        save_path=save_path,
                                        name_prefix='rl_model'))

algo = {'sac': SAC, 'td3': TD3}[args.algo]

n_actions = env.action_space.shape[0]

# Tuned hyperparameters from https://github.com/araffin/rl-baselines-zoo
    def proba_step(self, obs, state=None, mask=None):
        return self.sess.run(self.policy_proba, {self.obs_ph: obs})

    def value(self, obs, state=None, mask=None):
        return self.sess.run(self._value, {self.obs_ph: obs})


if __name__ == '__main__':
    rospy.init_node('segbot_collision_avoid', anonymous=True, log_level=rospy.WARN)

    # Create the Gym environment
    env = gym.make('HallwayCollision-v0')
    env = DummyVecEnv([lambda: env])
    rospy.loginfo("Gym environment done")

    # Set the logging system
    rospack = rospkg.RosPack()
    pkg_path = rospack.get_path('haresh_segbot_hallway')
    # outdir = pkg_path + '/training_results'
    # env = wrappers.Monitor(env, outdir, force=True)
    # rospy.loginfo("Monitor Wrapper started")

    # Loads parameters from the ROS param server
    # Parameters are stored in a yaml file inside the config directory
    # They are loaded at runtime by the launch file
    env.reset()
with open(join(specified_path, 'Bestmodel_{}'.format(args.name), 'config.yml'), 'r') as f:
    config = yaml.safe_load(f)

config_env = config['environment']
amount_of_actions = config_env['amount_output']
stop = amount_of_actions + 1

# load environment with config variables
env_obj = getattr(rl.environments, args.environment)
env = env_obj(config)

modelpath = join(specified_path, 'Bestmodel_{}'.format(args.name), 'best_model.zip')
model = PPO2.load(modelpath, env=DummyVecEnv([lambda: env]))
print(args.render)

for episode in range(10):
    # Run an episode
    state = env.reset()
    size = state.shape[0]
    done = False
    meta_data = []
    while not done:
        action, _ = model.predict(state)
        logging.debug(model.action_probability(state))
        if args.render:
            ## monitoring
            r = 4
            state_n = state
def objective(trial):
    kwargs = hyperparams.copy()
    trial.model_class = None
    kwargs.update(sample_ppo2_params(trial))

    def callback(_locals, _globals):
        """
        Callback for monitoring learning progress.

        :param _locals: (dict)
        :param _globals: (dict)
        :return: (bool) If False: stop training
        """
        self_ = _locals['self']
        trial = self_.trial

        # Initialize variables
        if not hasattr(self_, 'is_pruned'):
            self_.is_pruned = False
            self_.last_mean_test_reward = -np.inf
            self_.last_time_evaluated = 0
            self_.eval_idx = 0

        if (self_.num_timesteps - self_.last_time_evaluated) < evaluate_interval:
            return True

        self_.last_time_evaluated = self_.num_timesteps

        # Evaluate the trained agent on the test env
        rewards = []
        n_steps_done, reward_sum = 0, 0.0

        # Sync the obs rms if using vecnormalize
        # NOTE: this does not cover all the possible cases
        if isinstance(self_.test_env, VecNormalize):
            self_.test_env.obs_rms = deepcopy(self_.env.obs_rms)
            self_.test_env.ret_rms = deepcopy(self_.env.ret_rms)
            # Do not normalize reward
            self_.test_env.norm_reward = False

        obs = self_.test_env.reset()
        while n_steps_done < n_test_steps:
            # Use default value for deterministic
            action, _ = self_.predict(obs)
            obs, reward, done, _ = self_.test_env.step(action)
            reward_sum += reward
            n_steps_done += 1
            if done:
                rewards.append(reward_sum)
                reward_sum = 0.0
                obs = self_.test_env.reset()
        rewards.append(reward_sum)
        mean_reward = np.mean(rewards)

        summary = tf.Summary(value=[tf.Summary.Value(tag='evaluation',
                                                     simple_value=mean_reward)])
        _locals['writer'].add_summary(summary, self_.num_timesteps)

        self_.last_mean_test_reward = mean_reward
        self_.eval_idx += 1

        # report best or report current ?
        # report num_timesteps or elasped time ?
        trial.report(-1 * mean_reward, self_.eval_idx)
        # Prune trial if need
        if trial.should_prune(self_.eval_idx):
            self_.is_pruned = True
            return False

        return True

    commands = [[1, 0], [2, 0], [3, 0]]
    env = SubprocVecEnv([lambda: e.AidaBulletEnv(commands,
                                                 render=False,
                                                 on_rack=False,
                                                 default_reward=2,
                                                 height_weight=5,
                                                 orientation_weight=3,
                                                 direction_weight=2,
                                                 speed_weight=5)
                         for i in range(32)])
    if kwargs['normalize']:
        env = VecNormalize(env, clip_obs=1000.0, clip_reward=1000.0,
                           gamma=kwargs['gamma'])

    model = PPO2(MlpPolicy, env,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 cliprange_vf=-1,
                 verbose=0,
                 n_steps=kwargs['n_steps'],
                 nminibatches=kwargs['nminibatches'],
                 gamma=kwargs['gamma'],
                 learning_rate=kwargs['learning_rate'],
                 ent_coef=kwargs['ent_coef'],
                 cliprange=kwargs['cliprange'],
                 noptepochs=kwargs['noptepochs'],
                 lam=kwargs['lam'],
                 policy_kwargs=dict(layers=[100, 100]),
                 tensorboard_log="./optimisation/logOPTI")

    model.test_env = DummyVecEnv([lambda: e.AidaBulletEnv(commands,
                                                          render=False,
                                                          on_rack=False,
                                                          default_reward=2,
                                                          height_weight=5,
                                                          orientation_weight=3,
                                                          direction_weight=2,
                                                          speed_weight=2)])
    if kwargs['normalize']:
        model.test_env = VecNormalize(model.test_env, clip_obs=1000.0,
                                      clip_reward=1000.0, gamma=kwargs['gamma'],
                                      training=False, norm_reward=False)

    model.trial = trial
    try:
        model.learn(n_timesteps, callback=callback)
        # Free memory
        model.env.close()
        model.test_env.close()
    except AssertionError:
        # Sometimes, random hyperparams can generate NaN
        # Free memory
        model.env.close()
        model.test_env.close()
        raise

    is_pruned = False
    cost = np.inf
    if hasattr(model, 'is_pruned'):
        is_pruned = model.is_pruned
        cost = -1 * model.last_mean_test_reward

    try:
        os.mkdir("./optimisation/resultats/" + str(trial.number))
    except FileExistsError:
        print("Directory already exists")
    if kwargs['normalize']:
        try:
os.mkdir("./optimisation/resultats/"+str(trial.number)+"/normalizeData") except FileExistsError: print("Directory already exists") model.save("./optimisation/resultats/"+str(trial.number)+"/"+str(trial.number)) if kwargs['normalize']: model.env.save_running_average("./optimisation/resultats/"+str(trial.number) +"/normalizeData") del model.env, model.test_env del model if is_pruned: try: # Optuna >= 0.19.0 raise optuna.exceptions.TrialPruned() except AttributeError: raise optuna.structs.TrialPruned() return cost
LOGPATH = os.path.join(LOGDIR, LOGNAME + ".csv")
MODELPATH = os.path.join(DIR, LOGNAME + "_ckpt")
MODELPATH2 = os.path.join(DIR, "e2enavreptrainenv_latest_PPO_ckpt")
if not os.path.exists(DIR):
    os.makedirs(DIR)
if not os.path.exists(LOGDIR):
    os.makedirs(LOGDIR)

MILLION = 1000000
TRAIN_STEPS = args.n
if TRAIN_STEPS is None:
    TRAIN_STEPS = 60 * MILLION

N_ENVS = 6
if args.debug:
    env = DummyVecEnv([lambda: E2ENavRepEnv(silent=True, scenario='train')] * N_ENVS)
else:
    env = SubprocVecEnv([lambda: E2ENavRepEnv(silent=True, scenario='train')] * N_ENVS,
                        start_method='spawn')
eval_env = E2ENavRepEnv(silent=True, scenario='train')


def test_env_fn():  # noqa
    return E2ENavRepEnv(silent=True, scenario='test')


cb = NavrepEvalCallback(eval_env, test_env_fn=test_env_fn,
                        logpath=LOGPATH, savepath=MODELPATH, verbose=1)
model = PPO2(CustomPolicy, env, verbose=0)
model.learn(total_timesteps=TRAIN_STEPS + 1, callback=cb)
obs = env.reset()

model.save(MODELPATH)
model.save(MODELPATH2)
print("Model '{}' saved".format(MODELPATH))
import os
import json

import gym
import numpy as np
import matplotlib.pyplot as plt

from stable_baselines.common import set_global_seeds
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.bench import Monitor
from stable_baselines.results_plotter import load_results, ts2xy

best_mean_reward, n_steps = -np.inf, 0
best_eval_mean_reward = -np.inf
seed = 500

log_dir = "logs/mujoco/Hopper_skipq_" + str(seed) + "/"
os.makedirs(log_dir, exist_ok=True)
log_data = {'dt': [], 'eval': [], 'train': [], 'timesteps': []}
f = open(log_dir + "eval.txt", "w")

set_global_seeds(seed)
test_env = DummyVecEnv([lambda: gym.make("Hopper-v2")])
max_eval_timesteps = 5000

# Automatically normalize the input features
# test_env = VecNormalize(test_env, norm_obs=True, norm_reward=False,
#                         clip_obs=10.)


def callback(_locals, _globals):
    """
    Callback called at each step (for DQN and others) or after n steps (see ACER or PPO2)
    :param _locals: (dict)
    :param _globals: (dict)
    """
#     'gamma': params['gamma'],
#     'learning_rate': params['learning_rate'],
#     'ent_coef': params['ent_coef'],
#     'cliprange': params['cliprange'],
#     'noptepochs': int(params['noptepochs']),
#     'lam': params['lam'],
# }
# model = PPO2.load('./agents/ppo2_' + reward_strategy + '_' + str(curr_idx) + '_6' + '.pkl', env=test_env)

while True:
    test_df = df_init[16400:]
    test_env = DummyVecEnv([
        lambda: BitcoinTradingEnv(test_df,
                                  reward_func=reward_strategy,
                                  forecast_len=int(params['forecast_len']),
                                  confidence_interval=params['confidence_interval'])
    ])

print('after while')

df_init = pd.read_csv('binance.csv')
# df = df.drop(['Symbol'], axis=1)
df_init = df_init.sort_values(['Date'])
df_init = add_indicators(df_init.reset_index())

test_len = int(len(df_init) * 0.021)
train_len = int(len(df_init)) - test_len

test_df = df_init[16385:]
test_env = DummyVecEnv([
    lambda: BitcoinTradingEnv(test_df,