def train(hours):
    conn = Connection()
    env = Monitor(SpireEnv(conn), "./tmp/")
    env.reset()
    logdir = "./tboard_log"
    try:
        model = MODEL_CLASS.load(MODEL_NAME, env=env, tensorboard_log=logdir)
    except FileNotFoundError:
        model = MODEL_CLASS(MlpPolicy, env, tensorboard_log=logdir, **KWARGS)
    start = time.time()
    steps_per_hour = 7000
    steps = steps_per_hour * hours
    callback = TensorboardCallback(env)
    model.learn(total_timesteps=steps, reset_num_timesteps=False, callback=callback)
    model.save(MODEL_NAME)
    elapsed = time.time() - start
    print(f"{steps} steps processed")
    print(f"{timedelta(seconds=elapsed)} time elapsed")
    print(f"{env.total_floors} floors climbed")
    print(f"{env.total_games} games played")
    if env.total_games > 0:
        print("{:.2f} floors per game".format(env.total_floors / env.total_games))
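# Example entry point (a minimal sketch; assumes MODEL_CLASS, MODEL_NAME, KWARGS,
# Connection, SpireEnv and TensorboardCallback are defined elsewhere in this module):
if __name__ == "__main__":
    train(hours=1)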
def test_monitor(tmp_path):
    """
    Test the monitor wrapper
    """
    env = gym.make("CartPole-v1")
    env.seed(0)
    monitor_file = os.path.join(str(tmp_path), "stable_baselines-test-{}.monitor.csv".format(uuid.uuid4()))
    monitor_env = Monitor(env, monitor_file)
    monitor_env.reset()
    total_steps = 1000
    ep_rewards = []
    ep_lengths = []
    ep_len, ep_reward = 0, 0
    for _ in range(total_steps):
        _, reward, done, _ = monitor_env.step(monitor_env.action_space.sample())
        ep_len += 1
        ep_reward += reward
        if done:
            ep_rewards.append(ep_reward)
            ep_lengths.append(ep_len)
            monitor_env.reset()
            ep_len, ep_reward = 0, 0
    monitor_env.close()
    assert monitor_env.get_total_steps() == total_steps
    assert sum(ep_lengths) == sum(monitor_env.get_episode_lengths())
    assert sum(monitor_env.get_episode_rewards()) == sum(ep_rewards)
    _ = monitor_env.get_episode_times()
    with open(monitor_file, "rt") as file_handler:
        first_line = file_handler.readline()
        assert first_line.startswith("#")
        metadata = json.loads(first_line[1:])
        assert metadata["env_id"] == "CartPole-v1"
        assert set(metadata.keys()) == {"env_id", "t_start"}, "Incorrect keys in monitor metadata"
        last_logline = pandas.read_csv(file_handler, index_col=None)
        assert set(last_logline.keys()) == {"l", "t", "r"}, "Incorrect keys in monitor logline"
    os.remove(monitor_file)
def train_ppo(itr=0, timesteps=1e7, use_dummy_video=True):
    env = flappy_env.FlappyEnv(use_dummy_video)
    env = Monitor(env, f"flappy_ppo_{itr}")
    obs = env.reset()
    model = PPO(
        "CnnPolicy",
        env,
        verbose=1,
        learning_rate=1e-5,
        tensorboard_log=f"./ppo_flappy_tensorboard_{itr}/",
    )
    model.learn(total_timesteps=timesteps)
    model.save(f"ppo_flappy_{itr}")
def test_monitor_load_results(tmp_path):
    """
    test load_results on log files produced by the monitor wrapper
    """
    tmp_path = str(tmp_path)
    env1 = gym.make("CartPole-v1")
    env1.seed(0)
    monitor_file1 = os.path.join(tmp_path, "stable_baselines-test-{}.monitor.csv".format(uuid.uuid4()))
    monitor_env1 = Monitor(env1, monitor_file1)
    monitor_files = get_monitor_files(tmp_path)
    assert len(monitor_files) == 1
    assert monitor_file1 in monitor_files
    monitor_env1.reset()
    episode_count1 = 0
    for _ in range(1000):
        _, _, done, _ = monitor_env1.step(monitor_env1.action_space.sample())
        if done:
            episode_count1 += 1
            monitor_env1.reset()
    results_size1 = len(load_results(os.path.join(tmp_path)).index)
    assert results_size1 == episode_count1
    env2 = gym.make("CartPole-v1")
    env2.seed(0)
    monitor_file2 = os.path.join(tmp_path, "stable_baselines-test-{}.monitor.csv".format(uuid.uuid4()))
    monitor_env2 = Monitor(env2, monitor_file2)
    monitor_files = get_monitor_files(tmp_path)
    assert len(monitor_files) == 2
    assert monitor_file1 in monitor_files
    assert monitor_file2 in monitor_files
    monitor_env2.reset()
    episode_count2 = 0
    for _ in range(1000):
        _, _, done, _ = monitor_env2.step(monitor_env2.action_space.sample())
        if done:
            episode_count2 += 1
            monitor_env2.reset()
    results_size2 = len(load_results(os.path.join(tmp_path)).index)
    assert results_size2 == (results_size1 + episode_count2)
    os.remove(monitor_file1)
    os.remove(monitor_file2)
def train_dqn(itr=0, timesteps=1e7, use_dummy_video=True):
    env = flappy_env.FlappyEnv(use_dummy_video)
    env = Monitor(env, f"flappy_dqn_{itr}")
    obs = env.reset()
    model = DQN(
        "CnnPolicy",
        env,
        verbose=1,
        optimize_memory_usage=True,
        buffer_size=500000,
        learning_rate=1e-5,
        tensorboard_log=f"./dqn_flappy_tensorboard_{itr}/",
    )
    model.learn(total_timesteps=timesteps)
    model.save(f"dqn_flappy_{itr}")
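# Example usage (a minimal sketch; assumes flappy_env and the PPO/DQN imports from
# stable_baselines3 are available in this script, and that 1e6 steps is just an
# illustrative budget):
train_ppo(itr=0, timesteps=1_000_000)
train_dqn(itr=0, timesteps=1_000_000)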
def test(model, test_images):
    test_env = Monitor(
        PuzzleEnv(
            images=test_images,
            img_size=IMG_SIZE,
            channel_num=CHANNEL_NUM,
            puzzle_size=(3, 3),
            puzzle_type="switch",
            dist_type="manhattan",
            penalty_for_step=-0.2,
            reward_for_completiton=20,
            positive_reward_coefficient=1.0,
            obs_conf=OBS_CONF,
        )
    )
    solutions = []
    rews = []
    steps = []
    sample = len(test_images)
    errors = 0
    for iter in range(sample):
        i = 0
        done = False
        obs = test_env.reset()
        frames = [obs]
        while not done:
            i += 1
            action, _states = model.predict(obs)
            obs, rewards, done, info = test_env.step(action)
            frames.append(obs)
            rews.append(rewards)
            if i == 10000:
                errors += 1
                break
        solutions.append(frames)
        done = False
        print(i, sum(rews), rews)
        rews = []
        steps.append(i)
    print('Average steps taken: ', sum(steps) / sample)
    print('Median of steps taken: ', statistics.median(steps))
    print('Number of errors: ', errors)
    plt.hist(steps, bins=9)
    plt.savefig('fig.png')
def learn(self, initial_models):
    mesa_algo = TD3(
        "MlpPolicy", self.env, verbose=1, learning_starts=1
    )  # Note: Unnecessarily initializes parameters (could speed up a bit by fixing)
    mesa_algo.set_parameters(to_torch(initial_models), exact_match=False)
    LOG_DIR = "/home/jet/catkin_ws/src/marsha/marsha_ai/training/logs/"
    MODEL_DIR = "/home/jet/catkin_ws/src/marsha/marsha_ai/training/models/"
    callback_list = []
    callback_list.append(TensorboardCallback())
    callback_list.append(StopTrainingOnMaxEpisodes(max_episodes=5, verbose=1))
    """callback_list.append(EvalCallback(self.env, best_model_save_path=MODEL_DIR,
                                         log_path=LOG_DIR, deterministic=True,
                                         eval_freq=5, n_eval_episodes=1))"""
    mesa_algo.learn(
        total_timesteps=1000, callback=callback_list
    )  # rospy.get_param("/hyperparameters/total_timesteps")
    print("finished training! Testing mesa network...")
    test_buffer = ReplayBuffer(100, TaskEnv.observation_space, TaskEnv.action_space, device="cuda")
    test_env = Monitor(self.env)
    done = False
    ob = test_env.reset()
    while not done:
        action, state = mesa_algo.predict(ob)
        next_ob, reward, done, info = test_env.step(action)
        test_buffer.add(ob, next_ob, action, reward, done, [info])
        ob = next_ob
    meta_buffer = {"test": test_buffer, "train": mesa_algo.replay_buffer}
    optimized_mesa_parameters = mesa_algo.get_parameters()
    tf_mesa_models = from_torch(optimized_mesa_parameters)
    return meta_buffer, tf_mesa_models
def objective(trial):
    # gym environment & variables
    env = gym.make(env_id)
    # Parallel environments
    # env = make_vec_env(gym.make(env_id), n_envs=4)
    os.makedirs(logs_base_dir, exist_ok=True)
    env = Monitor(env, logs_base_dir)

    global episodes
    global mean_reward
    episodes = 0
    mean_reward = 0

    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32, 64, 128, 256, 512])
    n_steps = trial.suggest_categorical("n_steps", [256, 512, 1024, 2048, 4096])
    gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    learning_rate = trial.suggest_loguniform("lr", 2e-4, 6e-4)
    lr_schedule = "constant"
    ent_coef = trial.suggest_loguniform("ent_coef", 0.00000001, 0.1)
    clip_range = trial.suggest_categorical("clip_range", [0.1, 0.2, 0.3, 0.4])
    n_epochs = trial.suggest_categorical("n_epochs", [1, 5, 10, 20])
    gae_lambda = trial.suggest_categorical("gae_lambda", [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0])
    max_grad_norm = trial.suggest_categorical("max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5])
    vf_coef = trial.suggest_uniform("vf_coef", 0, 1)
    net_arch = trial.suggest_categorical("net_arch", ["small", "medium", "large"])
    log_std_init = trial.suggest_uniform("log_std_init", -4, 1)
    sde_sample_freq = trial.suggest_categorical("sde_sample_freq", [-1, 8, 16, 32, 64, 128, 256])
    ortho_init = False
    ortho_init = trial.suggest_categorical('ortho_init', [False, True])
    activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu', 'elu', 'leaky_relu'])

    net_arch = {
        "small": [dict(pi=[64, 64], vf=[64, 64])],
        "medium": [dict(pi=[128, 128], vf=[128, 128])],
        "large": [dict(pi=[256, 256], vf=[256, 256])],
    }[net_arch]
    activation_fn = {"tanh": nn.Tanh, "relu": nn.ReLU, "elu": nn.ELU, "leaky_relu": nn.LeakyReLU}[activation_fn]

    # Create the policy_kwargs
    # Create the model_kwargs
    # Create the callback
    # Store the policy_kwargs into log_tensorboard
    # Store the model_kwargs into log_tensorboard
    model = PPO(
        MlpPolicy,
        env,
        n_steps=n_steps,
        batch_size=batch_size,
        gamma=gamma,
        learning_rate=learning_rate,
        ent_coef=ent_coef,
        clip_range=clip_range,
        n_epochs=n_epochs,
        gae_lambda=gae_lambda,
        max_grad_norm=max_grad_norm,
        vf_coef=vf_coef,
        sde_sample_freq=sde_sample_freq,
        policy_kwargs=dict(
            log_std_init=log_std_init,
            net_arch=net_arch,
            activation_fn=activation_fn,
            ortho_init=ortho_init,
        ),
        tensorboard_log=log_tensorboard,
        verbose=0,
    )
    # ==================================================================== Hyper Parameters
    # ==================================================================== Evaluation

    class RewardCallback(BaseCallback):
        """
        Callback for saving a model (the check is done every ``check_freq`` steps)
        based on the training reward (in practice, we recommend using ``EvalCallback``).

        :param check_freq: (int)
        :param log_dir: (str) Path to the folder where the model will be saved.
            It must contain the file created by the ``Monitor`` wrapper.
        :param verbose: (int)
        """

        def __init__(self, check_freq: int, log_dir: str, verbose=1):
            super(RewardCallback, self).__init__(verbose)
            self.check_freq = check_freq
            self.log_dir = log_dir
            self.save_path = os.path.join(log_dir, 'best_model')
            self.best_mean_reward = -np.inf

        def _init_callback(self) -> None:
            # Create folder if needed
            if self.save_path is not None:
                os.makedirs(self.save_path, exist_ok=True)

        def _on_step(self) -> bool:
            if self.n_calls % self.check_freq == 0:
                # Retrieve training reward
                x, y = ts2xy(load_results(self.log_dir), 'timesteps')
                if len(x) > 0:
                    global episodes
                    global mean_reward
                    global best_reward
                    episodes = len(y)
                    # print(episodes)
                    mean_reward = np.mean(y[-50:])
                    mean_reward = round(mean_reward, 0)
                    if self.verbose > 0:
                        print(f"Episodes: {episodes}")
                        print(f"Num steps: {self.num_timesteps}")
                        print(f"Mean reward: {mean_reward:.2f} ")
                        print("=================================")
                    # Report intermediate objective value to Optuna and handle pruning
                    trial.report(mean_reward, self.num_timesteps)
                    if trial.should_prune():
                        raise optuna.TrialPruned()
                    # New best model, you could save the agent here
                    if mean_reward > best_reward:
                        best_reward = mean_reward
                        if mean_reward > reward_threshold:
                            print("REWARD ACHIEVED")
                            model.save(f"{self.save_path}/reward_achieved_{str(mean_reward)}")
                            return False
                        else:
                            model.save(f"{self.save_path}/best_model")
                    # New best model, you could save the agent here
                    # if episodes > episodes_threshold:
                    #     print("REWARD ACHIEVED")
                    #     model.save(self.save_path)
                    #     return False
            return True

    # ==================================================================== Training
    check_freq = int(timesteps / 10) if int(timesteps / 10) > 0 else 1
    callback = RewardCallback(check_freq=check_freq, log_dir=logs_base_dir)
    model.learn(total_timesteps=int(timesteps), callback=callback)
    # ==== Reset environment
    del model
    env.reset()
    return mean_reward
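# Driving the study (a minimal sketch; assumes `optuna` is imported and that the
# module-level names used inside `objective` -- env_id, logs_base_dir, timesteps,
# reward_threshold, best_reward, log_tensorboard -- are defined elsewhere in this script):
study = optuna.create_study(direction="maximize", pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=20)
print("Best hyperparameters:", study.best_params)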
    verbose=1,
    buffer_size=int(1e6),
    learning_rate=1e-3,
    gamma=0.95,
    tensorboard_log="./her_overcooked",
    batch_size=256,
    online_sampling=online_sampling,
    action_noise=action_noise,
    # policy_kwargs=dict(net_arch=[256, 256, 256]),
)
# model = HER.load('./her_bit_env250.zip', env=env)

# Train the model
for i in range(1000):
    model.learn(10000)
    model.save(f"./her_bit_env{i}")

# model = HER.load('./her_bit_env', env=env)

obs = env.reset()
for _ in range(100):
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    env.render()
    episode_reward += reward
    if done or info.get("is_success", False):
        print("Reward:", episode_reward, "Success?", info.get("is_success", False))
        episode_reward = 0.0
        obs = env.reset()
class OffPolicy_BaseLine(RLSPAgent):
    """
    RLSP DDPG Agent
    This class creates a DDPG agent with params for RLSP
    """

    def __init__(self, agent_helper):
        self.agent_helper = agent_helper
        # create model
        # TODO: add number of envs for multiprocessing later for faster training:
        self.create()

    def create(self, n_envs=1):
        """Create the agent"""
        self.env = self.agent_helper.env
        log_dir = self.agent_helper.config_dir
        os.makedirs(log_dir, exist_ok=True)
        self.env = Monitor(self.env, log_dir)
        # TODO: Create the DDPG policy and define its hyperparameters here,
        # including the action space and observation space.
        # add policy
        policy_name = self.agent_helper.config['policy']
        self.policy = eval(policy_name)
        # action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
        n_actions = int(self.agent_helper.env.action_space.shape[0])
        action_noise = NormalActionNoise(
            mean=np.zeros(n_actions),
            sigma=self.agent_helper.config['rand_sigma'] * np.ones(n_actions))
        # FIXME: test:
        # self.model = DDPG("MlpPolicy", self.env, action_noise=action_noise, verbose=1,
        #                   tensorboard_log=self.agent_helper.graph_path)
        # TODO: fix the observation space and action space later. Check that the
        # observation space input and the output action space are correct.
        # activ_function_name = self.agent_helper.config['nn_activ']
        # activ_function = eval(activ_function_name)
        # policy_kwargs = dict(activation_fn=activ_function,
        #                      net_arch=[dict(pi=[32, 32], qf=[32, 32])])
        policy_kwargs = dict(net_arch=self.agent_helper.config['layers'])
        self.model = OffPolicyAlgorithm(
            self.policy,
            self.env,
            learning_rate=self.agent_helper.config['learning_rate'],
            buffer_size=self.agent_helper.config['buffer_size'],
            batch_size=self.agent_helper.config['batch_size'],
            tau=self.agent_helper.config['tau'],
            gamma=self.agent_helper.config['gamma'],
            gradient_steps=self.agent_helper.config['gradient_steps'],
            action_noise=action_noise,
            optimize_memory_usage=self.agent_helper.config['optimize_memory_usage'],
            create_eval_env=self.agent_helper.config['create_eval_env'],
            policy_kwargs=policy_kwargs,
            verbose=self.agent_helper.config['verbose'],
            learning_starts=self.agent_helper.config['learning_starts'],
            tensorboard_log=self.agent_helper.graph_path,
            seed=self.agent_helper.seed)

    def test_env(self):
        logger.info(f"Model: {self.model.get_env()}")

    def fit(self, env, episodes, verbose, episode_steps, callbacks, log_interval, agent_id=-1):
        """Mask the agent fit function to train the agent."""
        logger.info("here")
        # self.model.learn(total_timesteps=100, log_interval=10)
        # FIXME: use a meaningful tb_log_name!
        # TODO: Write callback funcs here:
        # List of callbacks:
        # CheckpointCallback: save the model every 10 episodes.
        checkpoint_callback = CheckpointCallback(
            save_freq=96,
            save_path=self.agent_helper.config_dir,
            name_prefix='rl_model')
        # EvalCallback: evaluate every eval_freq steps, save the best model to best_model_save_path.
        eval_env = env
        eval_callback = EvalCallback(eval_env,
                                     best_model_save_path='./logs/',
                                     log_path='./logs/',
                                     eval_freq=500,
                                     deterministic=True,
                                     render=False)
        # StopTrainingOnRewardThreshold: stop training once the reward threshold is reached,
        # i.e. the model is considered good enough.
        callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=70, verbose=1)
        eval_callback_reward_threshold = EvalCallback(
            eval_env, callback_on_new_best=callback_on_best, verbose=1)
        # EveryNTimesteps: call a callback every n time steps to save the model.
        checkpoint_on_event = CheckpointCallback(save_freq=1, save_path='./logs/')
        event_callback_after_n_steps = EveryNTimesteps(
            n_steps=500, callback=checkpoint_on_event)
        # StopTrainingOnMaxEpisodes: stop training when the model reaches the maximum number of episodes.
        callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=5, verbose=1)
        # CallbackList: chain several callbacks together.
        callbacklist = CallbackList([checkpoint_callback, eval_callback])
        logger.info(f"Model: {self.model.get_env()}")
        with ProgressBarManager(log_interval) as progress_callback:
            self.model.learn(total_timesteps=log_interval,
                             callback=[progress_callback, checkpoint_callback])
        # mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        # self.eval_writer(mean_reward, std_reward)

    def test(self, env, episodes, verbose, episode_steps, callbacks, sim):
        """Mask the agent test function."""
        logger.info(f"episodes: {episodes}, episode_steps: {episode_steps}")
        if self.agent_helper.train:
            # Create a fresh simulator with test argument
            logger.info("Create new Environment!")
            self.agent_helper.env.simulator = create_simulator(self.agent_helper)
        obs = self.env.reset()
        self.setup_writer()
        self.setup_run_writer()
        episode = 1
        step = 0
        episode_reward = 0.0
        done = False
        # action, _states = self.model.predict(obs)
        # obs, reward, dones, info = self.env.step(action)
        # logger.info(f"info: {info}")
        # Test for 1 episode
        while not done:
            action, _states = self.model.predict(obs)
            obs, reward, dones, info = self.env.step(action)
            episode_reward += reward
            self.write_run_reward(step, reward)
            if sim:
                step = info['sim_time']
                if step >= (self.agent_helper.episode_steps * self.agent_helper.n_steps_per_episode):
                    done = True
                    self.write_reward(episode, episode_reward)
            else:
                step = info['step']
                if step >= self.agent_helper.episode_steps:
                    done = True
                    self.write_reward(episode, episode_reward)
            # episode += 1
            # sys.stdout.write(
            #     "\rTesting:" +
            #     f"Current Simulator Time: {step}. Testing duration: {self.agent_helper.episode_steps}\n")
            # sys.stdout.flush()
        # print("")

    def save_weights(self, file, overwrite=True):
        weights_file = f"{file}weights"
        dir_path = os.path.dirname(os.path.realpath(weights_file))
        os.makedirs(dir_path, exist_ok=True)
        # After training is done, we save the final weights in the result_base_path.
        logger.info("saving model and weights to %s", weights_file)
        # self.agent.save_weights(weights_file, overwrite)
        self.model.save(weights_file)

    def load_weights(self, weights_file):
        """Load the model from a zip archive"""
        self.model = OffPolicyAlgorithm.load(weights_file)

    def setup_writer(self):
        episode_reward_filename = f"{self.agent_helper.config_dir}/episode_reward.csv"
        episode_reward_header = ['episode', 'reward']
        self.episode_reward_stream = open(episode_reward_filename, 'a+', newline='')
        self.episode_reward_writer = csv.writer(self.episode_reward_stream)
        self.episode_reward_writer.writerow(episode_reward_header)

    def setup_run_writer(self):
        run_reward_filename = f"{self.agent_helper.config_dir}/run_reward.csv"
        run_reward_header = ['run', 'reward']
        self.run_reward_stream = open(run_reward_filename, 'a+', newline='')
        self.run_reward_writer = csv.writer(self.run_reward_stream)
        self.run_reward_writer.writerow(run_reward_header)

    def write_reward(self, episode, reward):
        self.episode_reward_writer.writerow([episode, reward])

    def write_run_reward(self, step, reward):
        self.run_reward_writer.writerow([step, reward])

    def eval_writer(self, mean_reward, std_reward):
        episode_reward_filename = f"{self.agent_helper.config_dir}evaluate_agent.csv"
        episode_reward_header = ['mean_reward', 'std_reward']
        self.episode_reward_stream = open(episode_reward_filename, 'a+', newline='')
        self.episode_reward_writer = csv.writer(self.episode_reward_stream)
        self.episode_reward_writer.writerow(episode_reward_header)
        self.episode_reward_writer.writerow([mean_reward, std_reward])
def main(args, unknown_args):  # noqa: C901
    # path to the configuration file
    path = os.path.join(script_dir, 'configs', args.config)

    # check if the algorithm is implemented
    if args.algo not in ALGOS:
        raise NotImplementedError('the algorithm specified has not been recognized !!')

    # parsing the config file and the args parser
    config_file = configparser.ConfigParser()
    config_file.read(path)
    n_timesteps = config_file.getint('ADAPT', 'total_timesteps')
    env_id = config_file['ADAPT']['environment']
    n_eval_episodes = 5
    n_eval_test = 5
    eval_freq = 10
    n_trials = 20

    # Create the saving directory
    log_folder = os.path.join(script_dir, 'saved_models')
    algo = args.algo
    folder = log_folder
    # if args.exp_id == 0:
    #     args.exp_id = get_latest_run_id(os.path.join(folder, algo), env_id)
    #     print(f"Loading latest experiment, id={args.exp_id}")

    # Sanity checks
    if args.exp_id > 0:
        log_path = os.path.join(folder, algo, f"{env_id}_{args.exp_id}")
    else:
        log_path = os.path.join(folder, algo)
    assert os.path.isdir(log_path), f"The {log_path} folder was not found"

    if args.load_best:
        model_path = os.path.join(log_path, "best_model.zip")
        found = os.path.isfile(model_path)
        if not found:
            raise ValueError(f"No model found for {algo} on {env_id}, path: {model_path}")

    off_policy_algos = ["qrdqn", "dqn", "ddpg", "sac", "her", "td3", "tqc"]
    if algo in off_policy_algos:
        args.n_envs = 1

    set_random_seed(args.seed)

    stats_path = os.path.join(log_path, env_id)
    hyperparams, stats_path = get_saved_hyperparams(stats_path, norm_reward=False, test_mode=True)

    # load env_kwargs if existing
    env_kwargs = {}
    args_path = os.path.join(log_path, env_id, "args.yml")
    if os.path.isfile(args_path):
        with open(args_path, "r") as f:
            loaded_args = yaml.load(f, Loader=yaml.UnsafeLoader)  # pytype: disable=module-attr
            if loaded_args["env_kwargs"] is not None:
                env_kwargs = loaded_args["env_kwargs"]

    env = Monitor(gym.make(f"deep_calibration:{env_id}"), log_path)
    eval_env = NormalizeActionWrapper(env)

    kwargs = dict(seed=args.seed)
    if algo in off_policy_algos:
        # Dummy buffer size as we don't need memory to enjoy the trained agent
        kwargs.update(dict(buffer_size=1))

    # Check if we are running python 3.8+
    # we need to patch saved models under python 3.6/3.7 to load them
    newer_python_version = sys.version_info.major == 3 and sys.version_info.minor >= 8
    custom_objects = {}
    if newer_python_version:
        custom_objects = {
            "learning_rate": 0.0,
            "lr_schedule": lambda _: 0.0,
            "clip_range": lambda _: 0.0,
        }

    best_model = ALGOS[algo].load(model_path, env=env, custom_objects=custom_objects, **kwargs)
    obs = env.reset()
    np.set_printoptions(precision=5, suppress=True)

    try:
        # sample an observation from the environment and compute the action
        dists = []
        actions = []
        for i in range(n_eval_episodes):
            obs = eval_env.reset()
            action = best_model.predict(obs, deterministic=True)[0]
            action = eval_env.rescale_action(action)
            actions.append(action)
            dist = eval_env.distance_to_goal(action)
            print(f'distance to goal for config {i} = {dist:.6f}')
            dists.append(dist)
            # print(f'parameters for config {i} is {action}')
        print(f'mean distance = {np.mean(dists):.6f}')

        ind = np.argmin(dists)
        best_action = actions[ind]
        print(f'best distance = {dists[ind]:.6f}')
        print(f'best action = {best_action}')
        std_actions = std(actions, best_action)
        print(f'std actions = {std_actions}')
        print('########################################################')

        for i, action in enumerate(actions):
            print(f'config {i}')
            print(f'action = {action}')
            std_actions = std(actions, action)
            print(f'std actions = {std_actions}')
            dists = []
            for i in range(n_eval_episodes):
                obs = eval_env.reset()
                dist = eval_env.distance_to_goal(action)
                dists.append(dist)
                print(f'distance to goal for config {i} = {dist:.6f}')
            print(f'mean distance = {np.mean(dists):.6f}')

        # # testing for random configurations
        # eval_env.rand = 1
        # dists = []
        # for i in range(n_eval_test):
        #     obs = eval_env.reset()
        #     action = best_model.predict(obs, deterministic=True)[0]
        #     action = eval_env.rescale_action(action)
        #     dist = eval_env.distance_to_goal(action)
        #     print(f'best distance to goal for a random config {i} is {dist}')
        #     dists.append(dist)
        # print('best random mean distance: ', np.mean(dists))
    except KeyboardInterrupt:
        pass