def ppo1_nmileg_pool(sensory_value):
    RL_method = "PPO1"
    # total_MC_runs = 50
    experiment_ID = "handtest_rot_pool_with_MC_C_task0/"
    save_name_extension = RL_method
    total_timesteps = 500000
    sensory_info = "sensory_{}".format(sensory_value)
    current_mc_run_num = 22  # starts from 0
    for mc_cntr in range(current_mc_run_num, current_mc_run_num + 1):
        log_dir = "./logs/{}/MC_{}/{}/{}/".format(experiment_ID, mc_cntr, RL_method, sensory_info)
        # defining the environment
        env = gym.make('HandManipulate-v1{}'.format(sensory_value))
        # env = gym.wrappers.Monitor(env, "./tmp/gym-results", video_callable=False, force=True)
        # setting the Monitor
        env = gym.wrappers.Monitor(env, log_dir + "Monitor/", video_callable=False,
                                   force=True, uid="Monitor_info")
        # defining the initial model
        if RL_method == "PPO1":
            model = PPO1(common_MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
        elif RL_method == "PPO2":
            env = DummyVecEnv([lambda: env])
            model = PPO2(common_MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
        elif RL_method == "DDPG":
            env = DummyVecEnv([lambda: env])
            n_actions = env.action_space.shape[-1]
            param_noise = None
            action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                        sigma=float(0.5) * 5 * np.ones(n_actions))
            model = DDPG(DDPG_MlpPolicy, env, verbose=1, param_noise=param_noise,
                         action_noise=action_noise, tensorboard_log=log_dir)
        else:
            raise ValueError("Invalid RL mode")
        # setting the environment on the model
        # model.set_env(env)
        # setting the random seed for some of the random instances
        random_seed = mc_cntr
        random.seed(random_seed)
        env.seed(random_seed)
        env.action_space.seed(random_seed)
        np.random.seed(random_seed)
        tf.random.set_random_seed(random_seed)
        # training the model
        model.learn(total_timesteps=total_timesteps)
        # saving the trained model
        model.save(log_dir + "/model")
    return None
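# Minimal sketch (not part of the original script) of how ppo1_nmileg_pool could be
# dispatched over several sensory configurations with multiprocessing.Pool, which its
# name suggests. The sensory values are placeholders, and the imports used by the
# function above (gym, numpy, tensorflow, random, stable_baselines) are assumed to exist.
if __name__ == "__main__":
    from multiprocessing import Pool

    sensory_values = [0, 1, 2]  # hypothetical sensory configurations
    with Pool(processes=len(sensory_values)) as pool:
        # each worker trains one model and writes its logs under ./logs/<experiment_ID>/
        pool.map(ppo1_nmileg_pool, sensory_values)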
def train_initial_policy(model_name, algo=ALGO, env_name=ENV_NAME, time_steps=TIME_STEPS):
    """Uses the specified algorithm on the target environment"""
    print("Using algorithm : ", algo.__name__)
    print("Model saved as : ",
          "data/models/" + algo.__name__ + "_initial_policy_" + env_name + "_.pkl")

    # define the environment here
    env = gym.make(env_name)
    env.seed(SEED)
    if NOISE_VALUE > 0:
        env = NoisyRealEnv(env, noise_value=NOISE_VALUE)

    if MUJOCO_NORMALIZE:
        env = MujocoNormalized(env)

    print('~~ ENV Obs RANGE : ', env.observation_space.low, env.observation_space.high)
    print('~~~ ENV Action RANGE : ', env.action_space.low, env.action_space.high)

    if algo.__name__ == "ACKTR":
        print('Using SubprocVecEnv')
        env = SubprocVecEnv([lambda: env for i in range(8)])
    elif algo.__name__ == "SAC":
        print('Using standard gym environment')
        env = env
    else:
        print('Using Dummy Vec Env')
        env = DummyVecEnv([lambda: env])

    if NORMALIZE:
        env = VecNormalize(env,
                           training=True,
                           norm_obs=True,
                           norm_reward=False,
                           clip_reward=1e6,
                           )

    with open('data/target_policy_params.yaml') as file:
        args = yaml.load(file, Loader=yaml.FullLoader)
    args = args[algo.__name__][PARAMS_ENV]
    print('~~ Loaded args file ~~')

    if algo.__name__ == "SAC":
        print('Initializing SAC with RLBaselinesZoo hyperparameters .. ')
        print('using 256 node architecture as in the paper')

        class CustomPolicy(ffp_sac):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy, self).__init__(*args, **kwargs,
                                                   feature_extraction="mlp",
                                                   layers=[256, 256])

        model = SAC(CustomPolicy, env,
                    verbose=1,
                    tensorboard_log='data/TBlogs/initial_policy_training',
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    ent_coef=args['ent_coef'],
                    learning_starts=args['learning_starts'],
                    learning_rate=args['learning_rate'],
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )
    elif algo.__name__ == "TD3":
        print('Initializing TD3 with RLBaselinesZoo hyperparameters .. ')
        # hyperparameters suggestions from :
        # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/td3/HopperBulletEnv-v0/config.yml
        n_actions = env.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=float(args['noise_std']) * np.ones(n_actions))

        class CustomPolicy2(ffp_td3):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy2, self).__init__(*args, **kwargs,
                                                    feature_extraction="mlp",
                                                    layers=[400, 300])

        model = TD3(CustomPolicy2, env,
                    verbose=1,
                    tensorboard_log='data/TBlogs/initial_policy_training',
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    gamma=args['gamma'],
                    gradient_steps=args['gradient_steps'],
                    learning_rate=args['learning_rate'],
                    learning_starts=args['learning_starts'],
                    action_noise=action_noise,
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )
    elif algo.__name__ == "TRPO":
        print('Initializing TRPO with RLBaselinesZoo hyperparameters .. ')
        # hyperparameters suggestions from :
        # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/sac/HopperBulletEnv-v0/config.yml
        model = TRPO(mlp_standard, env,
                     verbose=1,
                     tensorboard_log='data/TBlogs/initial_policy_training',
                     timesteps_per_batch=args['timesteps_per_batch'],
                     lam=args['lam'],
                     max_kl=args['max_kl'],
                     gamma=args['gamma'],
                     vf_iters=args['vf_iters'],
                     vf_stepsize=args['vf_stepsize'],
                     entcoeff=args['entcoeff'],
                     cg_damping=args['cg_damping'],
                     cg_iters=args['cg_iters'],
                     seed=SEED,
                     )
    elif algo.__name__ == "ACKTR":
        print('Initializing ACKTR')
        model = ACKTR(mlp_standard, env,
                      verbose=1,
                      n_steps=128,
                      ent_coef=0.01,
                      lr_schedule='constant',
                      learning_rate=0.0217,
                      max_grad_norm=0.5,
                      gamma=0.99,
                      vf_coef=0.946,
                      seed=SEED)
    elif algo.__name__ == "PPO2":
        print('Initializing PPO2')
        print('Num envs : ', env.num_envs)
        model = PPO2(mlp_standard, env,
                     n_steps=int(args['n_steps'] / env.num_envs),
                     nminibatches=args['nminibatches'],
                     lam=args['lam'],
                     gamma=args['gamma'],
                     ent_coef=args['ent_coef'],
                     noptepochs=args['noptepochs'],
                     learning_rate=args['learning_rate'],
                     cliprange=args['cliprange'],
                     verbose=1,
                     tensorboard_log='data/TBlogs/initial_policy_training',
                     seed=SEED,
                     )
    else:
        print('No algorithm matched. Using SAC .. ')
        # NOTE: CustomPolicy is only defined in the SAC branch above
        model = SAC(CustomPolicy, env,
                    verbose=1,
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    ent_coef=args['ent_coef'],
                    learning_starts=args['learning_starts'],
                    learning_rate=args['learning_rate'],
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )

    # change model name if using normalization
    if NORMALIZE:
        model_name = model_name.replace('.pkl', 'normalized_.pkl')
    elif MUJOCO_NORMALIZE:
        model_name = model_name.replace('.pkl', 'mujoco_norm_.pkl')

    if SAVE_BEST_FOR_20:
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name,
                    log_interval=10,
                    callback=eval_callback)
        save_the_model()
        model_name = model_name.replace('best_', '')
        model.save(model_name)
    elif SAVE_INTERMEDIATE:
        check_callback = CheckpointCallback(save_freq=SAVE_FREQ,
                                            save_path=model_name[:-4],
                                            name_prefix=ENV_NAME + '_' + str(SEED),
                                            verbose=1,
                                            )
        eval_env = DummyVecEnv([lambda: gym.make(ENV_NAME)])
        eval_env.seed(SEED)
        eval_callback = EvalCallback(eval_env,
                                     n_eval_episodes=10,
                                     eval_freq=SAVE_FREQ,
                                     log_path=model_name[:-4],
                                     deterministic=False,
                                     render=False,
                                     verbose=1)
        callbacks = CallbackList([check_callback, eval_callback])
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name.split('/')[-1],
                    log_interval=10,
                    callback=callbacks)
        model.save(model_name)
        npzfile = np.load(model_name[:-4] + '/evaluations.npz')
        average_rewards = np.mean(npzfile['results'], axis=1)[:, 0]
        with open(model_name[:-4] + "/eval_results.txt", "a") as f:
            for i in range(np.shape(average_rewards)[0]):
                f.write("{}, {}\n".format(npzfile['timesteps'][i], average_rewards[i]))
        evaluate_policy_on_env(env, model, render=False, iters=50)
    else:
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name.split('/')[-1],
                    log_interval=10,)
        model.save(model_name)
        evaluate_policy_on_env(env, model, render=False, iters=50)

    # save the environment params
    if NORMALIZE:
        # env.save(model_name.replace('.pkl', 'stats_.pkl'))
        env.save('data/models/env_stats/' + env_name + '.pkl')

    print('done :: ', model_name)
    exit()
def main():
    args = get_args()
    choose_device(args.device)
    set_global_seeds(args.seed)

    env_id = args.env
    exp_id = args.exp_id
    algo = args.algo
    env_name = env_id[:-3]
    env_index = env_list.index(env_id)

    # Pass CustomEnv arguments: follow this for your CustomEnv if reward not known prior to training
    env_kwargs = {} if args.env_kwargs is None else args.env_kwargs
    if (args.env_kwargs is not None) and (env_id in ['AirSim-v0']):
        if 'rew_land' in env_kwargs:
            if int(env_kwargs['rew_land']) in [500, 1000, 10000]:
                env_success[-1] = int(env_kwargs['rew_land'])
            else:
                raise ValueError('Given env reward not acceptable. Please try again')

    params = [exp_id, env_name.lower()]
    folder = [exp_id, env_name.lower(), args.algo.lower()]
    tensorboard_path, monitor_path, callback_path = None, None, None
    if args.tensorboard:
        tensorboard_path = "tensorboard/{}_{}".format(*params)
        make_dir(tensorboard_path)

    # if args.train_RL:  # Begin training here (location of this condition also decides experiment performance)

    # Load hyperparameters from yaml file
    with open('hyperparams/{}.yml'.format(args.algo), 'r') as f:
        hyperparams_dict = yaml.safe_load(f)
        if env_id in list(hyperparams_dict.keys()):
            hyperparams = hyperparams_dict[env_id]
        else:
            raise ValueError("Hyperparameters not found for {}-{}".format(args.algo, env_id))

    if args.hyperparams is not None:
        # Overwrite hyperparams if needed
        hyperparams.update(args.hyperparams)

    # OPTIONAL: Print saved hyperparams
    saved_hyperparams = OrderedDict([(key, hyperparams[key]) for key in sorted(hyperparams.keys())])
    if args.verbose > 0:
        pprint(saved_hyperparams)

    if args.n_envs > 1:
        # if args.verbose:
        print("Overwriting n_envs with n={}".format(args.n_envs))
        n_envs = args.n_envs
    else:
        n_envs = hyperparams.get('n_envs', 1)

    # choose Monitor log path according to multiprocessing setting
    if args.monitor:
        if n_envs == 1:
            monitor_path = 'logs/single/{}_{}_{}'.format(*folder)
        else:
            if algo not in ['dqn', 'her', 'sac', 'td3']:
                monitor_path = 'logs/multi/{}_{}_{}'.format(*folder)
        make_dir(monitor_path)

    if int(float(args.timesteps_RL)) > 0:
        # if args.verbose:
        print("Overwriting n_timesteps with n={}".format(int(float(args.timesteps_RL))))
        n_timesteps = int(float(args.timesteps_RL))
    else:
        n_timesteps = int(hyperparams['n_timesteps'])

    # Convert to python object if needed
    if 'policy_kwargs' in hyperparams.keys() and isinstance(hyperparams['policy_kwargs'], str):
        hyperparams['policy_kwargs'] = eval(hyperparams['policy_kwargs'])

    if 'n_envs' in hyperparams.keys():
        del hyperparams['n_envs']
    del hyperparams['n_timesteps']  # to avoid error

    env_wrapper = get_wrapper_class(hyperparams)
    if 'env_wrapper' in hyperparams.keys():
        del hyperparams['env_wrapper']

    # if (algo == 'ppo2' and ('learning_rate' in hyperparams.keys())):
    #     hyperparams['learning_rate'] = linear_schedule(hyperparams['learning_rate'])

    def create_env(n_envs, eval_env=False):
        if algo in ['a2c', 'acer', 'acktr', 'ppo2']:
            if n_envs > 1:
                env = SubprocVecEnv([make_env(env_id, i, args.seed,
                                              log_dir=monitor_path,
                                              wrapper_class=env_wrapper,
                                              env_kwargs=env_kwargs) for i in range(n_envs)])
            else:
                env = DummyVecEnv([make_env(env_id, 0, args.seed,
                                            log_dir=monitor_path,
                                            wrapper_class=env_wrapper,
                                            env_kwargs=env_kwargs)])
                env = DummyVecEnv([lambda: gym.make(env_id, **env_kwargs)])
                if env_wrapper is not None:
                    env = env_wrapper(env)
        elif (algo in ['dqn', 'her', 'sac', 'td3']) and n_envs > 1:
            raise ValueError("Error: {} does not support multiprocessing!".format(algo))
        elif (algo in ['ddpg', 'ppo1', 'trpo', 'gail']) and n_envs > 1:
            raise ValueError("Error: {} uses MPI for multiprocessing!".format(algo))
        else:
            env = make_vec_env(env_id, n_envs=n_envs, seed=args.seed,
                               monitor_dir=monitor_path,
                               wrapper_class=env_wrapper,
                               env_kwargs=env_kwargs)
        if args.normalize:
            # choose from multiple options
            # env = VecNormalize(env, clip_obs=np.inf)
            env = VecNormalize(env, norm_reward=False, clip_obs=np.inf)
            # env = VecNormalize(env, norm_reward=False, clip_obs=np.inf, **normalize_kwargs)
        return env
        # Zoo: env = SubprocVecEnv([make_env(env_id, i, seed, log_dir, wrapper_class=env_wrapper, env_kwargs=env_kwargs) for i in range(n_envs)])
        # Zoo: env = DummyVecEnv([make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper, env_kwargs=env_kwargs)])

    env = create_env(n_envs)

    # if args.train_RL:  # checking impact of the if-condition position on experiment reproducibility
    callback, callback_path = [], "callbacks/{}_{}_{}".format(*folder)
    save_freq, eval_freq = 100 * episode_len[env_index], 100 * episode_len[env_index]
    save_freq, eval_freq = max(save_freq // n_envs, 1), max(eval_freq // n_envs, 1)
    make_dir(callback_path)
    if args.check_callback:
        callback.append(CheckpointCallback(save_freq=save_freq,
                                           save_path=callback_path,
                                           name_prefix='rl_model',
                                           verbose=1))
    if args.eval_callback:
        callback.append(EvalCallback(create_env(1, eval_env=True),
                                     best_model_save_path=callback_path,
                                     log_path=callback_path,
                                     eval_freq=eval_freq,
                                     verbose=1))

    model = (algo_list[args.algo])(env=env,
                                   seed=args.seed,
                                   tensorboard_log=tensorboard_path,
                                   n_cpu_tf_sess=1,
                                   verbose=args.verbose,
                                   **hyperparams)

    print('\nTraining {} on {} now... \n'.format(algo, env_id))
    start_time = time.time()
    model.learn(total_timesteps=n_timesteps, callback=callback)
    total_time = time.time() - start_time

    if args.normalize:
        env.save(os.path.join(callback_path, "vec_normalize.pkl"))

    if n_envs > 1 or (algo in ['ddpg', 'trpo', 'gail']):
        print("Took {:.2f}s for multiprocessed version - {:.2f} FPS".format(
            total_time, n_timesteps / total_time))
    else:
        print("Took {:.2f}s for single process version - {:.2f} FPS".format(
            total_time, n_timesteps / total_time))

    env = DummyVecEnv([make_env(env_id, 0, args.seed, env_kwargs=env_kwargs)])
    if args.normalize:
        env = VecNormalize.load(os.path.join(callback_path, "vec_normalize.pkl"), env)
        env.training = False
        env.norm_reward = False
    env.seed(args.seed)

    # Evaluate RL model - choose either best model or last available model
    model = (algo_list[algo]).load(os.path.join(callback_path, 'best_model'))
    # model = (algo_list[algo]).load("models/{}_{}_{}".format(*folder))
    model.set_env(env)
    evaluate('policy', model, env_id, env, algo, 100)

    if args.monitor:
        results_plotter.plot_results([monitor_path], n_timesteps,
                                     results_plotter.X_TIMESTEPS,
                                     "{} {}".format(algo, env_id))
        plot_results(monitor_path)

    if args.test:
        print('\nTesting policy...\n')
        obs = env.reset()
        # initialize test counters (they were used without assignment in the original)
        episode_reward, total_reward = 0.0, 0.0
        done_count, success_count = 0, 0
        for _ in range(n_timesteps):
            action, _states = model.predict(obs, deterministic=True)
            if isinstance(env.action_space, gym.spaces.Box):
                action = np.clip(action, env.action_space.low, env.action_space.high)
            obs, rewards, dones, info = env.step(action)
            episode_reward += rewards
            env.render()
            if dones:
                done_count += 1
                success_count = check_success(env_index, env_success, success_count)
                total_reward += episode_reward
                episode_reward = 0
                env.reset()
        print('\n{}/{} successful episodes'.format(success_count, done_count))
        average_reward = total_reward / done_count
        print('\nAverage reward: {}'.format(average_reward))

    env.close()
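# Hedged sketch of how main() is typically driven and of the hyperparams/<algo>.yml layout
# it expects: one top-level key per environment id holding n_timesteps plus optional
# n_envs, policy_kwargs and env_wrapper entries alongside the algorithm's keyword
# arguments (these are the keys main() reads or deletes). The environment id and values
# below are illustrative placeholders, not the repository's actual settings.
EXAMPLE_HYPERPARAMS_YML = """
Pendulum-v0:
  n_envs: 8
  n_timesteps: !!float 1e6
  policy_kwargs: "dict(net_arch=[64, 64])"
"""

if __name__ == '__main__':
    main()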
class BaseRLModel(ABC):
    """
    The base RL model

    :param policy: (BasePolicy) Policy object
    :param env: (Gym environment) The environment to learn from
        (if registered in Gym, can be str. Can be None for loading trained models)
    :param policy_base: (BasePolicy) the base policy used by this method
    :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 debug
    :param support_multi_env: (bool) Whether the algorithm supports training
        with multiple environments (as in A2C)
    :param create_eval_env: (bool) Whether to create a second environment that will be
        used for evaluating the agent periodically.
        (Only available when passing string for the environment)
    :param monitor_wrapper: (bool) When creating an environment, whether to wrap it
        or not in a Monitor wrapper.
    :param seed: (int) Seed for the pseudo random generators
    """

    def __init__(self, policy, env, policy_base, policy_kwargs=None,
                 verbose=0, device='auto', support_multi_env=False,
                 create_eval_env=False, monitor_wrapper=True, seed=None):
        if isinstance(policy, str) and policy_base is not None:
            self.policy_class = get_policy_from_name(policy_base, policy)
        else:
            self.policy_class = policy

        self.env = env
        # get VecNormalize object if needed
        self._vec_normalize_env = unwrap_vec_normalize(env)
        self.verbose = verbose
        self.policy_kwargs = {} if policy_kwargs is None else policy_kwargs
        self.observation_space = None
        self.action_space = None
        self.n_envs = None
        self.num_timesteps = 0
        self.eval_env = None
        self.replay_buffer = None
        self.seed = seed
        self.action_noise = None
        # Track the training progress (from 1 to 0)
        # this is used to update the learning rate
        self._current_progress = 1

        # Create and wrap the env if needed
        if env is not None:
            if isinstance(env, str):
                if create_eval_env:
                    eval_env = gym.make(env)
                    if monitor_wrapper:
                        eval_env = Monitor(eval_env, filename=None)
                    self.eval_env = DummyVecEnv([lambda: eval_env])
                if self.verbose >= 1:
                    print("Creating environment from the given name, wrapped in a DummyVecEnv.")

                env = gym.make(env)
                if monitor_wrapper:
                    env = Monitor(env, filename=None)
                env = DummyVecEnv([lambda: env])

            self.observation_space = env.observation_space
            self.action_space = env.action_space
            if not isinstance(env, VecEnv):
                if self.verbose >= 1:
                    print("Wrapping the env in a DummyVecEnv.")
                env = DummyVecEnv([lambda: env])

            self.n_envs = env.num_envs
            self.env = env

            if not support_multi_env and self.n_envs > 1:
                raise ValueError("Error: the model does not support multiple envs; it requires"
                                 " a single vectorized environment.")

    def _get_eval_env(self, eval_env):
        """
        Return the environment that will be used for evaluation.

        :param eval_env: (gym.Env or VecEnv)
        :return: (VecEnv)
        """
        if eval_env is None:
            eval_env = self.eval_env

        if eval_env is not None:
            if not isinstance(eval_env, VecEnv):
                eval_env = DummyVecEnv([lambda: eval_env])
            assert eval_env.num_envs == 1
        return eval_env

    def scale_action(self, action):
        """
        Rescale the action from [low, high] to [-1, 1]
        (no need for symmetric action space)

        :param action: (np.ndarray)
        :return: (np.ndarray)
        """
        low, high = self.action_space.low, self.action_space.high
        return 2.0 * ((action - low) / (high - low)) - 1.0

    def unscale_action(self, scaled_action):
        """
        Rescale the action from [-1, 1] to [low, high]
        (no need for symmetric action space)

        :param scaled_action: (np.ndarray)
        :return: (np.ndarray)
        """
        low, high = self.action_space.low, self.action_space.high
        return low + (0.5 * (scaled_action + 1.0) * (high - low))

    def _setup_learning_rate(self):
        """Transform to callable if needed."""
        self.learning_rate = get_schedule_fn(self.learning_rate)

    def _update_current_progress(self, num_timesteps, total_timesteps):
        """
        Compute current progress (from 1 to 0)

        :param num_timesteps: (int) current number of timesteps
        :param total_timesteps: (int)
        """
        self._current_progress = 1.0 - float(num_timesteps) / float(total_timesteps)

    def _update_learning_rate(self, optimizers):
        """
        Update the optimizers learning rate using the current learning rate schedule
        and the current progress (from 1 to 0).

        :param optimizers: ([th.optim.Optimizer] or Optimizer) An optimizer or a list of optimizers.
        """
        # Log the current learning rate
        logger.logkv("learning_rate", self.learning_rate(self._current_progress))
        # if not isinstance(optimizers, list):
        #     optimizers = [optimizers]
        # for optimizer in optimizers:
        #     update_learning_rate(optimizer, self.learning_rate(self._current_progress))

    @staticmethod
    def safe_mean(arr):
        """
        Compute the mean of an array if there is at least one element.
        For an empty array, return nan. It is used for logging only.

        :param arr: (np.ndarray)
        :return: (float)
        """
        return np.nan if len(arr) == 0 else np.mean(arr)

    def get_env(self):
        """
        Returns the current environment (can be None if not defined).

        :return: (gym.Env) The current environment
        """
        return self.env

    def set_env(self, env):
        """
        :param env: (gym.Env) The environment for learning a policy
        """
        raise NotImplementedError()

    @abstractmethod
    def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="run",
              eval_env=None, eval_freq=-1, n_eval_episodes=5, reset_num_timesteps=True):
        """
        Return a trained model.

        :param total_timesteps: (int) The total number of samples to train on
        :param callback: (function (dict, dict) -> boolean) function called at every step
            with the state of the algorithm. It takes the local and global variables.
            If it returns False, training is aborted.
        :param log_interval: (int) The number of timesteps before logging.
        :param tb_log_name: (str) the name of the run for tensorboard log
        :param reset_num_timesteps: (bool) whether or not to reset the current timestep number (used in logging)
        :param eval_env: (gym.Env) Environment that will be used to evaluate the agent
        :param eval_freq: (int) Evaluate the agent every `eval_freq` timesteps (this may vary a little)
        :param n_eval_episodes: (int) Number of episodes to evaluate the agent
        :return: (BaseRLModel) the trained model
        """
        pass

    @abstractmethod
    def predict(self, observation, state=None, mask=None, deterministic=False):
        """
        Get the model's action from an observation

        :param observation: (np.ndarray) the input observation
        :param state: (np.ndarray) The last states (can be None, used in recurrent policies)
        :param mask: (np.ndarray) The last masks (can be None, used in recurrent policies)
        :param deterministic: (bool) Whether or not to return deterministic actions.
        :return: (np.ndarray, np.ndarray) the model's action and the next state
            (used in recurrent policies)
        """
        pass

    def set_random_seed(self, seed=None):
        """
        Set the seed of the pseudo-random generators
        (python, numpy, pytorch, gym, action_space)

        :param seed: (int)
        """
        if seed is None:
            return
        set_random_seed(seed)
        self.action_space.seed(seed)
        if self.env is not None:
            self.env.seed(seed)
        if self.eval_env is not None:
            self.eval_env.seed(seed)

    def _setup_learn(self, eval_env):
        """
        Initialize different variables needed for training.

        :param eval_env: (gym.Env or VecEnv)
        :return: (int, int, [float], np.ndarray, VecEnv)
        """
        self.start_time = time.time()
        self.ep_info_buffer = deque(maxlen=100)
        self.ep_reward_buffer = deque(maxlen=40)

        if self.action_noise is not None:
            self.action_noise.reset()

        timesteps_since_eval, episode_num = 0, 0
        evaluations = []

        if eval_env is not None and self.seed is not None:
            eval_env.seed(self.seed)

        eval_env = self._get_eval_env(eval_env)
        obs = self.env.reset()
        return timesteps_since_eval, episode_num, evaluations, obs, eval_env

    def _update_reward_buffer(self, rewards):
        rewards = np.array(rewards)
        self.ep_reward_buffer.extend(np.mean(rewards, axis=0).tolist())

    def _update_info_buffer(self, infos):
        """
        Retrieve reward and episode length and update the buffer
        if using Monitor wrapper.

        :param infos: ([dict])
        """
        for info in infos:
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                self.ep_info_buffer.extend([maybe_ep_info])

    def _eval_policy(self, eval_freq, eval_env, n_eval_episodes,
                     timesteps_since_eval, deterministic=True):
        """
        Evaluate the current policy on a test environment.

        :param eval_env: (gym.Env) Environment that will be used to evaluate the agent
        :param eval_freq: (int) Evaluate the agent every `eval_freq` timesteps (this may vary a little)
        :param n_eval_episodes: (int) Number of episodes to evaluate the agent
        :param timesteps_since_eval: (int) Number of timesteps since last evaluation
        :param deterministic: (bool) Whether to use deterministic or stochastic actions
        :return: (int) Number of timesteps since last evaluation
        """
        if 0 < eval_freq <= timesteps_since_eval and eval_env is not None:
            timesteps_since_eval %= eval_freq
            # Synchronise the normalization stats if needed
            sync_envs_normalization(self.env, eval_env)
            mean_reward, std_reward = evaluate_policy(self, eval_env, n_eval_episodes,
                                                      deterministic=deterministic)
            if self.verbose > 0:
                print("Eval num_timesteps={}, "
                      "episode_reward={:.2f} +/- {:.2f}".format(self.num_timesteps,
                                                                mean_reward, std_reward))
                print("FPS: {:.2f}".format(self.num_timesteps / (time.time() - self.start_time)))
        return timesteps_since_eval
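# Minimal sketch (illustrative only, not part of the class above) of what a concrete
# algorithm built on BaseRLModel has to provide: implementations of the abstract
# learn() and predict() methods. The RandomAgent below just samples random actions,
# using the helpers defined in the base class (_setup_learn, _update_info_buffer,
# _eval_policy); numpy is assumed to be imported as np.
class RandomAgent(BaseRLModel):
    def __init__(self, policy, env, **kwargs):
        super(RandomAgent, self).__init__(policy, env, policy_base=None, **kwargs)

    def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="run",
              eval_env=None, eval_freq=-1, n_eval_episodes=5, reset_num_timesteps=True):
        timesteps_since_eval, episode_num, evaluations, obs, eval_env = self._setup_learn(eval_env)
        while self.num_timesteps < total_timesteps:
            # sample one random action per (vectorized) environment
            actions = np.stack([self.action_space.sample() for _ in range(self.n_envs)])
            obs, rewards, dones, infos = self.env.step(actions)
            self._update_info_buffer(infos)
            self.num_timesteps += self.n_envs
            timesteps_since_eval += self.n_envs
            # periodically evaluate on the eval env, if one was provided
            timesteps_since_eval = self._eval_policy(eval_freq, eval_env, n_eval_episodes,
                                                     timesteps_since_eval)
        return self

    def predict(self, observation, state=None, mask=None, deterministic=False):
        # no recurrent state; return a random action and None for the next state
        return self.action_space.sample(), None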