def f():
    # Pick the easy or standard variant of the reach-avoid task
    if easy:
        env = gym.make("FetchReachAvoidSBEasy-v1")
    else:
        env = gym.make("FetchReachAvoidSB-v1")
    env.seed(seed)
    if monitored:
        # filename=None keeps Monitor's episode stats in memory only
        return Monitor(env, None)
    else:
        return env
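# Usage sketch (an assumption, not from the original snippet): factories like
# f are typically handed to a VecEnv so the environment is built lazily, with
# the closure variables (easy, seed, monitored) already bound.
from stable_baselines.common.vec_env import DummyVecEnv

vec_env = DummyVecEnv([f])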
def __init__(self, env, temperature: float = 1,
             tensorboard_log: Optional[str] = None):
    self.env = RewardWeightWrapper(env, None)
    self.temperature = temperature
    # Monitor allows PPO to log the reward it achieves
    monitored_env = Monitor(self.env, None, allow_early_resets=True)
    self.vec_env = DummyVecEnv([lambda: monitored_env])
    self.tensorboard_log = tensorboard_log
    self._reset_model()
def get_single_process_env(model_settings, model_path, ckpt_step):
    task = generate_task(model_settings['benchmarks']['task_generator_id'],
                         **model_settings['task_configs'])
    env = CausalWorld(task=task,
                      **model_settings['world_params'],
                      seed=model_settings['world_seed'])
    env = CurriculumWrapper(
        env,
        intervention_actors=model_settings["intervention_actors"],
        actives=model_settings["actives"])
    # Name the monitor file after the checkpoint step (0 for a fresh run)
    prefix = 0 if ckpt_step is None else ckpt_step
    monitor_file = os.path.join(model_path, str(prefix))
    env = Monitor(env,
                  filename=monitor_file,
                  info_keywords=('fractional_success',))
    return env
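# Usage sketch (assumed, not from the original file): build a single monitored
# CausalWorld env for a fresh run, assuming model_settings and model_path are
# already loaded; ckpt_step=None names the monitor file "0".
env = get_single_process_env(model_settings, model_path, ckpt_step=None)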
                 ac_space, n_env, n_steps, n_batch, reuse,
                 feature_extraction="mlp", **_kwargs)

device = torch.device("cuda")
#env = gym.make('CartPole-v1')
log_dir = "/home/mason/perls2/projects/rl_policy_env/policy_log/"
env = RLPolicyEnv('projects/rl_policy_env/rl_policy.yaml', False, "TemplateEnv")
env = Monitor(env, log_dir)
timestep_count = 2000 * 101
#policy = FeedForwardPolicy(net_arch=[128, 128])
model = TRPO(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=timestep_count)
#model.save("trpo_cartpole")
#del model  # remove to demonstrate saving and loading
#model = TRPO.load("trpo_cartpole")

# Per-episode statistics accumulated by the Monitor wrapper
ep_rewards = np.array(env.episode_rewards)
ep_lengths = np.array(env.episode_lengths)
# Average reward per step within each episode
ep_mean_rewards = ep_rewards / ep_lengths
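# The Monitor above also writes a monitor.csv under log_dir; a hedged sketch of
# reading it back with stable_baselines' bundled helper (columns "r" and "l"
# are Monitor's per-episode reward and length):
from stable_baselines.bench.monitor import load_results

df = load_results(log_dir)  # pandas DataFrame, one row per finished episode
print(df["r"].mean(), df["l"].mean())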
def second_params(thislevel):
    env = gym.make('zhedLevel' + str(thislevel) + '-v0')
    env = Monitor(env, 'models/PPO2/logs/logSecond_' + str(thislevel))
    model = PPO2(MlpPolicy, env, cliprange=0.3, verbose=1)
    model.learn(total_timesteps=total_timesteps, log_interval=1)
    model.save('models/PPO2/ppo2_Slv' + str(thislevel))
def original_params(thislevel):
    env = gym.make('zhedLevel' + str(thislevel) + '-v0')
    env = Monitor(env, 'models/PPO2/logs/logOriginal_' + str(thislevel))
    model = PPO2(MlpPolicy, env, cliprange=0.1, verbose=1)  #CP = 0.2
    model.learn(total_timesteps=total_timesteps, log_interval=1)
    model.save('models/PPO2/ppo2_Olv' + str(thislevel))
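# Usage sketch (assumed, not in the original file): a saved per-level agent
# can be reloaded for evaluation with PPO2.load; the level index is made up.
from stable_baselines import PPO2

thislevel = 1  # hypothetical level index
model = PPO2.load('models/PPO2/ppo2_Olv' + str(thislevel))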
# Set the interval and its count (sys.argv is never None; check its length)
interval = 8760
icount = int(sys.argv[1]) if len(sys.argv) > 1 else 10
log_interval = 1
check_interval = 1
save_interval = 1

# The noise objects for DDPG: total action dimension across all action spaces
_, actions_spaces = env.get_state_action_spaces()
n_actions = 0
for action in actions_spaces:
    n_actions += action.shape[-1]

# Wrap in Monitor
env = Monitor(env, filename=log_dir)
callbackBest = SaveOnBestTrainingRewardCallback2_10(
    check_freq=check_interval * interval,
    log_dir=log_dir,
    save_freq=interval * save_interval)

# Add callbacks to the callback list
callbackList = []
useBestCallback = True
if useBestCallback:
    callbackList.append(callbackBest)

# Algo setup
param_noise = None
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
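# SaveOnBestTrainingRewardCallback2_10 is project-specific. Below is a minimal
# sketch of the same idea, modeled on the BaseCallback pattern from the
# stable-baselines docs; the class name and details are assumptions, not the
# project's actual implementation.
import os

import numpy as np
from stable_baselines.common.callbacks import BaseCallback
from stable_baselines.bench.monitor import load_results


class SaveOnBestRewardSketch(BaseCallback):
    def __init__(self, check_freq, log_dir, verbose=0):
        super(SaveOnBestRewardSketch, self).__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.best_mean_reward = -np.inf

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            # Episodes logged so far by the Monitor wrapper
            df = load_results(self.log_dir)
            if len(df) > 0:
                mean_reward = df["r"].tail(100).mean()
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    self.model.save(os.path.join(self.log_dir, "best_model"))
        return True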
from stable_baselines.common.env_checker import check_env
from simulation.RL_env import SimpleSat
from simulation.Simulation import SatelliteSim

Sim = SatelliteSim()
time_step = Sim.PERIOD / Sim.CIRCUNFERENCE
env = SimpleSat(Sim, time_step)
# It will check your custom environment and output additional warnings if needed
check_env(env)

from stable_baselines import PPO2 as agent
from stable_baselines.common.evaluation import evaluate_policy
#from stable_baselines.deepq.policies import MlpPolicy as policy
from stable_baselines.common.policies import MlpPolicy as policy
from stable_baselines.bench.monitor import Monitor

# Wrap in Monitor before building the agent so training episodes are logged
env = Monitor(env, filename="RL/Log_RL")
model = agent(policy, env, verbose=0)

# Train the agent for 10000 steps
model.learn(total_timesteps=10000)
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
model.save("RL/Agent")
env.close()
def _init():
    env = gym.make(env_name)
    env = TimeLimit(env, max_episode_steps=timestep_limit)
    # One monitor file per worker, indexed by its seed
    env = Monitor(env, log_folder + 'seed_' + str(seed + rank))
    env.seed(seed + rank)
    return env
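# Sketch of how per-rank factories like _init are typically consumed (this
# wiring is an assumption; make_env returning _init is hypothetical): one
# closure per worker, each with its own seed and monitor file.
from stable_baselines.common.vec_env import SubprocVecEnv

n_envs = 4
vec_env = SubprocVecEnv([make_env(rank) for rank in range(n_envs)])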
    return args


if __name__ == "__main__":
    args = initialize()
    is_ro = False
    is_adjusted_lr = False
    if args.algorithm == "ro":
        is_ro = True
    if args.algorithm == "ro_adjusted_lr":
        is_ro = True
        is_adjusted_lr = True

    best_mean_reward, n_steps = -np.inf, 0
    env = Monitor(gym.make(args.env), args.log_dir + "monitor_train/",
                  allow_early_resets=True)
    # env = VecNormalize(env, norm_obs=True, norm_reward=False, clip_obs=10.)
    env.seed(args.seed)
    test_env = Monitor(gym.make(args.env), args.log_dir + "monitor_eval/",
                       allow_early_resets=True)
    # test_env = VecNormalize(test_env, norm_obs=True, norm_reward=False, clip_obs=10.)
    test_env.seed(args.seed)

    noise_std = 0.1
    n_actions = env.action_space.shape[0]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=noise_std * np.ones(n_actions))
    model = OurDDPG(MlpPolicy, env, seed=args.seed, verbose=2,
                    normalize_observations=False, action_noise=action_noise,