Example #1
def f():
    if easy:
        env = gym.make("FetchReachAvoidSBEasy-v1")
    else:
        env = gym.make("FetchReachAvoidSB-v1")
    env.seed(seed)
    if monitored:
        return Monitor(env, None)
    else:
        return env
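When the Monitor filename is None, as in this factory, no log file is written but per-episode statistics are still kept in memory. A minimal sketch of reading them back, using CartPole-v1 as a placeholder for the FetchReach environments (the Monitor wrappers of stable_baselines and stable_baselines3 expose the same accessors):

import gym
from stable_baselines.bench import Monitor

env = Monitor(gym.make("CartPole-v1"), None)  # filename=None: in-memory stats only
obs = env.reset()
done = False
while not done:
    obs, reward, done, info = env.step(env.action_space.sample())

# Episode statistics accumulated by the Monitor wrapper
print(env.get_episode_rewards(), env.get_episode_lengths())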
Example #2
def __init__(self,
             env,
             temperature: float = 1,
             tensorboard_log: Optional[str] = None):
    self.env = RewardWeightWrapper(env, None)
    self.temperature = temperature
    # Monitor allows PPO to log the reward it achieves
    monitored_env = Monitor(self.env, None, allow_early_resets=True)
    self.vec_env = DummyVecEnv([lambda: monitored_env])
    self.tensorboard_log = tensorboard_log
    self._reset_model()
Example #3
def get_single_process_env(model_settings, model_path, ckpt_step):
    task = generate_task(model_settings['benchmarks']['task_generator_id'],
                         **model_settings['task_configs'])
    env = CausalWorld(task=task,
                      **model_settings['world_params'],
                      seed=model_settings['world_seed'])
    env = CurriculumWrapper(
        env,
        intervention_actors=model_settings["intervention_actors"],
        actives=model_settings["actives"])
    if ckpt_step is None:
        prefix = 0
    else:
        prefix = ckpt_step
    monitor_file = os.path.join(model_path, str(prefix))
    env = Monitor(env,
                  filename=monitor_file,
                  info_keywords=('fractional_success', ))

    return env
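The info_keywords argument makes the Monitor copy info['fractional_success'] into each episode record, next to the standard reward (r), length (l) and time (t) columns, in a file named <prefix>.monitor.csv. A sketch of reading that file back (pandas is an assumption here; the first line of a monitor file is a JSON header and must be skipped):

import os
import pandas as pd

# Monitor appends ".monitor.csv" to the filename prefix used above
log_file = os.path.join(model_path, str(prefix)) + ".monitor.csv"
df = pd.read_csv(log_file, skiprows=1)  # skip the '#{"t_start": ...}' header line
print(df[["r", "l", "fractional_success"]].describe())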
Example #4
class CustomPolicy(FeedForwardPolicy):
    # Custom MLP policy in the standard stable-baselines FeedForwardPolicy
    # pattern (the class name here is illustrative)
    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch,
                 reuse=False, **_kwargs):
        super(CustomPolicy, self).__init__(sess, ob_space,
                                           ac_space,
                                           n_env,
                                           n_steps,
                                           n_batch,
                                           reuse,
                                           feature_extraction="mlp",
                                           **_kwargs)


device = torch.device("cuda")

#env = gym.make('CartPole-v1')
log_dir = "/home/mason/perls2/projects/rl_policy_env/policy_log/"
env = RLPolicyEnv('projects/rl_policy_env/rl_policy.yaml', False,
                  "TemplateEnv")
env = Monitor(env, log_dir)

timestep_count = 2000 * 101
#policy = FeedForwardPolicy(net_arch=[128, 128])
model = TRPO(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=timestep_count)
#model.save("trpo_cartpole")

#del model # remove to demonstrate saving and loading

#model = TRPO.load("trpo_cartpole")

ep_rewards = np.array(env.episode_rewards)
ep_lengths = np.array(env.episode_lengths)
ep_mean_rewards = ep_rewards / ep_lengths
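ep_mean_rewards above is the mean reward per step of each episode; a quick sketch of plotting it (matplotlib is an assumption, and the output file name is arbitrary):

import matplotlib.pyplot as plt

plt.plot(ep_mean_rewards)
plt.xlabel("episode")
plt.ylabel("mean reward per step")
plt.title("TRPO on RLPolicyEnv")
plt.savefig(log_dir + "mean_rewards.png")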
Example #5
def second_params(thislevel):
    env = gym.make('zhedLevel' + str(thislevel) + '-v0')
    env = Monitor(env, 'models/PPO2/logs/logSecond_' + str(thislevel))
    model = PPO2(MlpPolicy, env, cliprange=0.3, verbose=1)
    model.learn(total_timesteps=total_timesteps, log_interval=1)
    model.save('models/PPO2/ppo2_Slv' + str(thislevel))
Example #6
def original_params(thislevel):
    env = gym.make('zhedLevel' + str(thislevel) + '-v0')
    env = Monitor(env, 'models/PPO2/logs/logOriginal_' + str(thislevel))
    model = PPO2(MlpPolicy, env, cliprange=0.1, verbose=1)  # default cliprange is 0.2
    model.learn(total_timesteps=total_timesteps, log_interval=1)
    model.save('models/PPO2/ppo2_Olv' + str(thislevel))
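Because each run writes its own monitor file under models/PPO2/logs/, the two clip-range settings can be compared after training. A sketch that reads the files back with pandas (the helper name and the level value are illustrative):

import pandas as pd

def mean_episode_reward(prefix):
    # Monitor appends ".monitor.csv" to the filename prefix
    df = pd.read_csv(prefix + ".monitor.csv", skiprows=1)
    return df["r"].mean()

level = 1
print("cliprange=0.1:", mean_episode_reward("models/PPO2/logs/logOriginal_" + str(level)))
print("cliprange=0.3:", mean_episode_reward("models/PPO2/logs/logSecond_" + str(level)))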
Example #7
# Set the interval length and the number of intervals
interval = 8760
icount = int(sys.argv[1]) if len(sys.argv) > 1 else 10
log_interval = 1
check_interval = 1
save_interval = 1

# the noise objects for DDPG
_, actions_spaces = env.get_state_action_spaces()

n_actions = 0
for action in actions_spaces:
    n_actions += action.shape[-1]

# Wrap the env in a Monitor so episode stats are logged to log_dir
env = Monitor(env, filename=log_dir)
callbackBest = SaveOnBestTrainingRewardCallback2_10(
    check_freq=check_interval * interval,
    log_dir=log_dir,
    save_freq=interval * save_interval)

# Add callbacks to the callback list
callbackList = []
useBestCallback = True

if useBestCallback:
    callbackList.append(callbackBest)

# Algo setup
param_noise = None
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
Example #8
from stable_baselines.common.env_checker import check_env

from simulation.RL_env import SimpleSat
from simulation.Simulation import SatelliteSim

Sim = SatelliteSim()
time_step = Sim.PERIOD / Sim.CIRCUNFERENCE
env = SimpleSat(Sim, time_step)

# It will check your custom environment and output additional warnings if needed
check_env(env)
env.close()

from stable_baselines import PPO2 as agent
from stable_baselines.common.evaluation import evaluate_policy
#from stable_baselines.deepq.policies import MlpPolicy as policy
from stable_baselines.common.policies import MlpPolicy as policy
from stable_baselines.bench.monitor import Monitor

# Wrap the env in a Monitor before building the agent so training rewards are logged
env = Monitor(env, filename="RL/Log_RL")
model = agent(policy, env, verbose=0)

# Train the agent for 10000 steps
model.learn(total_timesteps=10000)

mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
model.save("RL/Agent")
Example #9
def _init():
    env = gym.make(env_name)
    env = TimeLimit(env, timestep_limit)
    env = Monitor(env, log_folder + 'seed_' + str(seed + rank))
    env.seed(seed + rank)
    return env
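A closure like _init is usually returned by an outer helper and passed to a vectorized environment, so every worker gets its own seed and its own Monitor log. A sketch assuming such a helper, here called make_env (hypothetical name and signature), and stable_baselines' SubprocVecEnv:

from stable_baselines.common.vec_env import SubprocVecEnv

# make_env(...) is assumed to return the _init closure shown above
n_envs = 4
vec_env = SubprocVecEnv([make_env(env_name, rank, seed, timestep_limit, log_folder)
                         for rank in range(n_envs)])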
Example #10
    return args


if __name__ == "__main__":
    args = initialize()
    is_ro = False
    is_adjusted_lr = False
    if args.algorithm == "ro":
        is_ro = True
    if args.algorithm == "ro_adjusted_lr":
        is_ro = True
        is_adjusted_lr = True

    best_mean_reward, n_steps = -np.inf, 0

    env = Monitor(gym.make(args.env), args.log_dir + "monitor_train/", allow_early_resets=True)
    # env = VecNormalize(env, norm_obs=True, norm_reward=False, clip_obs=10.)
    env.seed(args.seed)
    test_env = Monitor(gym.make(args.env), args.log_dir + "monitor_eval/", allow_early_resets=True)
    # test_env = VecNormalize(test_env, norm_obs=True, norm_reward=False, clip_obs=10.)
    test_env.seed(args.seed)

    noise_std = 0.1
    n_actions = env.action_space.shape[0]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions))
    model = OurDDPG(MlpPolicy,
                    env,
                    seed=args.seed,
                    verbose=2,
                    normalize_observations=False,
                    action_noise=action_noise,