def make_env(rank, seed=0):
    """
    Utility function for multiprocessed env.

    :param rank: (int) index of the subprocess
    :param seed: (int) the initial seed for RNG
    """
    num_boxes = 1
    alg_version = 0
    dim_room = (7, 7)
    train_mode = 'mlp'
    agent_lb_path = None
    agent_ub_path = None

    def _init():
        env = ALGEnv(dim_room=dim_room, num_boxes=num_boxes, train_mode=train_mode,
                     alg_version=alg_version, agent_lb_path=agent_lb_path,
                     agent_ub_path=agent_ub_path)
        env.seed(seed + rank)
        return env

    set_random_seed(seed)
    return _init
def enjoy(stats_path, model_path, dataset, body_id, algo, n_timesteps=200, test_time=3, render=False, seed=0):
    dataset_name, env_id, train_files, train_params, train_names, test_files, test_params, test_names = load_dataset.load_dataset(
        dataset, seed=0, shuffle=False, train_proportion=1)
    set_random_seed(seed * 128 + 127)
    hyperparams, stats_path = get_saved_hyperparams(stats_path, norm_reward=False, test_mode=True)
    env_kwargs = {
        "xml": train_files[body_id],
        "param": train_params[body_id],
        "max_episode_steps": n_timesteps + 1,
        "render": render,
    }
    env = create_test_env(
        env_id,
        n_envs=1,
        stats_path=stats_path,
        seed=seed,
        log_dir="tmp/",
        should_render=False,
        hyperparams=hyperparams,
        env_kwargs=env_kwargs,
    )
    kwargs = dict(seed=seed)
    model = ALGOS[algo].load(model_path, env=env, **kwargs)

    obs = env.reset()
    state = None
    episode_reward = 0.0
    episode_rewards, episode_lengths = [], []
    ep_len = 0
    body_x_record = []
    for _run in range(test_time):
        body_x = 0
        for _step in range(n_timesteps):
            action, state = model.predict(obs, state=state, deterministic=True)
            if isinstance(env.action_space, gym.spaces.Box):
                action = np.clip(action, env.action_space.low, env.action_space.high)
            body_x = env.envs[0].robot.body_xyz[0]
            obs, reward, done, infos = env.step(action)
            episode_reward += reward[0]
            ep_len += 1
            if render:
                sleep(0.01)
            if done:
                break
        body_x_record.append(body_x)
        obs = env.reset()
    body_x_record = np.array(body_x_record)
    env.close()
    return body_x_record
def make_env(env_id, rank, seed=0, sigma=0.1):
    def _init():
        env = gym.make(env_id, sigma=sigma, r=(rank + 1) / 10)
        env.seed(seed + rank)
        return env

    set_random_seed(seed)
    return _init
def make_env(env_id, rank, seed=0):
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env

    set_random_seed(seed)
    return _init
def set_seed(seed=seed):
    os.environ['PYTHONHASHSEED'] = str(seed)
    _, seed = seeding.np_random(seed)
    random.seed(seed)
    set_random_seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
def test_kl_divergence(dist_type):
    set_random_seed(8)
    # Test 1: same distribution should have KL Div = 0
    dist1 = dist_type
    dist2 = dist_type
    # PyTorch implementation of kl_divergence doesn't sum across dimensions
    assert th.allclose(kl_divergence(dist1, dist2).sum(), th.tensor(0.0))

    # Test 2: KL Div = E(Unbiased approx KL Div)
    if isinstance(dist_type, CategoricalDistribution):
        dist1 = dist_type.proba_distribution(th.rand(N_ACTIONS).repeat(N_SAMPLES, 1))
        # deepcopy needed to assign new memory to new distribution instance
        dist2 = deepcopy(dist_type).proba_distribution(th.rand(N_ACTIONS).repeat(N_SAMPLES, 1))
    elif isinstance(dist_type, (DiagGaussianDistribution, SquashedDiagGaussianDistribution)):
        mean_actions1 = th.rand(1).repeat(N_SAMPLES, 1)
        log_std1 = th.rand(1).repeat(N_SAMPLES, 1)
        mean_actions2 = th.rand(1).repeat(N_SAMPLES, 1)
        log_std2 = th.rand(1).repeat(N_SAMPLES, 1)
        dist1 = dist_type.proba_distribution(mean_actions1, log_std1)
        dist2 = deepcopy(dist_type).proba_distribution(mean_actions2, log_std2)
    elif isinstance(dist_type, BernoulliDistribution):
        dist1 = dist_type.proba_distribution(th.rand(1).repeat(N_SAMPLES, 1))
        dist2 = deepcopy(dist_type).proba_distribution(th.rand(1).repeat(N_SAMPLES, 1))
    elif isinstance(dist_type, MultiCategoricalDistribution):
        dist1 = dist_type.proba_distribution(th.rand(1, sum([N_ACTIONS, N_ACTIONS])).repeat(N_SAMPLES, 1))
        dist2 = deepcopy(dist_type).proba_distribution(th.rand(1, sum([N_ACTIONS, N_ACTIONS])).repeat(N_SAMPLES, 1))
    elif isinstance(dist_type, StateDependentNoiseDistribution):
        dist1 = StateDependentNoiseDistribution(1)
        dist2 = deepcopy(dist1)
        state = th.rand(1, N_FEATURES).repeat(N_SAMPLES, 1)
        mean_actions1 = th.rand(1).repeat(N_SAMPLES, 1)
        mean_actions2 = th.rand(1).repeat(N_SAMPLES, 1)
        _, log_std = dist1.proba_distribution_net(N_FEATURES, log_std_init=th.log(th.tensor(0.2)))
        dist1.sample_weights(log_std, batch_size=N_SAMPLES)
        dist2.sample_weights(log_std, batch_size=N_SAMPLES)
        dist1 = dist1.proba_distribution(mean_actions1, log_std, state)
        dist2 = dist2.proba_distribution(mean_actions2, log_std, state)

    full_kl_div = kl_divergence(dist1, dist2).mean(dim=0)
    actions = dist1.get_actions()
    approx_kl_div = (dist1.log_prob(actions) - dist2.log_prob(actions)).mean(dim=0)
    assert th.allclose(full_kl_div, approx_kl_div, rtol=5e-2)

    # Test 3: sanity test with easy Bernoulli distribution
    if isinstance(dist_type, BernoulliDistribution):
        dist1 = BernoulliDistribution(1).proba_distribution(th.tensor([0.3]))
        dist2 = BernoulliDistribution(1).proba_distribution(th.tensor([0.65]))
        full_kl_div = kl_divergence(dist1, dist2)
        actions = th.tensor([0.0, 1.0])
        ad_hoc_kl = th.sum(
            th.exp(dist1.distribution.log_prob(actions))
            * (dist1.distribution.log_prob(actions) - dist2.distribution.log_prob(actions))
        )
        assert th.allclose(full_kl_div, ad_hoc_kl)
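# Side sketch (not part of the original test file): Test 2 above relies on the unbiased
# sample approximation KL(p || q) ~= E_{a ~ p}[log p(a) - log q(a)]. The same identity can
# be checked directly with torch.distributions; the Normal parameters below are arbitrary
# illustrative values.
import torch as th
from torch.distributions import Normal, kl_divergence as th_kl_divergence

th.manual_seed(0)
p = Normal(th.tensor(0.0), th.tensor(1.0))
q = Normal(th.tensor(0.5), th.tensor(1.5))
samples = p.sample((200_000,))
approx_kl = (p.log_prob(samples) - q.log_prob(samples)).mean()
assert th.allclose(th_kl_divergence(p, q), approx_kl, rtol=5e-2)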
def test_categorical(dist, CAT_ACTIONS):
    # The entropy can be approximated by averaging the negative log likelihood
    # mean negative log likelihood == entropy
    set_random_seed(1)
    action_logits = th.rand(N_SAMPLES, CAT_ACTIONS)
    dist = dist.proba_distribution(action_logits)
    actions = dist.get_actions()
    entropy = dist.entropy()
    log_prob = dist.log_prob(actions)
    assert th.allclose(entropy.mean(), -log_prob.mean(), rtol=5e-3)
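# Side sketch (not part of the original test file): the identity behind the check above,
# H(p) = -E_{a ~ p}[log p(a)], verified directly with torch.distributions.Categorical
# on arbitrary logits.
import torch as th

th.manual_seed(0)
cat = th.distributions.Categorical(logits=th.rand(5))
samples = cat.sample((100_000,))
mc_entropy = -cat.log_prob(samples).mean()
assert th.allclose(cat.entropy(), mc_entropy, rtol=5e-2)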
def make_env(rank, sparse=False, seed=0):
    """
    Utility function for multiprocessed env.
    """
    def _init():
        env = MazeGridEnv()
        if sparse:
            env = SparseRewardWrapper(env)
        env.seed(seed + rank)
        return env

    set_random_seed(seed)
    return _init
def set_random_seed(self, seed: Optional[int] = None) -> None:
    """
    Set the seed of the pseudo-random generators
    (python, numpy, pytorch, gym, action_space)

    :param seed:
    """
    if seed is None:
        return
    set_random_seed(seed, using_cuda=self.device.type == th.device("cuda").type)
    self.attacker_action_space.seed(seed)
    if self.env is not None:
        self.env.seed(seed)
def run(params):
    device, use_cuda = helper.get_pytorch_device()
    sb3_utils.set_random_seed(params.seed, using_cuda=use_cuda)
    env = helper.make_env(params)
    model = network.get_model_class(params)(env).to(device)
    optimizer = optim.Adam(model.parameters(), lr=params.learning_rate)

    ep_no = 0
    total_rewards = []
    while ep_no < params.num_episodes:
        ep_no += 1
        state_vals, log_probs = [], []

        # unroll the policy
        state = env.reset()
        rewards = []
        # for each episode, only run 10_000 steps so that we don't
        # infinite loop while learning
        for t in range(10_000):
            state = torch.from_numpy(state).float().unsqueeze(0).to(device)
            probs, state_value = model(state)
            c = Categorical(probs)
            action = c.sample()
            log_probs.append(c.log_prob(action))
            state_vals.append(state_value)
            state, reward, done, info = env.step(action.item())
            rewards.append(reward)
            if done:
                break

        total_rewards.append(sum(rewards))
        reinforce_helper.log_results(ep_no, total_rewards, info, t, params)
        returns = reinforce_helper.discount_rewards(rewards, params)

        # backprop
        state_vals = torch.stack(state_vals).squeeze()
        returns = returns.to(device)
        policy_loss = (-torch.stack(log_probs).squeeze(1) * (returns - state_vals).detach()).mean()
        baseline_loss = F.smooth_l1_loss(state_vals, returns, reduction='mean')

        # reset gradients
        optimizer.zero_grad()
        loss = policy_loss + params.scaling_factor * baseline_loss
        loss.backward()
        optimizer.step()
def test_sde_distribution():
    n_actions = 1
    deterministic_actions = th.ones(N_SAMPLES, n_actions) * 0.1
    state = th.ones(N_SAMPLES, N_FEATURES) * 0.3
    dist = StateDependentNoiseDistribution(n_actions, full_std=True, squash_output=False)

    set_random_seed(1)
    _, log_std = dist.proba_distribution_net(N_FEATURES)
    dist.sample_weights(log_std, batch_size=N_SAMPLES)

    dist = dist.proba_distribution(deterministic_actions, log_std, state)
    actions = dist.get_actions()

    assert th.allclose(actions.mean(), dist.distribution.mean.mean(), rtol=2e-3)
    assert th.allclose(actions.std(), dist.distribution.scale.mean(), rtol=2e-3)
def make_env(env_id, rank, seed=0):
    """
    Utility function for multiprocessed env.

    :param env_id: (str) the environment ID
    :param rank: (int) index of the subprocess
    :param seed: (int) the initial seed for RNG
    """
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env

    set_random_seed(seed)
    return _init
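# Usage sketch (not part of the original snippet): the factory above is typically handed
# to a vectorized env such as SubprocVecEnv, one callable per worker, so each subprocess
# is seeded with a distinct `seed + rank`. The env id and worker count are illustrative.
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import SubprocVecEnv

if __name__ == "__main__":
    num_cpu = 4  # number of parallel worker processes
    vec_env = SubprocVecEnv([make_env("CartPole-v1", rank=i, seed=0) for i in range(num_cpu)])
    model = PPO("MlpPolicy", vec_env, verbose=1)
    model.learn(total_timesteps=10_000)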
def make_env(env_param, rank, seed=0):
    """
    Utility function for multiprocessed env.

    :param env_param: (dict) the environment params
    :param seed: (int) the initial seed for RNG
    :param rank: (int) index of the subprocess
    """
    def _init():
        e = env_change_input(**env_param)
        e.seed(seed + rank)
        return e

    set_random_seed(seed)
    return _init
def make_env(env: gym.Env, rank: int, seed: int = 0) -> Callable:
    """
    Utility function for multiprocessed env.

    :param env: (gym.Env) the environment instance to seed and return
    :param seed: (int) the initial seed for RNG
    :param rank: (int) index of the subprocess
    :return: (Callable)
    """
    def _init() -> gym.Env:
        env.seed(seed + rank)
        return env

    set_random_seed(seed)
    return _init
def make_gym_env(env_id, rank, seed=0):
    """
    Utility function for multiprocessed env.

    :param env_id: (str) the environment ID
    :param seed: (int) the initial seed for RNG
    :param rank: (int) index of the subprocess
    """
    def _init():
        env = gym.make(env_id, reward_type="dense")
        env = gym.wrappers.FlattenObservation(env)
        env = Monitor(env)
        env.seed(seed + rank)
        return env

    set_random_seed(seed)
    return _init
def make_training_env(env_id, options, rank, seed=0):
    """
    Utility function for multiprocessed env.

    :param env_id: (str) the environment ID
    :param options: (dict) additional arguments to pass to the specific environment class initializer
    :param seed: (int) the initial seed for RNG
    :param rank: (int) index of the subprocess
    """
    def _init():
        register_gripper(UltrasoundProbeGripper)
        env = GymWrapper(suite.make(env_id, **options))
        env = Monitor(env)
        env.seed(seed + rank)
        return env

    set_random_seed(seed)
    return _init
def make_env(env_id: str, rank: int, seed: int = 1, log_dir=None) -> Callable:
    '''
    Utility function for multiprocessed env.

    :param env_id: (str) the environment ID
    :param seed: (int) the initial seed for RNG
    :param rank: (int) index of the subprocess
    :param log_dir: (str) directory for the Monitor logs
    :return: (Callable)
    '''
    def _init() -> gym.Env:
        env = gym.make(env_id)
        env = Monitor(env, log_dir)
        env.seed(seed + rank)
        return env

    set_random_seed(seed)
    return _init
def test_entropy(dist):
    # The entropy can be approximated by averaging the negative log likelihood
    # mean negative log likelihood == differential entropy
    set_random_seed(1)
    state = th.rand(N_SAMPLES, N_FEATURES)
    deterministic_actions = th.rand(N_SAMPLES, N_ACTIONS)
    _, log_std = dist.proba_distribution_net(N_FEATURES, log_std_init=th.log(th.tensor(0.2)))

    if isinstance(dist, DiagGaussianDistribution):
        dist = dist.proba_distribution(deterministic_actions, log_std)
    else:
        dist.sample_weights(log_std, batch_size=N_SAMPLES)
        dist = dist.proba_distribution(deterministic_actions, log_std, state)

    actions = dist.get_actions()
    entropy = dist.entropy()
    log_prob = dist.log_prob(actions)
    assert th.allclose(entropy.mean(), -log_prob.mean(), rtol=5e-3)
def _init():
    set_random_seed(seed + rank)
    env = gym.make(env_id, **env_kwargs)
    # Wrap first with a monitor (e.g. for Atari env where reward clipping is used)
    log_file = os.path.join(log_dir, str(rank)) if log_dir is not None else None
    # Monitor success rate too for the real robot
    info_keywords = ('is_success',) if 'NeckEnv' in env_id else ()
    env = Monitor(env, log_file, info_keywords=info_keywords)
    # Dict observation space is currently not supported.
    # https://github.com/hill-a/stable-baselines/issues/321
    # We allow a Gym env wrapper (a subclass of gym.Wrapper)
    if wrapper_class:
        env = wrapper_class(env)
    env.seed(seed + rank)
    return env
def run(experiment: Experiment, params: argparse.Namespace):
    sb3_utils.set_random_seed(params.seed, using_cuda=use_cuda)
    env = helper.make_env(params, 'env')
    # Logs will be saved in log_dir/monitor.csv
    env = Monitor(env)

    with experiment.train():
        callback = SaveOnBestTrainingRewardCallback(experiment, check_freq=1000)
        # Deactivate all the DQN extensions to have the original version
        # In practice, it is recommended to have them activated
        model = DQN(CnnPolicy,
                    env,
                    learning_rate=params.learning_rate,
                    gamma=params.gamma,
                    seed=params.seed,
                    max_grad_norm=params.max_grad_norm,
                    verbose=1,
                    device=device,
                    policy_kwargs={'features_extractor_class': ColoringCNN})
        model.learn(total_timesteps=params.max_ts, callback=callback)
def make_env(env_id, rank, seed=0):
    """
    Utility function for multiprocessed env.

    :param env_id: (str) the environment ID
    :param seed: (int) the initial seed for RNG
    :param rank: (int) index of the subprocess
    """
    def _init():
        if rank == 1:
            env = gym.make("fishing-v2", C=0.2)
        elif rank == 2:
            env = gym.make("fishing-v2", C=0.1)
        elif rank == 3:
            env = gym.make("fishing-v1")
        elif rank == 0:
            env = gym.make("fishing-v1", r=0.1)
        env.seed(seed)
        return env

    set_random_seed(seed)
    return _init
def make_env(rank, seed=0):
    """
    Utility function for multiprocessed env.

    :param rank: (int) index of the subprocess
    :param seed: (int) the initial seed for RNG
    """
    num_boxes = 1
    dim_room = (7, 7)
    train_mode = 'mlp'
    max_steps = 20

    def _init():
        env = SokobanEnv(dim_room=dim_room, max_steps=max_steps, num_boxes=num_boxes, train_mode=train_mode)
        env.seed(seed + rank)
        return env

    set_random_seed(seed)
    return _init
if env_id not in registered_envs:
    try:
        closest_match = difflib.get_close_matches(env_id, registered_envs, n=1)[0]
    except IndexError:
        closest_match = "'no close match found...'"
    raise ValueError(
        f"{env_id} not found in gym registry, you maybe meant {closest_match}?"
    )

# Unique id to ensure there is no race condition for the folder creation
uuid_str = f"_{uuid.uuid4()}" if args.uuid else ""
if args.seed < 0:
    # Seed but with a random one
    args.seed = np.random.randint(2 ** 32 - 1, dtype="int64").item()

set_random_seed(args.seed)

# Setting num threads to 1 makes things run faster on cpu
if args.num_threads > 0:
    if args.verbose > 1:
        print(f"Setting torch.num_threads to {args.num_threads}")
    th.set_num_threads(args.num_threads)

if args.trained_agent != "":
    assert args.trained_agent.endswith(".zip") and os.path.isfile(
        args.trained_agent
    ), "The trained_agent must be a valid path to a .zip file"

print("=" * 10, env_id, "=" * 10)
print(f"Seed: {args.seed}")
from stable_baselines3.common.vec_env.vec_frame_stack import VecFrameStack
from stable_baselines3.common.callbacks import CheckpointCallback

import common.common as common
import common.wrapper as wrapper
import common.gym_interface as gym_interface
import common.callbacks as callbacks
from common.activation_fn import MyThreshold

if __name__ == "__main__":
    args = common.args
    print(args)

    # SAC.learn needs this. If you use SubprocVecEnv instead of DummyVecEnv,
    # you need to seed in each subprocess.
    set_random_seed(common.seed)

    saved_model_filename = common.build_model_filename(args)

    hyperparams = common.load_hyperparameters(conf_name="PPO")
    print(hyperparams)

    # Make every env have the same obs space and action space
    default_wrapper = []
    # if padding zero:
    # default_wrapper.append(wrapper.WalkerWrapper)
    if args.realign_method != "":
        default_wrapper.append(wrapper.ReAlignedWrapper)

    assert len(args.train_bodies) > 0, "No body to train."
    try:
        closest_match = difflib.get_close_matches(env_id, registered_envs, n=1)[0]
    except IndexError:
        closest_match = "'no close match found...'"
    raise ValueError(
        f"{env_id} not found in gym registry, you maybe meant {closest_match}?"
    )

# Unique id to ensure there is no race condition for the folder creation
uuid_str = f"_{uuid.uuid4()}" if args.uuid else ""
if args.seed < 0:
    # Seed but with a random one
    args.seed = np.random.randint(2**32 - 1, dtype="int64").item()

set_random_seed(args.seed, True)

# Setting num threads to 1 makes things run faster on cpu
if args.num_threads > 0:
    if args.verbose > 1:
        print(f"Setting torch.num_threads to {args.num_threads}")
    th.set_num_threads(args.num_threads)

if args.trained_agent != "":
    assert args.trained_agent.endswith(".zip") and os.path.isfile(
        args.trained_agent
    ), "The trained_agent must be a valid path to a .zip file"

print("=" * 10, env_id, "=" * 10)
print(f"Seed: {args.seed}")
def main(): # noqa: C901 parser = argparse.ArgumentParser() parser.add_argument("--env", help="environment ID", type=str, default="CartPole-v1") parser.add_argument("-f", "--folder", help="Log folder", type=str, default="rl-trained-agents") parser.add_argument("--algo", help="RL Algorithm", default="ppo", type=str, required=False, choices=list(ALGOS.keys())) parser.add_argument("-n", "--n-timesteps", help="number of timesteps", default=1000, type=int) parser.add_argument( "--num-threads", help="Number of threads for PyTorch (-1 to use default)", default=-1, type=int) parser.add_argument("--n-envs", help="number of environments", default=1, type=int) parser.add_argument( "--exp-id", help="Experiment ID (default: 0: latest, -1: no exp folder)", default=0, type=int) parser.add_argument("--verbose", help="Verbose mode (0: no output, 1: INFO)", default=1, type=int) parser.add_argument( "--no-render", action="store_true", default=False, help="Do not render the environment (useful for tests)") parser.add_argument("--deterministic", action="store_true", default=False, help="Use deterministic actions") parser.add_argument( "--load-best", action="store_true", default=False, help="Load best model instead of last model if available") parser.add_argument( "--load-checkpoint", type=int, help="Load checkpoint instead of last model if available, " "you must pass the number of timesteps corresponding to it", ) parser.add_argument("--stochastic", action="store_true", default=False, help="Use stochastic actions") parser.add_argument( "--norm-reward", action="store_true", default=False, help="Normalize reward if applicable (trained with VecNormalize)") parser.add_argument("--seed", help="Random generator seed", type=int, default=0) parser.add_argument("--reward-log", help="Where to log reward", default="", type=str) parser.add_argument( "--gym-packages", type=str, nargs="+", default=[], help= "Additional external Gym environment package modules to import (e.g. 
gym_minigrid)", ) parser.add_argument( "--env-kwargs", type=str, nargs="+", action=StoreDict, help="Optional keyword argument to pass to the env constructor") args = parser.parse_args() # Going through custom gym packages to let them register in the global registory for env_module in args.gym_packages: importlib.import_module(env_module) env_id = args.env algo = args.algo folder = args.folder if args.exp_id == 0: args.exp_id = get_latest_run_id(os.path.join(folder, algo), env_id) print(f"Loading latest experiment, id={args.exp_id}") # Sanity checks if args.exp_id > 0: log_path = os.path.join(folder, algo, f"{env_id}_{args.exp_id}") else: log_path = os.path.join(folder, algo) assert os.path.isdir(log_path), f"The {log_path} folder was not found" found = False for ext in ["zip"]: model_path = os.path.join(log_path, f"{env_id}.{ext}") found = os.path.isfile(model_path) if found: break if args.load_best: model_path = os.path.join(log_path, "best_model.zip") found = os.path.isfile(model_path) if args.load_checkpoint is not None: model_path = os.path.join( log_path, f"rl_model_{args.load_checkpoint}_steps.zip") found = os.path.isfile(model_path) if not found: raise ValueError( f"No model found for {algo} on {env_id}, path: {model_path}") off_policy_algos = ["qrdqn", "dqn", "ddpg", "sac", "her", "td3", "tqc"] if algo in off_policy_algos: args.n_envs = 1 set_random_seed(args.seed) if args.num_threads > 0: if args.verbose > 1: print(f"Setting torch.num_threads to {args.num_threads}") th.set_num_threads(args.num_threads) is_atari = ExperimentManager.is_atari(env_id) stats_path = os.path.join(log_path, env_id) hyperparams, stats_path = get_saved_hyperparams( stats_path, norm_reward=args.norm_reward, test_mode=True) # load env_kwargs if existing env_kwargs = {} args_path = os.path.join(log_path, env_id, "args.yml") if os.path.isfile(args_path): with open(args_path, "r") as f: loaded_args = yaml.load(f, Loader=yaml.UnsafeLoader) # pytype: disable=module-attr if loaded_args["env_kwargs"] is not None: env_kwargs = loaded_args["env_kwargs"] # overwrite with command line arguments if args.env_kwargs is not None: env_kwargs.update(args.env_kwargs) log_dir = args.reward_log if args.reward_log != "" else None print(env_kwargs) env = create_test_env( env_id, n_envs=args.n_envs, stats_path=stats_path, seed=args.seed, log_dir=log_dir, should_render=not args.no_render, hyperparams=hyperparams, env_kwargs=env_kwargs, ) kwargs = dict(seed=args.seed) if algo in off_policy_algos: # Dummy buffer size as we don't need memory to enjoy the trained agent kwargs.update(dict(buffer_size=1)) # Check if we are running python 3.8+ # we need to patch saved model under python 3.6/3.7 to load them newer_python_version = sys.version_info.major == 3 and sys.version_info.minor >= 8 custom_objects = {} if newer_python_version: custom_objects = { "learning_rate": 0.0, "lr_schedule": lambda _: 0.0, "clip_range": lambda _: 0.0, } model = ALGOS[algo].load(model_path, env=env, custom_objects=custom_objects, **kwargs) obs = env.reset() # Deterministic by default except for atari games stochastic = args.stochastic or is_atari and not args.deterministic deterministic = not stochastic state = None episode_reward = 0.0 episode_rewards, episode_lengths = [], [] ep_len = 0 # For HER, monitor success rate successes = [] plt.figure(f"Enjoy {env_id}") plt.title(f"{env_id}", fontsize=14) plt.xlabel(f"Timesteps", fontsize=14) # plt.ylabel("Score", fontsize=14) observations = [] rewards = [] infos = [] try: for _ in range(args.n_timesteps): action, 
state = model.predict(obs, state=state, deterministic=deterministic) obs, reward, done, info = env.step(action) if not args.no_render: env.render("human") episode_reward += reward[0] ep_len += 1 observations.append(obs) rewards.append(reward) infos.append(info[0].get("coating")) if args.n_envs == 1: # For atari the return reward is not the atari score # so we have to get it from the infos dict if is_atari and infos is not None and args.verbose >= 1: episode_infos = infos[0].get("episode") if episode_infos is not None: print(f"Atari Episode Score: {episode_infos['r']:.2f}") print("Atari Episode Length", episode_infos["l"]) if done and not is_atari and args.verbose > 0: # NOTE: for env using VecNormalize, the mean reward # is a normalized reward when `--norm_reward` flag is passed print(f"Episode Reward: {episode_reward:.2f}") print("Episode Length", ep_len) episode_rewards.append(episode_reward) episode_lengths.append(ep_len) episode_reward = 0.0 ep_len = 0 state = None # Reset also when the goal is achieved when using HER if done and infos[0].get("is_success") is not None: if args.verbose > 1: print("Success?", infos[0].get("is_success", False)) if infos[0].get("is_success") is not None: successes.append(infos[0].get("is_success", False)) episode_reward, ep_len = 0.0, 0 except KeyboardInterrupt: pass if args.verbose > 0 and len(successes) > 0: print(f"Success rate: {100 * np.mean(successes):.2f}%") if args.verbose > 0 and len(episode_rewards) > 0: print(f"{len(episode_rewards)} Episodes") print( f"Mean reward: {np.mean(episode_rewards):.2f} +/- {np.std(episode_rewards):.2f}" ) if args.verbose > 0 and len(episode_lengths) > 0: print( f"Mean episode length: {np.mean(episode_lengths):.2f} +/- {np.std(episode_lengths):.2f}" ) env.close() gesamt = 0 gesamt_mit = 0 for el in rewards: if (el > 0): gesamt += el gesamt_mit += el print(f"Gesamt reward: {gesamt}") print(f"Gesamt reward mit: {gesamt_mit}") plt.plot(np.arange(len(observations)), rewards, label="reward", linewidth=1) plt.plot(np.arange(len(observations)), [obs[0][3] * 202 + 8 for obs in observations], label="coating_dist", linewidth=1) plt.plot(np.arange(len(observations)), [obs[0][1] * 202 + 8 for obs in observations], label="coating_targets", linewidth=1) plt.plot(np.arange(len(observations)), infos, label="coating_real", linewidth=1) plt.plot(np.arange(len(observations)), [obs[0][4] * 700 for obs in observations], label="pressure", linewidth=1) plt.legend() plt.show()
def main(args=None):
    # Check if the selected environment is valid
    # If it could not be found, suggest the closest match
    registered_envs = set(gym.envs.registry.env_specs.keys())
    if args.env not in registered_envs:
        try:
            closest_match = difflib.get_close_matches(args.env, registered_envs, n=1)[0]
        except IndexError:
            closest_match = "'no close match found...'"
        raise ValueError(
            f"{args.env} not found in gym registry, you maybe meant {closest_match}?"
        )

    # If no specific seed is selected, choose a random one
    if args.seed < 0:
        args.seed = np.random.randint(2**32 - 1, dtype="int64").item()

    # Set the random seed across platforms
    set_random_seed(args.seed)

    # Setting num threads to 1 makes things run faster on cpu
    if args.num_threads > 0:
        if args.verbose > 1:
            print(f"Setting torch.num_threads to {args.num_threads}")
        th.set_num_threads(args.num_threads)

    # Verify that pre-trained agent exists before continuing to train it
    if args.trained_agent != "":
        assert args.trained_agent.endswith(".zip") and os.path.isfile(
            args.trained_agent
        ), "The trained_agent must be a valid path to a .zip file"

    # If enabled, ensure that the run has a unique ID
    uuid_str = f"_{uuid.uuid4()}" if args.uuid else ""

    print("=" * 10, args.env, "=" * 10)
    print(f"Seed: {args.seed}")

    exp_manager = ExperimentManager(
        args,
        args.algo,
        args.env,
        args.log_folder,
        args.tensorboard_log,
        args.n_timesteps,
        args.eval_freq,
        args.eval_episodes,
        args.save_freq,
        args.hyperparams,
        args.env_kwargs,
        args.trained_agent,
        args.optimize_hyperparameters,
        args.storage,
        args.study_name,
        args.n_trials,
        args.n_jobs,
        args.sampler,
        args.pruner,
        n_startup_trials=args.n_startup_trials,
        n_evaluations=args.n_evaluations,
        truncate_last_trajectory=args.truncate_last_trajectory,
        uuid_str=uuid_str,
        seed=args.seed,
        log_interval=args.log_interval,
        save_replay_buffer=args.save_replay_buffer,
        preload_replay_buffer=args.preload_replay_buffer,
        verbose=args.verbose,
        vec_env_type=args.vec_env,
    )

    # Prepare experiment and launch hyperparameter optimization if needed
    model = exp_manager.setup_experiment()

    if args.optimize_hyperparameters:
        exp_manager.hyperparameters_optimization()
    else:
        exp_manager.learn(model)
        exp_manager.save_trained_model(model)
def main(): # noqa: C901 parser = argparse.ArgumentParser() parser.add_argument("--env", help="environment ID", type=str, default="Walker2DBulletEnv-v0") parser.add_argument("--algo", help="RL Algorithm", default="ppo", type=str, required=False, choices=list(ALGOS.keys())) parser.add_argument("-n", "--n-timesteps", help="number of timesteps", default=1000, type=int) parser.add_argument( "--num-threads", help="Number of threads for PyTorch (-1 to use default)", default=-1, type=int) parser.add_argument("--n-envs", help="number of environments", default=1, type=int) parser.add_argument( "--exp-id", help="Experiment ID (default: 0: latest, -1: no exp folder)", default=0, type=int) parser.add_argument("--verbose", help="Verbose mode (0: no output, 1: INFO)", default=1, type=int) parser.add_argument( "--no-render", action="store_true", default=False, help="Do not render the environment (useful for tests)") parser.add_argument("--deterministic", action="store_true", default=True, help="Use deterministic actions") parser.add_argument( "--load-best", action="store_true", default=False, help="Load best model instead of last model if available") parser.add_argument("--stochastic", action="store_true", default=False, help="Use stochastic actions (for DDPG/DQN/SAC)") parser.add_argument( "--norm-reward", action="store_true", default=False, help="Normalize reward if applicable (trained with VecNormalize)") parser.add_argument("--seed", help="Random generator seed", type=int, default=0) parser.add_argument("--reward-log", help="Where to log reward", default="", type=str) parser.add_argument( "--gym-packages", type=str, nargs="+", default=[], help= "Additional external Gym environemnt package modules to import (e.g. gym_minigrid)", ) parser.add_argument( "--env-kwargs", type=str, nargs="+", action=StoreDict, help="Optional keyword argument to pass to the env constructor") # === # parser.add_argument("--load-checkpoint", type=str, help="pass the path of zip file corresponding to it") parser.add_argument("-f", "--folder", help="Log folder", type=str, default="rl-trained-agents") parser.add_argument("--dataset", type=str, default="dataset/walker2d_v6") parser.add_argument("--body-id", type=int, default=0) args = parser.parse_args() dataset_name, env_id, train_files, train_params, train_names, test_files, test_params, test_names = load_dataset.load_dataset( args.dataset, seed=0, shuffle=False, train_proportion=1) # Going through custom gym packages to let them register in the global registory for env_module in args.gym_packages: importlib.import_module(env_module) # env_id = args.env algo = args.algo log_path = args.folder # if args.exp_id == 0: # args.exp_id = get_latest_run_id(os.path.join(folder, algo), env_id) # print(f"Loading latest experiment, id={args.exp_id}") # # Sanity checks # if args.exp_id > 0: # log_path = os.path.join(folder, algo, f"{env_id}_{args.exp_id}") # else: # log_path = os.path.join(folder, algo) # assert os.path.isdir(log_path), f"The {log_path} folder was not found" # found = False # for ext in ["zip"]: # model_path = os.path.join(log_path, f"{env_id}.{ext}") # found = os.path.isfile(model_path) # if found: # break # if args.load_best: # model_path = os.path.join(log_path, "best_model.zip") # found = os.path.isfile(model_path) # if args.load_checkpoint is not None: # model_path = os.path.join(log_path, f"rl_model_{args.load_checkpoint}_steps.zip") # found = os.path.isfile(model_path) # if not found: # raise ValueError(f"No model found for {algo} on {env_id}, path: {model_path}") 
model_path = args.load_checkpoint if algo in ["dqn", "ddpg", "sac", "td3", "tqc"]: args.n_envs = 1 set_random_seed(args.seed) if args.num_threads > 0: if args.verbose > 1: print(f"Setting torch.num_threads to {args.num_threads}") th.set_num_threads(args.num_threads) is_atari = "NoFrameskip" in env_id stats_path = os.path.join(log_path, env_id) hyperparams, stats_path = get_saved_hyperparams( stats_path, norm_reward=args.norm_reward, test_mode=True) # load env_kwargs if existing # env_kwargs = {} # args_path = os.path.join(log_path, env_id, "args.yml") # if os.path.isfile(args_path): # with open(args_path, "r") as f: # loaded_args = yaml.load(f, Loader=yaml.UnsafeLoader) # pytype: disable=module-attr # if loaded_args["env_kwargs"] is not None: # env_kwargs = loaded_args["env_kwargs"] # # overwrite with command line arguments # if args.env_kwargs is not None: # env_kwargs.update(args.env_kwargs) args.watch_eval = True env_kwargs = { "xml": train_files[args.body_id], "param": train_params[args.body_id], "render": args.watch_eval, } log_dir = args.reward_log if args.reward_log != "" else None env = create_test_env( env_id, n_envs=args.n_envs, stats_path=stats_path, seed=args.seed, log_dir=log_dir, should_render=not args.no_render, hyperparams=hyperparams, env_kwargs=env_kwargs, ) kwargs = dict(seed=args.seed) if algo in ["dqn", "ddpg", "sac", "her", "td3", "tqc"]: # Dummy buffer size as we don't need memory to enjoy the trained agent kwargs.update(dict(buffer_size=1)) model = ALGOS[algo].load(model_path, env=env, **kwargs) obs = env.reset() # Force deterministic for DQN, DDPG, SAC and HER (that is a wrapper around) deterministic = args.deterministic or algo in [ "dqn", "ddpg", "sac", "her", "td3", "tqc" ] and not args.stochastic state = None episode_reward = 0.0 episode_rewards, episode_lengths = [], [] ep_len = 0 # For HER, monitor success rate successes = [] for _ in range(args.n_timesteps): action, state = model.predict(obs, state=state, deterministic=deterministic) # Random Agent # action = [env.action_space.sample()] # Clip Action to avoid out of bound errors if isinstance(env.action_space, gym.spaces.Box): action = np.clip(action, env.action_space.low, env.action_space.high) obs, reward, done, infos = env.step(action) sleep(0.01) if not args.no_render: env.render("human") episode_reward += reward[0] ep_len += 1 if args.n_envs == 1: # For atari the return reward is not the atari score # so we have to get it from the infos dict if is_atari and infos is not None and args.verbose >= 1: episode_infos = infos[0].get("episode") if episode_infos is not None: print(f"Atari Episode Score: {episode_infos['r']:.2f}") print("Atari Episode Length", episode_infos["l"]) if done and not is_atari and args.verbose > 0: # NOTE: for env using VecNormalize, the mean reward # is a normalized reward when `--norm_reward` flag is passed print(f"Episode Reward: {episode_reward:.2f}") print("Episode Length", ep_len) episode_rewards.append(episode_reward) episode_lengths.append(ep_len) episode_reward = 0.0 ep_len = 0 state = None # Reset also when the goal is achieved when using HER if done and infos[0].get("is_success") is not None: if args.verbose > 1: print("Success?", infos[0].get("is_success", False)) # Alternatively, you can add a check to wait for the end of the episode if done: obs = env.reset() if infos[0].get("is_success") is not None: successes.append(infos[0].get("is_success", False)) episode_reward, ep_len = 0.0, 0 if args.verbose > 0 and len(successes) > 0: print("Success rate: {:.2f}%".format(100 * 
np.mean(successes))) if args.verbose > 0 and len(episode_rewards) > 0: print("Mean reward: {:.2f} +/- {:.2f}".format(np.mean(episode_rewards), np.std(episode_rewards))) if args.verbose > 0 and len(episode_lengths) > 0: print("Mean episode length: {:.2f} +/- {:.2f}".format( np.mean(episode_lengths), np.std(episode_lengths))) # Workaround for https://github.com/openai/gym/issues/893 if not args.no_render: if args.n_envs == 1 and "Bullet" not in env_id and not is_atari and isinstance( env, VecEnv): # DummyVecEnv # Unwrap env while isinstance(env, VecEnvWrapper): env = env.venv if isinstance(env, DummyVecEnv): env.envs[0].env.close() else: env.close() else: # SubprocVecEnv env.close()
import utils

if __name__ == "__main__":  # noqa: C901
    folder = utils.folder
    os.makedirs(folder, exist_ok=True)

    hyperparams = utils.load_hyperparameters()
    normalize_kwargs = {}
    normalize_kwargs["gamma"] = hyperparams["gamma"]

    args = utils.args

    # PPO.learn needs this. If you use SubprocVecEnv instead of DummyVecEnv,
    # you need to seed in each subprocess.
    set_random_seed(utils.seed)

    debug = args.debug
    train_on_both_bodies = args.train_on_both_bodies
    with_bodyinfo = args.with_bodyinfo

    train_num_envs = 16 if not debug else 2
    total_timesteps = 5e6 if not debug else 1

    if train_on_both_bodies:
        training_bodies = args.train_bodies
        print(training_bodies)
        if with_bodyinfo:
            env = DummyVecEnv([
                utils.make_env(rank=i, seed=utils.seed, render=args.render,
                               robot_body=training_bodies[i % 2],
                               body_info=training_bodies[i % 2])
                for i in range(train_num_envs)
            ])
            save_filename = f"model-ant-{training_bodies[0]}-{training_bodies[1]}-with-bodyinfo"
        else:
def main(): # noqa: C901 parser = argparse.ArgumentParser() parser.add_argument( "--env", help="environment ID", type=str, default="CartPole-v1") parser.add_argument( "-f", "--log-folder", help="Log folder", type=str, default="rl-trained-agents") parser.add_argument( "--algo", help="RL Algorithm", default="ppo", type=str, required=False, choices=list(ALGOS.keys())) parser.add_argument( "-n", "--n-eval-steps", help="Number of evaluation timesteps", default=1000, type=int) parser.add_argument( "--num-threads", help="Number of threads for PyTorch (-1 to use default)", default=-1, type=int) parser.add_argument( "--n-envs", help="number of environments", default=1, type=int) parser.add_argument( "--exp-id", help="Experiment ID (default: 0: latest, -1: no exp folder)", default=0, type=int) parser.add_argument( "--verbose", help="Verbose mode (0: no output, 1: INFO)", default=1, type=int) parser.add_argument( '--render', help="1: Render environment, 0: don't render", type=int, choices=[0, 1], default=0) parser.add_argument( '--deterministic', help="1: Use deterministic actions, 0: Use stochastic actions", type=int, choices=[0, 1], default=0) parser.add_argument( "--load-best", action="store_true", default=False, help="Load best model instead of last model if available") parser.add_argument( "--load-checkpoint", type=int, help="Load checkpoint instead of last model if available, " "you must pass the number of timesteps corresponding to it", ) parser.add_argument( "--stochastic", action="store_true", default=False, help="Use stochastic actions (for DDPG/DQN/SAC)") parser.add_argument( "--norm-reward", action="store_true", default=False, help="Normalize reward if applicable (trained with VecNormalize)") parser.add_argument( "--seed", help="Random generator seed", type=int, default=0) parser.add_argument( "--reward-log", help="Where to log reward", default="", type=str) parser.add_argument( "--gym-packages", type=str, nargs="+", default=[], help="Additional external Gym environemnt package modules to import (e.g. 
gym_minigrid)") parser.add_argument( "--env-kwargs", type=str, nargs="+", action=StoreDict, help="Optional keyword argument to pass to the env constructor") parser.add_argument( '--log-info', help="1: Log information at each evaluation steps and save, 0: don't log", type=int, choices=[0, 1], default=0) parser.add_argument( "--plot-dim", help="Plot end effector and goal position in real time (0: Don't plot, 2: 2D (default), 3: 3D)", type=int, default=0, choices=[0, 2, 3]) args = parser.parse_args() ################################# # Prepare log if needed if args.log_info: log_df = pd.DataFrame() log_dict = OrderedDict() # Prepare plot if needed if args.plot_dim == 2: fig, (ax1, ax2) = plt.subplots(2, 1, sharey=True, figsize=(5, 10)) elif args.plot_dim == 3: fig = plt.figure() ax = fig.gca(projection='3d') # Going through custom gym packages to let them register # in the global registry for env_module in args.gym_packages: importlib.import_module(env_module) env_id = args.env algo = args.algo folder = args.log_folder if args.exp_id == 0: args.exp_id = get_latest_run_id(os.path.join(folder, algo), env_id) print(f"Loading latest experiment, id={args.exp_id}") # Sanity checks if args.exp_id > 0: log_path = os.path.join(folder, algo, f"{env_id}_{args.exp_id}") else: log_path = os.path.join(folder, algo) assert os.path.isdir(log_path), f"The {log_path} folder was not found" found = False for ext in ["zip"]: model_path = os.path.join(log_path, f"{env_id}.{ext}") found = os.path.isfile(model_path) if found: break if args.load_best: model_path = os.path.join(log_path, "best_model.zip") found = os.path.isfile(model_path) if args.load_checkpoint is not None: model_path = os.path.join( log_path, f"rl_model_{args.load_checkpoint}_steps.zip") found = os.path.isfile(model_path) if not found: raise ValueError( f"No model found for {algo} on {env_id}, path: {model_path}") off_policy_algos = ["dqn", "ddpg", "sac", "her", "td3", "tqc"] if algo in off_policy_algos: args.n_envs = 1 set_random_seed(args.seed) if args.num_threads > 0: if args.verbose > 1: print(f"Setting torch.num_threads to {args.num_threads}") th.set_num_threads(args.num_threads) stats_path = os.path.join(log_path, env_id) hyperparams, stats_path = get_saved_hyperparams( stats_path, norm_reward=args.norm_reward, test_mode=True) # load env_kwargs if existing env_kwargs = {} args_path = os.path.join(log_path, env_id, "args.yml") if os.path.isfile(args_path): with open(args_path, "r") as f: # pytype: disable=module-attr loaded_args = yaml.load(f, Loader=yaml.UnsafeLoader) if loaded_args["env_kwargs"] is not None: env_kwargs = loaded_args["env_kwargs"] # overwrite with command line arguments if args.env_kwargs is not None: env_kwargs.update(args.env_kwargs) log_dir = args.reward_log if args.reward_log != "" else None env = create_test_env( env_id, n_envs=args.n_envs, stats_path=stats_path, seed=args.seed, log_dir=log_dir, should_render=args.render, hyperparams=hyperparams, env_kwargs=env_kwargs, ) kwargs = dict(seed=args.seed) if algo in off_policy_algos: # Dummy buffer size as we don't need memory to enjoy the trained agent kwargs.update(dict(buffer_size=1)) model = ALGOS[algo].load(model_path, env=env, **kwargs) obs = env.reset() # Force deterministic for DQN, DDPG, SAC and HER (that is a wrapper around) deterministic = args.deterministic or algo in off_policy_algos and not args.stochastic state = None episode_reward = 0.0 episode_rewards, episode_lengths = [], [] ep_len = 0 successes = [] # For HER, monitor success rate episode_nb = 0 
success_threshold_50 = 0.05 success_threshold_20 = 0.02 success_threshold_10 = 0.01 success_threshold_5 = 0.005 success_threshold_2 = 0.002 success_threshold_1 = 0.001 success_threshold_05 = 0.0005 ep_success_list_50 = [] ep_success_list_20 = [] ep_success_list_10 = [] ep_success_list_5 = [] ep_success_list_2 = [] ep_success_list_1 = [] ep_success_list_05 = [] success_list_50 = [] success_list_20 = [] success_list_10 = [] success_list_5 = [] success_list_2 = [] success_list_1 = [] success_list_05 = [] # Moved render flag outside the loop (Pierre) if args.render: env.render("human") for t in range(args.n_eval_steps): action, state = model.predict( obs, state=state, deterministic=deterministic) obs, reward, done, infos = env.step(action) # Slow down simulation when rendering (Pierre) if args.render: if "widowx" in env_id: time.sleep(1. / 30.) else: env.render() if "widowx" in env_id: # Update episode success list ep_success_list_50 = calc_ep_success( success_threshold_50, ep_success_list_50, infos) ep_success_list_20 = calc_ep_success( success_threshold_20, ep_success_list_20, infos) ep_success_list_10 = calc_ep_success( success_threshold_10, ep_success_list_10, infos) ep_success_list_5 = calc_ep_success( success_threshold_5, ep_success_list_5, infos) ep_success_list_2 = calc_ep_success( success_threshold_2, ep_success_list_2, infos) ep_success_list_1 = calc_ep_success( success_threshold_1, ep_success_list_1, infos) ep_success_list_05 = calc_ep_success( success_threshold_05, ep_success_list_05, infos) episode_reward += reward[0] ep_len += 1 # Real time plot if args.plot_dim == 2: goal = infos[0]['goal_position'] tip = infos[0]['tip_position'] ax1.cla() ax1.plot(goal[0], goal[2], marker='o', color='g', linestyle='', markersize=10, label="goal", alpha=0.5) ax1.plot(tip[0], tip[2], marker='x', color='r', linestyle='', markersize=10, label="end effector", mew=3) circ_1_50 = plt.Circle( (goal[0], goal[2]), radius=success_threshold_50, edgecolor='g', facecolor='w', linestyle='--', label="50 mm") circ_1_20 = plt.Circle( (goal[0], goal[2]), radius=success_threshold_20, edgecolor='b', facecolor='w', linestyle='--', label="20 mm") circ_1_10 = plt.Circle( (goal[0], goal[2]), radius=success_threshold_10, edgecolor='m', facecolor='w', linestyle='--', label="10 mm") circ_1_5 = plt.Circle( (goal[0], goal[2]), radius=success_threshold_5, edgecolor='r', facecolor='w', linestyle='--', label="5 mm") ax1.add_patch(circ_1_50) ax1.add_patch(circ_1_20) ax1.add_patch(circ_1_10) ax1.add_patch(circ_1_5) ax1.set_xlim([-0.25, 0.25]) ax1.set_ylim([0, 0.5]) ax1.set_xlabel("x (m)", fontsize=15) ax1.set_ylabel("z (m)", fontsize=15) ax2.cla() ax2.plot(goal[1], goal[2], marker='o', color='g', linestyle='', markersize=10, alpha=0.5) ax2.plot( tip[1], tip[2], marker='x', color='r', linestyle='', markersize=10, mew=3) circ_2_50 = plt.Circle( (goal[1], goal[2]), radius=success_threshold_50, edgecolor='g', facecolor='w', linestyle='--') circ_2_20 = plt.Circle( (goal[1], goal[2]), radius=success_threshold_20, edgecolor='b', facecolor='w', linestyle='--') circ_2_10 = plt.Circle( (goal[1], goal[2]), radius=success_threshold_10, edgecolor='m', facecolor='w', linestyle='--') circ_2_5 = plt.Circle( (goal[1], goal[2]), radius=success_threshold_5, edgecolor='r', facecolor='w', linestyle='--') ax2.add_patch(circ_2_50) ax2.add_patch(circ_2_20) ax2.add_patch(circ_2_10) ax2.add_patch(circ_2_5) ax2.set_xlim([-0.25, 0.25]) ax2.set_ylim([0, 0.5]) ax2.set_xlabel("y (m)", fontsize=15) ax2.set_ylabel("z (m)", fontsize=15) ax1.legend(loc='upper 
left', bbox_to_anchor=( 0, 1.2), ncol=3, fancybox=True, shadow=True) fig.suptitle("timestep " + str(ep_len) + " | distance to target: " + str(round(infos[0]['new_distance'] * 1000, 1)) + " mm") plt.pause(0.01) # plt.show() elif args.plot_dim == 3: goal = infos[0]['goal_position'] tip = infos[0]['tip_position'] ax.cla() ax.plot([goal[0]], [goal[1]], zs=[goal[2]], marker='o', color='g', linestyle='', markersize=10, alpha=0.5) ax.plot([tip[0]], [tip[1]], zs=[tip[2]], marker='x', color='r', linestyle='', markersize=10, mew=3) ax.set_xlim([-0.2, 0.2]) ax.set_ylim([-0.2, 0.2]) ax.set_zlim([0, 0.5]) ax.set_xlabel("x (m)", fontsize=15) ax.set_ylabel("y (m)", fontsize=15) ax.set_zlabel("z (m)", fontsize=15) fig.suptitle("timestep " + str(ep_len) + " | distance to target: " + str(round(infos[0]['new_distance'] * 1000, 1)) + " mm") plt.pause(0.01) # plt.show() if args.log_info: log_dict['episode'] = episode_nb log_dict['timestep'] = t log_dict['action_1'] = action[0][0] log_dict['action_2'] = action[0][1] log_dict['action_3'] = action[0][2] log_dict['action_4'] = action[0][3] log_dict['action_5'] = action[0][4] log_dict['action_6'] = action[0][5] log_dict['old_joint_pos_1'] = infos[0]['old_joint_pos'][0] log_dict['old_joint_pos_2'] = infos[0]['old_joint_pos'][1] log_dict['old_joint_pos_3'] = infos[0]['old_joint_pos'][2] log_dict['old_joint_pos_4'] = infos[0]['old_joint_pos'][3] log_dict['old_joint_pos_5'] = infos[0]['old_joint_pos'][4] log_dict['old_joint_pos_6'] = infos[0]['old_joint_pos'][5] log_dict['new_joint_pos_1'] = infos[0]['new_joint_pos'][0] log_dict['new_joint_pos_2'] = infos[0]['new_joint_pos'][1] log_dict['new_joint_pos_3'] = infos[0]['new_joint_pos'][2] log_dict['new_joint_pos_4'] = infos[0]['new_joint_pos'][3] log_dict['new_joint_pos_5'] = infos[0]['new_joint_pos'][4] log_dict['new_joint_pos_6'] = infos[0]['new_joint_pos'][5] log_dict['joint_vel_1'] = infos[0]['joint_vel'][0] log_dict['joint_vel_2'] = infos[0]['joint_vel'][1] log_dict['joint_vel_3'] = infos[0]['joint_vel'][2] log_dict['joint_vel_4'] = infos[0]['joint_vel'][3] log_dict['joint_vel_5'] = infos[0]['joint_vel'][4] log_dict['joint_vel_6'] = infos[0]['joint_vel'][5] log_dict['joint1_min'] = -3.1 log_dict['joint1_max'] = 3.1 log_dict['joint2_min'] = -1.571 log_dict['joint2_max'] = 1.571 log_dict['joint3_min'] = -1.571 log_dict['joint3_max'] = 1.571 log_dict['joint4_min'] = -1.745 log_dict['joint4_max'] = 1.745 log_dict['joint5_min'] = -2.617 log_dict['joint5_max'] = 2.617 log_dict['joint6_min'] = 0.003 log_dict['joint6_max'] = 0.03 log_dict['action_low1'] = env.action_space.low[0] log_dict['action_low2'] = env.action_space.low[1] log_dict['action_low3'] = env.action_space.low[2] log_dict['action_low4'] = env.action_space.low[3] log_dict['action_low5'] = env.action_space.low[4] log_dict['action_low6'] = env.action_space.low[5] log_dict['action_high1'] = env.action_space.high[0] log_dict['action_high2'] = env.action_space.high[1] log_dict['action_high3'] = env.action_space.high[2] log_dict['action_high4'] = env.action_space.high[3] log_dict['action_high5'] = env.action_space.high[4] log_dict['action_high6'] = env.action_space.high[5] log_dict['reward'] = reward[0] log_dict['return'] = episode_reward log_dict['new_distance'] = infos[0]['new_distance'] log_dict['old_distance'] = infos[0]['old_distance'] log_dict['target_x'] = infos[0]['goal_position'][0] log_dict['target_y'] = infos[0]['goal_position'][1] log_dict['target_z'] = infos[0]['goal_position'][2] log_dict['tip_y'] = infos[0]['tip_position'][1] log_dict['tip_x'] = 
infos[0]['tip_position'][0] log_dict['tip_z'] = infos[0]['tip_position'][2] log_dict['done'] = done[0] # log_dict['obs'] = obs # log_dict['obs_space_low'] = env.observation_space.low # log_dict['obs_space_high'] = env.observation_space.high log_df = log_df.append(log_dict, ignore_index=True) if args.n_envs == 1: if done and args.verbose > 0: # NOTE: for env using VecNormalize, the mean reward # is a normalized reward when `--norm_reward` flag is passed # print(f"Episode Reward: {episode_reward:.2f}") # commented by Pierre # print("Episode Length", ep_len) # commented by Pierre episode_rewards.append(episode_reward) episode_lengths.append(ep_len) episode_nb += 1 if "widowx" in env_id: # append the last element of the episode success list when # episode is done success_list_50 = calc_success_list( ep_success_list_50, success_list_50) success_list_20 = calc_success_list( ep_success_list_20, success_list_20) success_list_10 = calc_success_list( ep_success_list_10, success_list_10) success_list_5 = calc_success_list( ep_success_list_5, success_list_5) success_list_2 = calc_success_list( ep_success_list_2, success_list_2) success_list_1 = calc_success_list( ep_success_list_1, success_list_1) success_list_05 = calc_success_list( ep_success_list_05, success_list_05) # If the episode is successful and it starts from an # unsucessful step, calculate reach time reachtime_list_50 = calc_reach_time(ep_success_list_50) reachtime_list_20 = calc_reach_time(ep_success_list_20) reachtime_list_10 = calc_reach_time(ep_success_list_10) reachtime_list_5 = calc_reach_time(ep_success_list_5) reachtime_list_2 = calc_reach_time(ep_success_list_2) reachtime_list_1 = calc_reach_time(ep_success_list_1) reachtime_list_05 = calc_reach_time(ep_success_list_05) if args.log_info: log_df = log_df[log_dict.keys()] # sort columns # add estimated tip velocity and acceleration (according to # the documentation, 1 timestep = 240 Hz) log_df['est_vel'] = log_df['new_distance'].diff() * 240 log_df['est_vel'].loc[0] = 0 # initial velocity is 0 log_df['est_acc'] = log_df['est_vel'].diff() * 240 log_df['est_acc'].loc[0] = 0 # initial acceleration is 0 log_df.to_csv( log_path + "/res_episode_" + str(episode_nb) + ".csv", index=False) # slow # log_df.to_pickle(log_path+"/res_episode_"+str(episode)+".pkl") # # fast # Reset for the new episode episode_reward = 0.0 ep_len = 0 state = None ep_success_list_50 = [] ep_success_list_20 = [] ep_success_list_10 = [] ep_success_list_5 = [] ep_success_list_2 = [] ep_success_list_1 = [] ep_success_list_05 = [] # Reset also when the goal is achieved when using HER if done and infos[0].get("is_success") is not None: if args.verbose > 1: print("Success?", infos[0].get("is_success", False)) # Alternatively, you can add a check to wait for the end of the # episode if done: obs = env.reset() if infos[0].get("is_success") is not None: successes.append(infos[0].get("is_success", False)) episode_reward, ep_len = 0.0, 0 if args.verbose > 0 and len(successes) > 0: print(f"Success rate: {100 * np.mean(successes):.2f}%") if args.verbose > 0 and len(episode_lengths) > 0: print( f"Mean episode length: {np.mean(episode_lengths):.2f} +/- {np.std(episode_lengths):.2f}") if args.verbose > 0 and len(episode_rewards) > 0: print( f"Mean reward: {np.mean(episode_rewards):.2f} +/- {np.std(episode_rewards):.2f}") if "widowx" in env_id: SR_mean_50, RT_mean_50 = calc_mean_successratio_reachtime( success_threshold_50, success_list_50, reachtime_list_50) SR_mean_20, RT_mean_20 = calc_mean_successratio_reachtime( 
success_threshold_20, success_list_20, reachtime_list_20) SR_mean_10, RT_mean_10 = calc_mean_successratio_reachtime( success_threshold_10, success_list_10, reachtime_list_10) SR_mean_5, RT_mean_5 = calc_mean_successratio_reachtime( success_threshold_5, success_list_5, reachtime_list_5) SR_mean_2, RT_mean_2 = calc_mean_successratio_reachtime( success_threshold_2, success_list_2, reachtime_list_2) SR_mean_1, RT_mean_1 = calc_mean_successratio_reachtime( success_threshold_1, success_list_1, reachtime_list_1) SR_mean_05, RT_mean_05 = calc_mean_successratio_reachtime( success_threshold_05, success_list_05, reachtime_list_05) # log metrics to stats.csv d = { "Eval mean reward": np.mean(episode_rewards), "Eval std": np.std(episode_rewards), "success ratio 50mm": SR_mean_50, "Average reach time 50mm": RT_mean_50, "success ratio 20mm": SR_mean_20, "Average reach time 20mm": RT_mean_20, "success ratio 10mm": SR_mean_10, "Average reach time 10mm": RT_mean_10, "success ratio 5mm": SR_mean_5, "Average reach time 5mm": RT_mean_5, "success ratio 2mm": SR_mean_2, "Average reach time 2mm": RT_mean_2, "success ratio 1mm": SR_mean_1, "Average reach time 1mm": RT_mean_1, "success ratio 0.5mm": SR_mean_05, "Average reach time 0.5mm": RT_mean_05 } # print("path:", log_path) df = pd.DataFrame(d, index=[0]) df.to_csv(log_path + "/stats.csv", index=False) # Workaround for https://github.com/openai/gym/issues/893 if args.render: if args.n_envs == 1 and "Bullet" not in env_id and isinstance( env, VecEnv): # DummyVecEnv # Unwrap env while isinstance(env, VecEnvWrapper): env = env.venv if isinstance(env, DummyVecEnv): env.envs[0].env.close() else: env.close() else: # SubprocVecEnv env.close()