Example #1
def make_env(rank, seed=0):
    num_boxes = 1
    alg_version = 0
    dim_room = (7, 7)
    train_mode = 'mlp'
    agent_lb_path = None
    agent_ub_path = None
    """
    Utility function for multiprocessed env.

    :param env_id: (str) the environment ID
    :param num_env: (int) the number of environments you wish to have in subprocesses
    :param seed: (int) the initial seed for RNG
    :param rank: (int) index of the subprocess
    """
    def _init():
        env = ALGEnv(dim_room=dim_room,
                     num_boxes=num_boxes,
                     train_mode=train_mode,
                     alg_version=alg_version,
                     agent_lb_path=agent_lb_path,
                     agent_ub_path=agent_ub_path)
        env.seed(seed + rank)
        return env

    set_random_seed(seed)
    return _init
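
The factory above returns a callable instead of an environment so that each subprocess can build and seed its own instance. A minimal usage sketch (the number of workers and the seed are illustrative, and it assumes the make_env defined above):

from stable_baselines3.common.vec_env import SubprocVecEnv

if __name__ == "__main__":
    num_cpu = 4  # illustrative number of worker processes
    # SubprocVecEnv calls each factory inside its own subprocess
    vec_env = SubprocVecEnv([make_env(rank=i, seed=0) for i in range(num_cpu)])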
Example #2
def enjoy(stats_path,
          model_path,
          dataset,
          body_id,
          algo,
          n_timesteps=200,
          test_time=3,
          render=False,
          seed=0):
    dataset_name, env_id, train_files, train_params, train_names, test_files, test_params, test_names = load_dataset.load_dataset(
        dataset, seed=0, shuffle=False, train_proportion=1)
    set_random_seed(seed * 128 + 127)
    hyperparams, stats_path = get_saved_hyperparams(stats_path,
                                                    norm_reward=False,
                                                    test_mode=True)
    env_kwargs = {
        "xml": train_files[body_id],
        "param": train_params[body_id],
        "max_episode_steps": n_timesteps + 1,
        "render": render,
    }
    env = create_test_env(
        env_id,
        n_envs=1,
        stats_path=stats_path,
        seed=seed,
        log_dir="tmp/",
        should_render=False,
        hyperparams=hyperparams,
        env_kwargs=env_kwargs,
    )
    kwargs = dict(seed=seed)
    model = ALGOS[algo].load(model_path, env=env, **kwargs)
    obs = env.reset()
    state = None
    episode_reward = 0.0
    episode_rewards, episode_lengths = [], []
    ep_len = 0

    body_x_record = []
    for _run in range(test_time):
        body_x = 0
        for _step in range(n_timesteps):
            action, state = model.predict(obs, state=state, deterministic=True)
            if isinstance(env.action_space, gym.spaces.Box):
                action = np.clip(action, env.action_space.low,
                                 env.action_space.high)
            body_x = env.envs[0].robot.body_xyz[0]
            obs, reward, done, infos = env.step(action)
            episode_reward += reward[0]
            ep_len += 1
            if render:
                sleep(0.01)
            if done:
                break
        body_x_record.append(body_x)
        obs = env.reset()
    body_x_record = np.array(body_x_record)
    env.close()
    return body_x_record
Example #3
def make_env(env_id, rank, seed=0, sigma=0.1):
    def _init():
        env = gym.make(env_id, sigma=sigma, r=(rank + 1) / 10)
        env.seed(seed + rank)
        return env

    set_random_seed(seed)
    return _init
Example #4
def make_env(env_id, rank, seed=0):
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env

    set_random_seed(seed)
    return _init
Example #5
def set_seed(seed=seed):
    os.environ['PYTHONHASHSEED'] = str(seed)

    _, seed = seeding.np_random(seed)
    random.seed(seed)
    set_random_seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
Example #6
def test_kl_divergence(dist_type):
    set_random_seed(8)
    # Test 1: same distribution should have KL Div = 0
    dist1 = dist_type
    dist2 = dist_type
    # PyTorch implementation of kl_divergence doesn't sum across dimensions
    assert th.allclose(kl_divergence(dist1, dist2).sum(), th.tensor(0.0))

    # Test 2: KL Div = E(Unbiased approx KL Div)
    if isinstance(dist_type, CategoricalDistribution):
        dist1 = dist_type.proba_distribution(th.rand(N_ACTIONS).repeat(N_SAMPLES, 1))
        # deepcopy needed to assign new memory to new distribution instance
        dist2 = deepcopy(dist_type).proba_distribution(th.rand(N_ACTIONS).repeat(N_SAMPLES, 1))
    elif isinstance(dist_type, DiagGaussianDistribution) or isinstance(dist_type, SquashedDiagGaussianDistribution):
        mean_actions1 = th.rand(1).repeat(N_SAMPLES, 1)
        log_std1 = th.rand(1).repeat(N_SAMPLES, 1)
        mean_actions2 = th.rand(1).repeat(N_SAMPLES, 1)
        log_std2 = th.rand(1).repeat(N_SAMPLES, 1)
        dist1 = dist_type.proba_distribution(mean_actions1, log_std1)
        dist2 = deepcopy(dist_type).proba_distribution(mean_actions2, log_std2)
    elif isinstance(dist_type, BernoulliDistribution):
        dist1 = dist_type.proba_distribution(th.rand(1).repeat(N_SAMPLES, 1))
        dist2 = deepcopy(dist_type).proba_distribution(th.rand(1).repeat(N_SAMPLES, 1))
    elif isinstance(dist_type, MultiCategoricalDistribution):
        dist1 = dist_type.proba_distribution(th.rand(1, sum([N_ACTIONS, N_ACTIONS])).repeat(N_SAMPLES, 1))
        dist2 = deepcopy(dist_type).proba_distribution(th.rand(1, sum([N_ACTIONS, N_ACTIONS])).repeat(N_SAMPLES, 1))
    elif isinstance(dist_type, StateDependentNoiseDistribution):
        dist1 = StateDependentNoiseDistribution(1)
        dist2 = deepcopy(dist1)
        state = th.rand(1, N_FEATURES).repeat(N_SAMPLES, 1)
        mean_actions1 = th.rand(1).repeat(N_SAMPLES, 1)
        mean_actions2 = th.rand(1).repeat(N_SAMPLES, 1)
        _, log_std = dist1.proba_distribution_net(N_FEATURES, log_std_init=th.log(th.tensor(0.2)))
        dist1.sample_weights(log_std, batch_size=N_SAMPLES)
        dist2.sample_weights(log_std, batch_size=N_SAMPLES)
        dist1 = dist1.proba_distribution(mean_actions1, log_std, state)
        dist2 = dist2.proba_distribution(mean_actions2, log_std, state)

    full_kl_div = kl_divergence(dist1, dist2).mean(dim=0)
    actions = dist1.get_actions()
    approx_kl_div = (dist1.log_prob(actions) - dist2.log_prob(actions)).mean(dim=0)

    assert th.allclose(full_kl_div, approx_kl_div, rtol=5e-2)

    # Test 3 Sanity test with easy Bernoulli distribution
    if isinstance(dist_type, BernoulliDistribution):
        dist1 = BernoulliDistribution(1).proba_distribution(th.tensor([0.3]))
        dist2 = BernoulliDistribution(1).proba_distribution(th.tensor([0.65]))

        full_kl_div = kl_divergence(dist1, dist2)

        actions = th.tensor([0.0, 1.0])
        ad_hoc_kl = th.sum(
            th.exp(dist1.distribution.log_prob(actions))
            * (dist1.distribution.log_prob(actions) - dist2.distribution.log_prob(actions))
        )

        assert th.allclose(full_kl_div, ad_hoc_kl)
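
The unbiased estimator checked in Test 2, KL(p || q) = E_{x~p}[log p(x) - log q(x)], can be reproduced outside the SB3 distribution wrappers. A small self-contained sketch using torch.distributions directly (distribution parameters, sample count, and tolerances are illustrative):

import torch as th

th.manual_seed(0)
p = th.distributions.Normal(loc=0.0, scale=1.0)
q = th.distributions.Normal(loc=0.3, scale=1.2)
# Monte Carlo estimate of KL(p || q) from samples drawn under p
x = p.sample((1_000_000,))
approx_kl = (p.log_prob(x) - q.log_prob(x)).mean()
exact_kl = th.distributions.kl_divergence(p, q)
assert th.allclose(exact_kl, approx_kl, rtol=5e-2, atol=1e-3)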
Example #7
def test_categorical(dist, CAT_ACTIONS):
    # The entropy can be approximated by averaging the negative log likelihood
    # mean negative log likelihood == entropy
    set_random_seed(1)
    action_logits = th.rand(N_SAMPLES, CAT_ACTIONS)
    dist = dist.proba_distribution(action_logits)
    actions = dist.get_actions()
    entropy = dist.entropy()
    log_prob = dist.log_prob(actions)
    assert th.allclose(entropy.mean(), -log_prob.mean(), rtol=5e-3)
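
The same identity (the mean negative log likelihood of sampled actions approximates the entropy) can be checked directly with torch.distributions; a short sketch with illustrative logits and sample count:

import torch as th

th.manual_seed(0)
dist = th.distributions.Categorical(logits=th.rand(4))
samples = dist.sample((1_000_000,))
# The average negative log likelihood of samples from the distribution
# converges to its entropy.
mean_nll = -dist.log_prob(samples).mean()
assert th.allclose(dist.entropy(), mean_nll, rtol=5e-3)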
Example #8
def make_env(rank, sparse=False, seed=0):
    """
    Utility function for multiprocessed env.
    """
    def _init():
        env = MazeGridEnv()
        if sparse:
            env = SparseRewardWrapper(env)
        env.seed(seed + rank)
        return env

    set_random_seed(seed)
    return _init
Example #9
    def set_random_seed(self, seed: Optional[int] = None) -> None:
        """
        Set the seed of the pseudo-random generators
        (python, numpy, pytorch, gym, action_space)

        :param seed:
        """
        if seed is None:
            return
        set_random_seed(seed, using_cuda=self.device.type == th.device("cuda").type)
        self.attacker_action_space.seed(seed)
        if self.env is not None:
            self.env.seed(seed)
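
Gym spaces carry their own random state, which is why the method above seeds the action space in addition to the global generators. A small illustrative sketch (the Box definition and seed value are arbitrary):

import gym

space = gym.spaces.Box(low=-1.0, high=1.0, shape=(2,))
space.seed(42)
print(space.sample())  # reproducible now that the space's own RNG is seeded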
Example #10
def run(params):
    device, use_cuda = helper.get_pytorch_device()
    sb3_utils.set_random_seed(params.seed, using_cuda=use_cuda)
    env = helper.make_env(params)

    model = network.get_model_class(params)(env).to(device)
    optimizer = optim.Adam(model.parameters(), lr=params.learning_rate)

    ep_no = 0
    total_rewards = []

    while ep_no < params.num_episodes:
        ep_no += 1
        state_vals, log_probs = [], []

        # unroll the policy
        state = env.reset()
        rewards = []
        # For each episode, only run 10_000 steps so that we
        # don't loop forever while learning
        for t in range(10_000):
            state = torch.from_numpy(state).float().unsqueeze(0).to(device)
            probs, state_value = model(state)
            c = Categorical(probs)
            action = c.sample()
            log_probs.append(c.log_prob(action))
            state_vals.append(state_value)

            state, reward, done, info = env.step(action.item())

            rewards.append(reward)
            if done:
                break
        total_rewards.append(sum(rewards))

        reinforce_helper.log_results(ep_no, total_rewards, info, t, params)

        returns = reinforce_helper.discount_rewards(rewards, params)

        # backprop
        state_vals = torch.stack(state_vals).squeeze()
        returns = returns.to(device)
        policy_loss = (-torch.stack(log_probs).squeeze(1) *
                       (returns - state_vals).detach()).mean()
        baseline_loss = F.smooth_l1_loss(state_vals, returns, reduction='mean')

        # reset gradients
        optimizer.zero_grad()
        loss = policy_loss + params.scaling_factor * baseline_loss
        loss.backward()
        optimizer.step()
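
reinforce_helper.discount_rewards is not shown in this snippet; a hypothetical sketch of what that step commonly looks like (the gamma default and the normalization are assumptions, not taken from the original helper):

import torch

def discount_rewards(rewards, gamma=0.99):
    # Discounted returns G_t = r_t + gamma * G_{t+1}, computed backwards
    returns, g = [], 0.0
    for r in reversed(rewards):
        g = r + gamma * g
        returns.insert(0, g)
    returns = torch.tensor(returns, dtype=torch.float32)
    # Normalizing returns is a common, optional variance-reduction step
    return (returns - returns.mean()) / (returns.std() + 1e-8)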
Example #11
def test_sde_distribution():
    n_actions = 1
    deterministic_actions = th.ones(N_SAMPLES, n_actions) * 0.1
    state = th.ones(N_SAMPLES, N_FEATURES) * 0.3
    dist = StateDependentNoiseDistribution(n_actions, full_std=True, squash_output=False)

    set_random_seed(1)
    _, log_std = dist.proba_distribution_net(N_FEATURES)
    dist.sample_weights(log_std, batch_size=N_SAMPLES)

    dist = dist.proba_distribution(deterministic_actions, log_std, state)
    actions = dist.get_actions()

    assert th.allclose(actions.mean(), dist.distribution.mean.mean(), rtol=2e-3)
    assert th.allclose(actions.std(), dist.distribution.scale.mean(), rtol=2e-3)
Example #12
def make_env(env_id, rank, seed=0):
    """
    Utility function for multiprocessed env.

    :param env_id: (str) the environment ID
    :param num_env: (int) the number of environments you wish to have in subprocesses
    :param seed: (int) the initial seed for RNG
    :param rank: (int) index of the subprocess
    """
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env
    set_random_seed(seed)
    return _init
Example #13
def make_env(env_param, rank, seed=0):
    """
    Utility function for multiprocessed env.

    :param env_param: (dict) the environment params
    :param seed: (int) the initial seed for RNG
    :param rank: (int) index of the subprocess
    """
    def _init():
        e = env_change_input(**env_param)
        e.seed(seed + rank)
        return e

    set_random_seed(seed)
    return _init
Example #14
    def make_env(env: gym.Env, rank: int, seed: int = 0) -> Callable:
        """
        Utility function for multiprocessed env.
        
        :param env_id: (str) the environment ID
        :param num_env: (int) the number of environments you wish to have in subprocesses
        :param seed: (int) the initial seed for RNG
        :param rank: (int) index of the subprocess
        :return: (Callable)
        """
        def _init() -> gym.Env:
            env.seed(seed + rank)
            return env

        set_random_seed(seed)
        return _init
Example #15
def make_gym_env(env_id, rank, seed=0):
    """
    Utility function for multiprocessed env.

    :param env_id: (str) the environment ID
    :param seed: (int) the initial seed for RNG
    :param rank: (int) index of the subprocess
    """
    def _init():
        env = gym.make(env_id, reward_type="dense")
        env = gym.wrappers.FlattenObservation(env)
        env = Monitor(env)
        env.seed(seed + rank)
        return env

    set_random_seed(seed)
    return _init
Example #16
def make_training_env(env_id, options, rank, seed=0):
    """
    Utility function for multiprocessed env.

    :param env_id: (str) the environment ID
    :param options: (dict) additional arguments to pass to the specific environment class initializer
    :param seed: (int) the initial seed for RNG
    :param rank: (int) index of the subprocess
    """
    def _init():
        register_gripper(UltrasoundProbeGripper)
        env = GymWrapper(suite.make(env_id, **options))
        env = Monitor(env)
        env.seed(seed + rank)
        return env
    set_random_seed(seed)
    return _init
Example #17
def make_env(env_id: str, rank: int, seed: int = 1, log_dir=None) -> Callable:
    '''
    Utility function for multiprocessed env.
    
    :param env_id: (str) the environment ID
    :param num_env: (int) the number of environments you wish to have in subprocesses
    :param seed: (int) the initial seed for RNG
    :param rank: (int) index of the subprocess
    :return: (Callable)
    '''
    def _init() -> gym.Env:
        env = gym.make(env_id)
        env = Monitor(env, log_dir)
        env.seed(seed + rank)
        return env
    set_random_seed(seed)
    return _init
Example #18
def test_entropy(dist):
    # The entropy can be approximated by averaging the negative log likelihood
    # mean negative log likelihood == differential entropy
    set_random_seed(1)
    state = th.rand(N_SAMPLES, N_FEATURES)
    deterministic_actions = th.rand(N_SAMPLES, N_ACTIONS)
    _, log_std = dist.proba_distribution_net(N_FEATURES, log_std_init=th.log(th.tensor(0.2)))

    if isinstance(dist, DiagGaussianDistribution):
        dist = dist.proba_distribution(deterministic_actions, log_std)
    else:
        dist.sample_weights(log_std, batch_size=N_SAMPLES)
        dist = dist.proba_distribution(deterministic_actions, log_std, state)

    actions = dist.get_actions()
    entropy = dist.entropy()
    log_prob = dist.log_prob(actions)
    assert th.allclose(entropy.mean(), -log_prob.mean(), rtol=5e-3)
Example #19
    def _init():
        set_random_seed(seed + rank)
        env = gym.make(env_id, **env_kwargs)

        # Wrap first with a monitor (e.g. for Atari env where reward clipping is used)
        log_file = os.path.join(log_dir,
                                str(rank)) if log_dir is not None else None
        # Monitor success rate too for the real robot
        info_keywords = ('is_success', ) if 'NeckEnv' in env_id else ()
        env = Monitor(env, log_file, info_keywords=info_keywords)

        # Dict observation space is currently not supported.
        # https://github.com/hill-a/stable-baselines/issues/321
        # We allow a Gym env wrapper (a subclass of gym.Wrapper)
        if wrapper_class:
            env = wrapper_class(env)

        env.seed(seed + rank)
        return env
Example #20
def run(experiment: Experiment, params: argparse.Namespace):
    sb3_utils.set_random_seed(params.seed, using_cuda=use_cuda)
    env = helper.make_env(params, 'env')

    # Logs will be saved in log_dir/monitor.csv
    env = Monitor(env)

    with experiment.train():
        callback = SaveOnBestTrainingRewardCallback(experiment,
                                                    check_freq=1000)
        # Deactivate all the DQN extensions to have the original version
        # In practice, it is recommended to have them activated
        model = DQN(CnnPolicy,
                    env,
                    learning_rate=params.learning_rate,
                    gamma=params.gamma,
                    seed=params.seed,
                    max_grad_norm=params.max_grad_norm,
                    verbose=1,
                    device=device,
                    policy_kwargs={'features_extractor_class': ColoringCNN})
        model.learn(total_timesteps=params.max_ts, callback=callback)
Example #21
def make_env(env_id, rank, seed=0):
    """
    Utility function for multiprocessed env.

    :param env_id: (str) the environment ID
    :param num_env: (int) the number of environments you wish to have in subprocesses
    :param seed: (int) the initial seed for RNG
    :param rank: (int) index of the subprocess
    """
    def _init():
        if rank == 1:
            env = gym.make("fishing-v2", C=0.2)
        elif rank == 2:
            env = gym.make("fishing-v2", C=0.1)
        elif rank == 3:
            env = gym.make("fishing-v1")
        elif rank == 0:
            env = gym.make("fishing-v1", r=0.1)
        env.seed(seed)
        return env

    set_random_seed(seed)
    return _init
Example #22
def make_env(rank, seed=0):
    num_boxes = 1
    dim_room = (7, 7)
    train_mode = 'mlp'
    max_steps = 20
    """
    Utility function for multiprocessed env.

    :param env_id: (str) the environment ID
    :param num_env: (int) the number of environments you wish to have in subprocesses
    :param seed: (int) the initial seed for RNG
    :param rank: (int) index of the subprocess
    """
    def _init():
        env = SokobanEnv(dim_room=dim_room,
                         max_steps=max_steps,
                         num_boxes=num_boxes,
                         train_mode=train_mode)
        env.seed(seed + rank)
        return env

    set_random_seed(seed)
    return _init
Example #23
    if env_id not in registered_envs:
        try:
            closest_match = difflib.get_close_matches(env_id, registered_envs, n=1)[0]
        except IndexError:
            closest_match = "'no close match found...'"
        raise ValueError(
            f"{env_id} not found in gym registry, you maybe meant {closest_match}?"
        )

    # Unique id to ensure there is no race condition for the folder creation
    uuid_str = f"_{uuid.uuid4()}" if args.uuid else ""
    if args.seed < 0:
        # Seed but with a random one
        args.seed = np.random.randint(2 ** 32 - 1, dtype="int64").item()

    set_random_seed(args.seed)

    # Setting num threads to 1 makes things run faster on cpu
    if args.num_threads > 0:
        if args.verbose > 1:
            print(f"Setting torch.num_threads to {args.num_threads}")
        th.set_num_threads(args.num_threads)

    if args.trained_agent != "":
        assert args.trained_agent.endswith(".zip") and os.path.isfile(
            args.trained_agent
        ), "The trained_agent must be a valid path to a .zip file"

    print("=" * 10, env_id, "=" * 10)
    print(f"Seed: {args.seed}")
Example #24
from stable_baselines3.common.vec_env.vec_frame_stack import VecFrameStack
from stable_baselines3.common.callbacks import CheckpointCallback

import common.common as common
import common.wrapper as wrapper
import common.gym_interface as gym_interface
import common.callbacks as callbacks
from common.activation_fn import MyThreshold

if __name__ == "__main__":

    args = common.args
    print(args)

    # SAC.learn needs this. If you use SubprocVecEnv instead of DummyVecEnv, you need to seed in each subprocess.
    set_random_seed(common.seed)

    saved_model_filename = common.build_model_filename(args)

    hyperparams = common.load_hyperparameters(conf_name="PPO")
    print(hyperparams)

    # Make sure every env has the same obs space and action space
    default_wrapper = []
    # if padding zero:
    #   default_wrapper.append(wrapper.WalkerWrapper)

    if args.realign_method != "":
        default_wrapper.append(wrapper.ReAlignedWrapper)

    assert len(args.train_bodies) > 0, "No body to train."
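
The comment above about SubprocVecEnv is worth spelling out: set_random_seed only affects the process that calls it, so each worker should seed itself, as Example #19 does inside _init. A minimal sketch of that pattern (the env id and worker count are illustrative):

import gym
from stable_baselines3.common.utils import set_random_seed
from stable_baselines3.common.vec_env import SubprocVecEnv

def make_seeded_env(env_id, rank, seed=0):
    def _init():
        # Runs inside the subprocess, so every worker seeds its own RNGs
        set_random_seed(seed + rank)
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env
    return _init

if __name__ == "__main__":
    vec_env = SubprocVecEnv([make_seeded_env("CartPole-v1", rank=i) for i in range(4)])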
Example #25
    if env_id not in registered_envs:
        try:
            closest_match = difflib.get_close_matches(env_id,
                                                      registered_envs,
                                                      n=1)[0]
        except IndexError:
            closest_match = "'no close match found...'"
        raise ValueError(
            f"{env_id} not found in gym registry, you maybe meant {closest_match}?"
        )

    # Unique id to ensure there is no race condition for the folder creation
    uuid_str = f"_{uuid.uuid4()}" if args.uuid else ""
    if args.seed < 0:
        # Seed but with a random one
        args.seed = np.random.randint(2**32 - 1, dtype="int64").item()

    set_random_seed(args.seed, True)

    # Setting num threads to 1 makes things run faster on cpu
    if args.num_threads > 0:
        if args.verbose > 1:
            print(f"Setting torch.num_threads to {args.num_threads}")
        th.set_num_threads(args.num_threads)

    if args.trained_agent != "":
        assert args.trained_agent.endswith(".zip") and os.path.isfile(
            args.trained_agent
        ), "The trained_agent must be a valid path to a .zip file"

    print("=" * 10, env_id, "=" * 10)
    print(f"Seed: {args.seed}")
Example #26
def main():  # noqa: C901
    parser = argparse.ArgumentParser()
    parser.add_argument("--env",
                        help="environment ID",
                        type=str,
                        default="CartPole-v1")
    parser.add_argument("-f",
                        "--folder",
                        help="Log folder",
                        type=str,
                        default="rl-trained-agents")
    parser.add_argument("--algo",
                        help="RL Algorithm",
                        default="ppo",
                        type=str,
                        required=False,
                        choices=list(ALGOS.keys()))
    parser.add_argument("-n",
                        "--n-timesteps",
                        help="number of timesteps",
                        default=1000,
                        type=int)
    parser.add_argument(
        "--num-threads",
        help="Number of threads for PyTorch (-1 to use default)",
        default=-1,
        type=int)
    parser.add_argument("--n-envs",
                        help="number of environments",
                        default=1,
                        type=int)
    parser.add_argument(
        "--exp-id",
        help="Experiment ID (default: 0: latest, -1: no exp folder)",
        default=0,
        type=int)
    parser.add_argument("--verbose",
                        help="Verbose mode (0: no output, 1: INFO)",
                        default=1,
                        type=int)
    parser.add_argument(
        "--no-render",
        action="store_true",
        default=False,
        help="Do not render the environment (useful for tests)")
    parser.add_argument("--deterministic",
                        action="store_true",
                        default=False,
                        help="Use deterministic actions")
    parser.add_argument(
        "--load-best",
        action="store_true",
        default=False,
        help="Load best model instead of last model if available")
    parser.add_argument(
        "--load-checkpoint",
        type=int,
        help="Load checkpoint instead of last model if available, "
        "you must pass the number of timesteps corresponding to it",
    )
    parser.add_argument("--stochastic",
                        action="store_true",
                        default=False,
                        help="Use stochastic actions")
    parser.add_argument(
        "--norm-reward",
        action="store_true",
        default=False,
        help="Normalize reward if applicable (trained with VecNormalize)")
    parser.add_argument("--seed",
                        help="Random generator seed",
                        type=int,
                        default=0)
    parser.add_argument("--reward-log",
                        help="Where to log reward",
                        default="",
                        type=str)
    parser.add_argument(
        "--gym-packages",
        type=str,
        nargs="+",
        default=[],
        help=
        "Additional external Gym environment package modules to import (e.g. gym_minigrid)",
    )
    parser.add_argument(
        "--env-kwargs",
        type=str,
        nargs="+",
        action=StoreDict,
        help="Optional keyword argument to pass to the env constructor")
    args = parser.parse_args()

    # Going through custom gym packages to let them register in the global registry
    for env_module in args.gym_packages:
        importlib.import_module(env_module)

    env_id = args.env
    algo = args.algo
    folder = args.folder

    if args.exp_id == 0:
        args.exp_id = get_latest_run_id(os.path.join(folder, algo), env_id)
        print(f"Loading latest experiment, id={args.exp_id}")

    # Sanity checks
    if args.exp_id > 0:
        log_path = os.path.join(folder, algo, f"{env_id}_{args.exp_id}")
    else:
        log_path = os.path.join(folder, algo)

    assert os.path.isdir(log_path), f"The {log_path} folder was not found"

    found = False
    for ext in ["zip"]:
        model_path = os.path.join(log_path, f"{env_id}.{ext}")
        found = os.path.isfile(model_path)
        if found:
            break

    if args.load_best:
        model_path = os.path.join(log_path, "best_model.zip")
        found = os.path.isfile(model_path)

    if args.load_checkpoint is not None:
        model_path = os.path.join(
            log_path, f"rl_model_{args.load_checkpoint}_steps.zip")
        found = os.path.isfile(model_path)

    if not found:
        raise ValueError(
            f"No model found for {algo} on {env_id}, path: {model_path}")

    off_policy_algos = ["qrdqn", "dqn", "ddpg", "sac", "her", "td3", "tqc"]

    if algo in off_policy_algos:
        args.n_envs = 1

    set_random_seed(args.seed)

    if args.num_threads > 0:
        if args.verbose > 1:
            print(f"Setting torch.num_threads to {args.num_threads}")
        th.set_num_threads(args.num_threads)

    is_atari = ExperimentManager.is_atari(env_id)

    stats_path = os.path.join(log_path, env_id)
    hyperparams, stats_path = get_saved_hyperparams(
        stats_path, norm_reward=args.norm_reward, test_mode=True)

    # load env_kwargs if existing
    env_kwargs = {}
    args_path = os.path.join(log_path, env_id, "args.yml")
    if os.path.isfile(args_path):
        with open(args_path, "r") as f:
            loaded_args = yaml.load(f, Loader=yaml.UnsafeLoader)  # pytype: disable=module-attr
            if loaded_args["env_kwargs"] is not None:
                env_kwargs = loaded_args["env_kwargs"]
    # overwrite with command line arguments
    if args.env_kwargs is not None:
        env_kwargs.update(args.env_kwargs)

    log_dir = args.reward_log if args.reward_log != "" else None

    print(env_kwargs)

    env = create_test_env(
        env_id,
        n_envs=args.n_envs,
        stats_path=stats_path,
        seed=args.seed,
        log_dir=log_dir,
        should_render=not args.no_render,
        hyperparams=hyperparams,
        env_kwargs=env_kwargs,
    )

    kwargs = dict(seed=args.seed)
    if algo in off_policy_algos:
        # Dummy buffer size as we don't need memory to enjoy the trained agent
        kwargs.update(dict(buffer_size=1))

    # Check if we are running python 3.8+
    # we need to patch saved model under python 3.6/3.7 to load them
    newer_python_version = sys.version_info.major == 3 and sys.version_info.minor >= 8

    custom_objects = {}
    if newer_python_version:
        custom_objects = {
            "learning_rate": 0.0,
            "lr_schedule": lambda _: 0.0,
            "clip_range": lambda _: 0.0,
        }

    model = ALGOS[algo].load(model_path,
                             env=env,
                             custom_objects=custom_objects,
                             **kwargs)

    obs = env.reset()

    # Deterministic by default except for atari games
    stochastic = args.stochastic or is_atari and not args.deterministic
    deterministic = not stochastic

    state = None
    episode_reward = 0.0
    episode_rewards, episode_lengths = [], []
    ep_len = 0
    # For HER, monitor success rate
    successes = []

    plt.figure(f"Enjoy {env_id}")
    plt.title(f"{env_id}", fontsize=14)

    plt.xlabel(f"Timesteps", fontsize=14)
    # plt.ylabel("Score", fontsize=14)

    observations = []
    rewards = []
    infos = []

    try:
        for _ in range(args.n_timesteps):
            action, state = model.predict(obs,
                                          state=state,
                                          deterministic=deterministic)
            obs, reward, done, info = env.step(action)
            if not args.no_render:
                env.render("human")

            episode_reward += reward[0]
            ep_len += 1

            observations.append(obs)
            rewards.append(reward)
            infos.append(info[0].get("coating"))

            if args.n_envs == 1:
                # For atari the return reward is not the atari score
                # so we have to get it from the infos dict
                if is_atari and infos is not None and args.verbose >= 1:
                    episode_infos = infos[0].get("episode")
                    if episode_infos is not None:
                        print(f"Atari Episode Score: {episode_infos['r']:.2f}")
                        print("Atari Episode Length", episode_infos["l"])

                if done and not is_atari and args.verbose > 0:
                    # NOTE: for env using VecNormalize, the mean reward
                    # is a normalized reward when `--norm_reward` flag is passed
                    print(f"Episode Reward: {episode_reward:.2f}")
                    print("Episode Length", ep_len)
                    episode_rewards.append(episode_reward)
                    episode_lengths.append(ep_len)
                    episode_reward = 0.0
                    ep_len = 0
                    state = None

                # Reset also when the goal is achieved when using HER
                if done and infos[0].get("is_success") is not None:
                    if args.verbose > 1:
                        print("Success?", infos[0].get("is_success", False))

                    if infos[0].get("is_success") is not None:
                        successes.append(infos[0].get("is_success", False))
                        episode_reward, ep_len = 0.0, 0

    except KeyboardInterrupt:
        pass

    if args.verbose > 0 and len(successes) > 0:
        print(f"Success rate: {100 * np.mean(successes):.2f}%")

    if args.verbose > 0 and len(episode_rewards) > 0:
        print(f"{len(episode_rewards)} Episodes")
        print(
            f"Mean reward: {np.mean(episode_rewards):.2f} +/- {np.std(episode_rewards):.2f}"
        )

    if args.verbose > 0 and len(episode_lengths) > 0:
        print(
            f"Mean episode length: {np.mean(episode_lengths):.2f} +/- {np.std(episode_lengths):.2f}"
        )

    env.close()

    gesamt = 0
    gesamt_mit = 0
    for el in rewards:
        if (el > 0):
            gesamt += el
        gesamt_mit += el
    print(f"Gesamt reward: {gesamt}")
    print(f"Gesamt reward mit: {gesamt_mit}")

    plt.plot(np.arange(len(observations)),
             rewards,
             label="reward",
             linewidth=1)
    plt.plot(np.arange(len(observations)),
             [obs[0][3] * 202 + 8 for obs in observations],
             label="coating_dist",
             linewidth=1)
    plt.plot(np.arange(len(observations)),
             [obs[0][1] * 202 + 8 for obs in observations],
             label="coating_targets",
             linewidth=1)
    plt.plot(np.arange(len(observations)),
             infos,
             label="coating_real",
             linewidth=1)
    plt.plot(np.arange(len(observations)),
             [obs[0][4] * 700 for obs in observations],
             label="pressure",
             linewidth=1)
    plt.legend()
    plt.show()
Example #27
def main(args=None):

    # Check if the selected environment is valid
    # If it could not be found, suggest the closest match
    registered_envs = set(gym.envs.registry.env_specs.keys())
    if args.env not in registered_envs:
        try:
            closest_match = difflib.get_close_matches(args.env,
                                                      registered_envs,
                                                      n=1)[0]
        except IndexError:
            closest_match = "'no close match found...'"
        raise ValueError(
            f"{args.env} not found in gym registry, you maybe meant {closest_match}?"
        )

    # If no specific seed is selected, choose a random one
    if args.seed < 0:
        args.seed = np.random.randint(2**32 - 1, dtype="int64").item()

    # Set the random seed across platforms
    set_random_seed(args.seed)

    # Setting num threads to 1 makes things run faster on cpu
    if args.num_threads > 0:
        if args.verbose > 1:
            print(f"Setting torch.num_threads to {args.num_threads}")
        th.set_num_threads(args.num_threads)

    # Verify that pre-trained agent exists before continuing to train it
    if args.trained_agent != "":
        assert args.trained_agent.endswith(".zip") and os.path.isfile(
            args.trained_agent
        ), "The trained_agent must be a valid path to a .zip file"

    # If enabled, ensure that the run has a unique ID
    uuid_str = f"_{uuid.uuid4()}" if args.uuid else ""

    print("=" * 10, args.env, "=" * 10)
    print(f"Seed: {args.seed}")

    exp_manager = ExperimentManager(
        args,
        args.algo,
        args.env,
        args.log_folder,
        args.tensorboard_log,
        args.n_timesteps,
        args.eval_freq,
        args.eval_episodes,
        args.save_freq,
        args.hyperparams,
        args.env_kwargs,
        args.trained_agent,
        args.optimize_hyperparameters,
        args.storage,
        args.study_name,
        args.n_trials,
        args.n_jobs,
        args.sampler,
        args.pruner,
        n_startup_trials=args.n_startup_trials,
        n_evaluations=args.n_evaluations,
        truncate_last_trajectory=args.truncate_last_trajectory,
        uuid_str=uuid_str,
        seed=args.seed,
        log_interval=args.log_interval,
        save_replay_buffer=args.save_replay_buffer,
        preload_replay_buffer=args.preload_replay_buffer,
        verbose=args.verbose,
        vec_env_type=args.vec_env,
    )

    # Prepare experiment and launch hyperparameter optimization if needed
    model = exp_manager.setup_experiment()

    if args.optimize_hyperparameters:
        exp_manager.hyperparameters_optimization()
    else:
        exp_manager.learn(model)
        exp_manager.save_trained_model(model)
Example #28
def main():  # noqa: C901
    parser = argparse.ArgumentParser()
    parser.add_argument("--env",
                        help="environment ID",
                        type=str,
                        default="Walker2DBulletEnv-v0")
    parser.add_argument("--algo",
                        help="RL Algorithm",
                        default="ppo",
                        type=str,
                        required=False,
                        choices=list(ALGOS.keys()))
    parser.add_argument("-n",
                        "--n-timesteps",
                        help="number of timesteps",
                        default=1000,
                        type=int)
    parser.add_argument(
        "--num-threads",
        help="Number of threads for PyTorch (-1 to use default)",
        default=-1,
        type=int)
    parser.add_argument("--n-envs",
                        help="number of environments",
                        default=1,
                        type=int)
    parser.add_argument(
        "--exp-id",
        help="Experiment ID (default: 0: latest, -1: no exp folder)",
        default=0,
        type=int)
    parser.add_argument("--verbose",
                        help="Verbose mode (0: no output, 1: INFO)",
                        default=1,
                        type=int)
    parser.add_argument(
        "--no-render",
        action="store_true",
        default=False,
        help="Do not render the environment (useful for tests)")
    parser.add_argument("--deterministic",
                        action="store_true",
                        default=True,
                        help="Use deterministic actions")
    parser.add_argument(
        "--load-best",
        action="store_true",
        default=False,
        help="Load best model instead of last model if available")

    parser.add_argument("--stochastic",
                        action="store_true",
                        default=False,
                        help="Use stochastic actions (for DDPG/DQN/SAC)")
    parser.add_argument(
        "--norm-reward",
        action="store_true",
        default=False,
        help="Normalize reward if applicable (trained with VecNormalize)")
    parser.add_argument("--seed",
                        help="Random generator seed",
                        type=int,
                        default=0)
    parser.add_argument("--reward-log",
                        help="Where to log reward",
                        default="",
                        type=str)
    parser.add_argument(
        "--gym-packages",
        type=str,
        nargs="+",
        default=[],
        help=
        "Additional external Gym environemnt package modules to import (e.g. gym_minigrid)",
    )
    parser.add_argument(
        "--env-kwargs",
        type=str,
        nargs="+",
        action=StoreDict,
        help="Optional keyword argument to pass to the env constructor")
    # === #
    parser.add_argument("--load-checkpoint",
                        type=str,
                        help="pass the path of zip file corresponding to it")
    parser.add_argument("-f",
                        "--folder",
                        help="Log folder",
                        type=str,
                        default="rl-trained-agents")
    parser.add_argument("--dataset", type=str, default="dataset/walker2d_v6")
    parser.add_argument("--body-id", type=int, default=0)
    args = parser.parse_args()

    dataset_name, env_id, train_files, train_params, train_names, test_files, test_params, test_names = load_dataset.load_dataset(
        args.dataset, seed=0, shuffle=False, train_proportion=1)

    # Going through custom gym packages to let them register in the global registry
    for env_module in args.gym_packages:
        importlib.import_module(env_module)

    # env_id = args.env
    algo = args.algo
    log_path = args.folder

    # if args.exp_id == 0:
    #     args.exp_id = get_latest_run_id(os.path.join(folder, algo), env_id)
    #     print(f"Loading latest experiment, id={args.exp_id}")

    # # Sanity checks
    # if args.exp_id > 0:
    #     log_path = os.path.join(folder, algo, f"{env_id}_{args.exp_id}")
    # else:
    #     log_path = os.path.join(folder, algo)

    # assert os.path.isdir(log_path), f"The {log_path} folder was not found"

    # found = False
    # for ext in ["zip"]:
    #     model_path = os.path.join(log_path, f"{env_id}.{ext}")
    #     found = os.path.isfile(model_path)
    #     if found:
    #         break

    # if args.load_best:
    #     model_path = os.path.join(log_path, "best_model.zip")
    #     found = os.path.isfile(model_path)

    # if args.load_checkpoint is not None:
    #     model_path = os.path.join(log_path, f"rl_model_{args.load_checkpoint}_steps.zip")
    #     found = os.path.isfile(model_path)

    # if not found:
    #     raise ValueError(f"No model found for {algo} on {env_id}, path: {model_path}")

    model_path = args.load_checkpoint

    if algo in ["dqn", "ddpg", "sac", "td3", "tqc"]:
        args.n_envs = 1

    set_random_seed(args.seed)

    if args.num_threads > 0:
        if args.verbose > 1:
            print(f"Setting torch.num_threads to {args.num_threads}")
        th.set_num_threads(args.num_threads)

    is_atari = "NoFrameskip" in env_id

    stats_path = os.path.join(log_path, env_id)
    hyperparams, stats_path = get_saved_hyperparams(
        stats_path, norm_reward=args.norm_reward, test_mode=True)

    # load env_kwargs if existing
    # env_kwargs = {}
    # args_path = os.path.join(log_path, env_id, "args.yml")
    # if os.path.isfile(args_path):
    #     with open(args_path, "r") as f:
    #         loaded_args = yaml.load(f, Loader=yaml.UnsafeLoader)  # pytype: disable=module-attr
    #         if loaded_args["env_kwargs"] is not None:
    #             env_kwargs = loaded_args["env_kwargs"]
    # # overwrite with command line arguments
    # if args.env_kwargs is not None:
    #     env_kwargs.update(args.env_kwargs)

    args.watch_eval = True

    env_kwargs = {
        "xml": train_files[args.body_id],
        "param": train_params[args.body_id],
        "render": args.watch_eval,
    }
    log_dir = args.reward_log if args.reward_log != "" else None

    env = create_test_env(
        env_id,
        n_envs=args.n_envs,
        stats_path=stats_path,
        seed=args.seed,
        log_dir=log_dir,
        should_render=not args.no_render,
        hyperparams=hyperparams,
        env_kwargs=env_kwargs,
    )

    kwargs = dict(seed=args.seed)
    if algo in ["dqn", "ddpg", "sac", "her", "td3", "tqc"]:
        # Dummy buffer size as we don't need memory to enjoy the trained agent
        kwargs.update(dict(buffer_size=1))

    model = ALGOS[algo].load(model_path, env=env, **kwargs)

    obs = env.reset()

    # Force deterministic for DQN, DDPG, SAC and HER (that is a wrapper around)
    deterministic = args.deterministic or algo in [
        "dqn", "ddpg", "sac", "her", "td3", "tqc"
    ] and not args.stochastic

    state = None
    episode_reward = 0.0
    episode_rewards, episode_lengths = [], []
    ep_len = 0
    # For HER, monitor success rate
    successes = []
    for _ in range(args.n_timesteps):
        action, state = model.predict(obs,
                                      state=state,
                                      deterministic=deterministic)
        # Random Agent
        # action = [env.action_space.sample()]
        # Clip Action to avoid out of bound errors
        if isinstance(env.action_space, gym.spaces.Box):
            action = np.clip(action, env.action_space.low,
                             env.action_space.high)
        obs, reward, done, infos = env.step(action)
        sleep(0.01)
        if not args.no_render:
            env.render("human")

        episode_reward += reward[0]
        ep_len += 1

        if args.n_envs == 1:
            # For atari the return reward is not the atari score
            # so we have to get it from the infos dict
            if is_atari and infos is not None and args.verbose >= 1:
                episode_infos = infos[0].get("episode")
                if episode_infos is not None:
                    print(f"Atari Episode Score: {episode_infos['r']:.2f}")
                    print("Atari Episode Length", episode_infos["l"])

            if done and not is_atari and args.verbose > 0:
                # NOTE: for env using VecNormalize, the mean reward
                # is a normalized reward when `--norm_reward` flag is passed
                print(f"Episode Reward: {episode_reward:.2f}")
                print("Episode Length", ep_len)
                episode_rewards.append(episode_reward)
                episode_lengths.append(ep_len)
                episode_reward = 0.0
                ep_len = 0
                state = None

            # Reset also when the goal is achieved when using HER
            if done and infos[0].get("is_success") is not None:
                if args.verbose > 1:
                    print("Success?", infos[0].get("is_success", False))
                # Alternatively, you can add a check to wait for the end of the episode
                if done:
                    obs = env.reset()
                if infos[0].get("is_success") is not None:
                    successes.append(infos[0].get("is_success", False))
                    episode_reward, ep_len = 0.0, 0

    if args.verbose > 0 and len(successes) > 0:
        print("Success rate: {:.2f}%".format(100 * np.mean(successes)))

    if args.verbose > 0 and len(episode_rewards) > 0:
        print("Mean reward: {:.2f} +/- {:.2f}".format(np.mean(episode_rewards),
                                                      np.std(episode_rewards)))

    if args.verbose > 0 and len(episode_lengths) > 0:
        print("Mean episode length: {:.2f} +/- {:.2f}".format(
            np.mean(episode_lengths), np.std(episode_lengths)))

    # Workaround for https://github.com/openai/gym/issues/893
    if not args.no_render:
        if args.n_envs == 1 and "Bullet" not in env_id and not is_atari and isinstance(
                env, VecEnv):
            # DummyVecEnv
            # Unwrap env
            while isinstance(env, VecEnvWrapper):
                env = env.venv
            if isinstance(env, DummyVecEnv):
                env.envs[0].env.close()
            else:
                env.close()
        else:
            # SubprocVecEnv
            env.close()
Example #29
import utils

if __name__ == "__main__":  # noqa: C901
    folder = utils.folder
    os.makedirs(folder, exist_ok=True)

    hyperparams = utils.load_hyperparameters()

    normalize_kwargs = {}
    normalize_kwargs["gamma"] = hyperparams["gamma"]
    
    args = utils.args
    
    # PPO.learn needs this. If you use SubprocVecEnv instead of DummyVecEnv, you need to seed in each subprocess.
    set_random_seed(utils.seed)

    debug = args.debug
    train_on_both_bodies = args.train_on_both_bodies
    with_bodyinfo = args.with_bodyinfo
    train_num_envs = 16 if not debug else 2
    total_timesteps = 5e6 if not debug else 1
    
    
    if train_on_both_bodies:
        training_bodies = args.train_bodies
        print(training_bodies)
        if with_bodyinfo:
            env = DummyVecEnv([utils.make_env(rank=i, seed=utils.seed, render=args.render, robot_body=training_bodies[i%2], body_info=training_bodies[i%2]) for i in range(train_num_envs)])
            save_filename = f"model-ant-{training_bodies[0]}-{training_bodies[1]}-with-bodyinfo"
        else:
Example #30
def main():  # noqa: C901
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--env",
        help="environment ID",
        type=str,
        default="CartPole-v1")
    parser.add_argument(
        "-f",
        "--log-folder",
        help="Log folder",
        type=str,
        default="rl-trained-agents")
    parser.add_argument(
        "--algo",
        help="RL Algorithm",
        default="ppo",
        type=str,
        required=False,
        choices=list(ALGOS.keys()))
    parser.add_argument(
        "-n",
        "--n-eval-steps",
        help="Number of evaluation timesteps",
        default=1000,
        type=int)
    parser.add_argument(
        "--num-threads",
        help="Number of threads for PyTorch (-1 to use default)",
        default=-1,
        type=int)
    parser.add_argument(
        "--n-envs",
        help="number of environments",
        default=1,
        type=int)
    parser.add_argument(
        "--exp-id",
        help="Experiment ID (default: 0: latest, -1: no exp folder)",
        default=0,
        type=int)
    parser.add_argument(
        "--verbose",
        help="Verbose mode (0: no output, 1: INFO)",
        default=1,
        type=int)
    parser.add_argument(
        '--render',
        help="1: Render environment, 0: don't render",
        type=int,
        choices=[0, 1],
        default=0)
    parser.add_argument(
        '--deterministic',
        help="1: Use deterministic actions, 0: Use stochastic actions",
        type=int,
        choices=[0, 1],
        default=0)
    parser.add_argument(
        "--load-best",
        action="store_true",
        default=False,
        help="Load best model instead of last model if available")
    parser.add_argument(
        "--load-checkpoint",
        type=int,
        help="Load checkpoint instead of last model if available, "
        "you must pass the number of timesteps corresponding to it",
    )
    parser.add_argument(
        "--stochastic",
        action="store_true",
        default=False,
        help="Use stochastic actions (for DDPG/DQN/SAC)")
    parser.add_argument(
        "--norm-reward",
        action="store_true",
        default=False,
        help="Normalize reward if applicable (trained with VecNormalize)")
    parser.add_argument(
        "--seed",
        help="Random generator seed",
        type=int,
        default=0)
    parser.add_argument(
        "--reward-log",
        help="Where to log reward",
        default="",
        type=str)
    parser.add_argument(
        "--gym-packages",
        type=str,
        nargs="+",
        default=[],
        help="Additional external Gym environemnt package modules to import (e.g. gym_minigrid)")
    parser.add_argument(
        "--env-kwargs",
        type=str,
        nargs="+",
        action=StoreDict,
        help="Optional keyword argument to pass to the env constructor")
    parser.add_argument(
        '--log-info',
        help="1: Log information at each evaluation steps and save, 0: don't log",
        type=int,
        choices=[0, 1],
        default=0)
    parser.add_argument(
        "--plot-dim",
        help="Plot end effector and goal position in real time (0: Don't plot, 2: 2D (default), 3: 3D)",
        type=int,
        default=0,
        choices=[0, 2, 3])
    args = parser.parse_args()

    #################################

    # Prepare log if needed
    if args.log_info:
        log_df = pd.DataFrame()
        log_dict = OrderedDict()

    # Prepare plot if needed
    if args.plot_dim == 2:
        fig, (ax1, ax2) = plt.subplots(2, 1, sharey=True, figsize=(5, 10))
    elif args.plot_dim == 3:
        fig = plt.figure()
        ax = fig.gca(projection='3d')

    # Going through custom gym packages to let them register 
    # in the global registry
    for env_module in args.gym_packages:
        importlib.import_module(env_module)

    env_id = args.env
    algo = args.algo
    folder = args.log_folder

    if args.exp_id == 0:
        args.exp_id = get_latest_run_id(os.path.join(folder, algo), env_id)
        print(f"Loading latest experiment, id={args.exp_id}")

    # Sanity checks
    if args.exp_id > 0:
        log_path = os.path.join(folder, algo, f"{env_id}_{args.exp_id}")
    else:
        log_path = os.path.join(folder, algo)

    assert os.path.isdir(log_path), f"The {log_path} folder was not found"

    found = False
    for ext in ["zip"]:
        model_path = os.path.join(log_path, f"{env_id}.{ext}")
        found = os.path.isfile(model_path)
        if found:
            break

    if args.load_best:
        model_path = os.path.join(log_path, "best_model.zip")
        found = os.path.isfile(model_path)

    if args.load_checkpoint is not None:
        model_path = os.path.join(
            log_path, f"rl_model_{args.load_checkpoint}_steps.zip")
        found = os.path.isfile(model_path)

    if not found:
        raise ValueError(
            f"No model found for {algo} on {env_id}, path: {model_path}")

    off_policy_algos = ["dqn", "ddpg", "sac", "her", "td3", "tqc"]

    if algo in off_policy_algos:
        args.n_envs = 1

    set_random_seed(args.seed)

    if args.num_threads > 0:
        if args.verbose > 1:
            print(f"Setting torch.num_threads to {args.num_threads}")
        th.set_num_threads(args.num_threads)

    stats_path = os.path.join(log_path, env_id)
    hyperparams, stats_path = get_saved_hyperparams(
        stats_path, norm_reward=args.norm_reward, test_mode=True)

    # load env_kwargs if existing
    env_kwargs = {}
    args_path = os.path.join(log_path, env_id, "args.yml")
    if os.path.isfile(args_path):
        with open(args_path, "r") as f:
            # pytype: disable=module-attr
            loaded_args = yaml.load(f, Loader=yaml.UnsafeLoader)
            if loaded_args["env_kwargs"] is not None:
                env_kwargs = loaded_args["env_kwargs"]
    # overwrite with command line arguments
    if args.env_kwargs is not None:
        env_kwargs.update(args.env_kwargs)

    log_dir = args.reward_log if args.reward_log != "" else None

    env = create_test_env(
        env_id,
        n_envs=args.n_envs,
        stats_path=stats_path,
        seed=args.seed,
        log_dir=log_dir,
        should_render=args.render,
        hyperparams=hyperparams,
        env_kwargs=env_kwargs,
    )

    kwargs = dict(seed=args.seed)
    if algo in off_policy_algos:
        # Dummy buffer size as we don't need memory to enjoy the trained agent
        kwargs.update(dict(buffer_size=1))

    model = ALGOS[algo].load(model_path, env=env, **kwargs)

    obs = env.reset()

    # Force deterministic for DQN, DDPG, SAC and HER (that is a wrapper around)
    deterministic = args.deterministic or algo in off_policy_algos and not args.stochastic

    state = None
    episode_reward = 0.0
    episode_rewards, episode_lengths = [], []
    ep_len = 0
    successes = []  # For HER, monitor success rate

    episode_nb = 0
    success_threshold_50 = 0.05
    success_threshold_20 = 0.02
    success_threshold_10 = 0.01
    success_threshold_5 = 0.005
    success_threshold_2 = 0.002
    success_threshold_1 = 0.001
    success_threshold_05 = 0.0005
    ep_success_list_50 = []
    ep_success_list_20 = []
    ep_success_list_10 = []
    ep_success_list_5 = []
    ep_success_list_2 = []
    ep_success_list_1 = []
    ep_success_list_05 = []
    success_list_50 = []
    success_list_20 = []
    success_list_10 = []
    success_list_5 = []
    success_list_2 = []
    success_list_1 = []
    success_list_05 = []

    # Moved render flag outside the loop (Pierre)
    if args.render:
        env.render("human")

    for t in range(args.n_eval_steps):
        action, state = model.predict(
            obs, state=state, deterministic=deterministic)
        obs, reward, done, infos = env.step(action)

        # Slow down simulation when rendering (Pierre)
        if args.render:
            if "widowx" in env_id:
                time.sleep(1. / 30.)
            else:
                env.render()

        if "widowx" in env_id:
            # Update episode success list
            ep_success_list_50 = calc_ep_success(
                success_threshold_50, ep_success_list_50, infos)
            ep_success_list_20 = calc_ep_success(
                success_threshold_20, ep_success_list_20, infos)
            ep_success_list_10 = calc_ep_success(
                success_threshold_10, ep_success_list_10, infos)
            ep_success_list_5 = calc_ep_success(
                success_threshold_5, ep_success_list_5, infos)
            ep_success_list_2 = calc_ep_success(
                success_threshold_2, ep_success_list_2, infos)
            ep_success_list_1 = calc_ep_success(
                success_threshold_1, ep_success_list_1, infos)
            ep_success_list_05 = calc_ep_success(
                success_threshold_05, ep_success_list_05, infos)

        episode_reward += reward[0]
        ep_len += 1

        # Real time plot
        if args.plot_dim == 2:

            goal = infos[0]['goal_position']
            tip = infos[0]['tip_position']

            ax1.cla()
            ax1.plot(goal[0], goal[2], marker='o', color='g',
                     linestyle='', markersize=10, label="goal", alpha=0.5)
            ax1.plot(tip[0], tip[2], marker='x', color='r',
                     linestyle='', markersize=10, label="end effector", mew=3)

            # Success-threshold circles around the goal (x-z projection)
            circ_1_50 = plt.Circle((goal[0], goal[2]), radius=success_threshold_50,
                                   edgecolor='g', facecolor='w', linestyle='--',
                                   label="50 mm")
            circ_1_20 = plt.Circle((goal[0], goal[2]), radius=success_threshold_20,
                                   edgecolor='b', facecolor='w', linestyle='--',
                                   label="20 mm")
            circ_1_10 = plt.Circle((goal[0], goal[2]), radius=success_threshold_10,
                                   edgecolor='m', facecolor='w', linestyle='--',
                                   label="10 mm")
            circ_1_5 = plt.Circle((goal[0], goal[2]), radius=success_threshold_5,
                                  edgecolor='r', facecolor='w', linestyle='--',
                                  label="5 mm")
            ax1.add_patch(circ_1_50)
            ax1.add_patch(circ_1_20)
            ax1.add_patch(circ_1_10)
            ax1.add_patch(circ_1_5)

            ax1.set_xlim([-0.25, 0.25])
            ax1.set_ylim([0, 0.5])
            ax1.set_xlabel("x (m)", fontsize=15)
            ax1.set_ylabel("z (m)", fontsize=15)

            ax2.cla()
            ax2.plot(goal[1], goal[2], marker='o', color='g',
                     linestyle='', markersize=10, alpha=0.5)
            ax2.plot(tip[1], tip[2], marker='x', color='r',
                     linestyle='', markersize=10, mew=3)

            # Same threshold circles in the y-z projection (no legend labels)
            circ_2_50 = plt.Circle((goal[1], goal[2]), radius=success_threshold_50,
                                   edgecolor='g', facecolor='w', linestyle='--')
            circ_2_20 = plt.Circle((goal[1], goal[2]), radius=success_threshold_20,
                                   edgecolor='b', facecolor='w', linestyle='--')
            circ_2_10 = plt.Circle((goal[1], goal[2]), radius=success_threshold_10,
                                   edgecolor='m', facecolor='w', linestyle='--')
            circ_2_5 = plt.Circle((goal[1], goal[2]), radius=success_threshold_5,
                                  edgecolor='r', facecolor='w', linestyle='--')
            ax2.add_patch(circ_2_50)
            ax2.add_patch(circ_2_20)
            ax2.add_patch(circ_2_10)
            ax2.add_patch(circ_2_5)

            ax2.set_xlim([-0.25, 0.25])
            ax2.set_ylim([0, 0.5])
            ax2.set_xlabel("y (m)", fontsize=15)
            ax2.set_ylabel("z (m)", fontsize=15)

            ax1.legend(loc='upper left', bbox_to_anchor=(0, 1.2),
                       ncol=3, fancybox=True, shadow=True)

            fig.suptitle(f"timestep {ep_len} | distance to target: "
                         f"{infos[0]['new_distance'] * 1000:.1f} mm")
            plt.pause(0.01)
            # plt.show()

        elif args.plot_dim == 3:
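            # 3-D scatter of the goal and end effector (no threshold circles)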

            goal = infos[0]['goal_position']
            tip = infos[0]['tip_position']

            ax.cla()
            ax.plot([goal[0]], [goal[1]], zs=[goal[2]], marker='o',
                    color='g', linestyle='', markersize=10, alpha=0.5)
            ax.plot([tip[0]], [tip[1]], zs=[tip[2]], marker='x',
                    color='r', linestyle='', markersize=10, mew=3)
            ax.set_xlim([-0.2, 0.2])
            ax.set_ylim([-0.2, 0.2])
            ax.set_zlim([0, 0.5])
            ax.set_xlabel("x (m)", fontsize=15)
            ax.set_ylabel("y (m)", fontsize=15)
            ax.set_zlabel("z (m)", fontsize=15)

            fig.suptitle(f"timestep {ep_len} | distance to target: "
                         f"{infos[0]['new_distance'] * 1000:.1f} mm")
            plt.pause(0.01)
            # plt.show()

        if args.log_info:

            log_dict['episode'] = episode_nb
            log_dict['timestep'] = t
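            # Per-joint commanded actions (the arm exposes 6 action dimensions)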
            log_dict['action_1'] = action[0][0]
            log_dict['action_2'] = action[0][1]
            log_dict['action_3'] = action[0][2]
            log_dict['action_4'] = action[0][3]
            log_dict['action_5'] = action[0][4]
            log_dict['action_6'] = action[0][5]
            log_dict['old_joint_pos_1'] = infos[0]['old_joint_pos'][0]
            log_dict['old_joint_pos_2'] = infos[0]['old_joint_pos'][1]
            log_dict['old_joint_pos_3'] = infos[0]['old_joint_pos'][2]
            log_dict['old_joint_pos_4'] = infos[0]['old_joint_pos'][3]
            log_dict['old_joint_pos_5'] = infos[0]['old_joint_pos'][4]
            log_dict['old_joint_pos_6'] = infos[0]['old_joint_pos'][5]
            log_dict['new_joint_pos_1'] = infos[0]['new_joint_pos'][0]
            log_dict['new_joint_pos_2'] = infos[0]['new_joint_pos'][1]
            log_dict['new_joint_pos_3'] = infos[0]['new_joint_pos'][2]
            log_dict['new_joint_pos_4'] = infos[0]['new_joint_pos'][3]
            log_dict['new_joint_pos_5'] = infos[0]['new_joint_pos'][4]
            log_dict['new_joint_pos_6'] = infos[0]['new_joint_pos'][5]
            log_dict['joint_vel_1'] = infos[0]['joint_vel'][0]
            log_dict['joint_vel_2'] = infos[0]['joint_vel'][1]
            log_dict['joint_vel_3'] = infos[0]['joint_vel'][2]
            log_dict['joint_vel_4'] = infos[0]['joint_vel'][3]
            log_dict['joint_vel_5'] = infos[0]['joint_vel'][4]
            log_dict['joint_vel_6'] = infos[0]['joint_vel'][5]
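            # Hard-coded joint limits in radians; joint 6 is presumably the
            # gripper opening in metres, hence the much smaller range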
            log_dict['joint1_min'] = -3.1
            log_dict['joint1_max'] = 3.1
            log_dict['joint2_min'] = -1.571
            log_dict['joint2_max'] = 1.571
            log_dict['joint3_min'] = -1.571
            log_dict['joint3_max'] = 1.571
            log_dict['joint4_min'] = -1.745
            log_dict['joint4_max'] = 1.745
            log_dict['joint5_min'] = -2.617
            log_dict['joint5_max'] = 2.617
            log_dict['joint6_min'] = 0.003
            log_dict['joint6_max'] = 0.03
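            # Action-space bounds reported by the environment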
            log_dict['action_low1'] = env.action_space.low[0]
            log_dict['action_low2'] = env.action_space.low[1]
            log_dict['action_low3'] = env.action_space.low[2]
            log_dict['action_low4'] = env.action_space.low[3]
            log_dict['action_low5'] = env.action_space.low[4]
            log_dict['action_low6'] = env.action_space.low[5]
            log_dict['action_high1'] = env.action_space.high[0]
            log_dict['action_high2'] = env.action_space.high[1]
            log_dict['action_high3'] = env.action_space.high[2]
            log_dict['action_high4'] = env.action_space.high[3]
            log_dict['action_high5'] = env.action_space.high[4]
            log_dict['action_high6'] = env.action_space.high[5]
            log_dict['reward'] = reward[0]
            log_dict['return'] = episode_reward
            log_dict['new_distance'] = infos[0]['new_distance']
            log_dict['old_distance'] = infos[0]['old_distance']
            log_dict['target_x'] = infos[0]['goal_position'][0]
            log_dict['target_y'] = infos[0]['goal_position'][1]
            log_dict['target_z'] = infos[0]['goal_position'][2]
            log_dict['tip_y'] = infos[0]['tip_position'][1]
            log_dict['tip_x'] = infos[0]['tip_position'][0]
            log_dict['tip_z'] = infos[0]['tip_position'][2]
            log_dict['done'] = done[0]
            # log_dict['obs'] = obs
            # log_dict['obs_space_low'] = env.observation_space.low
            # log_dict['obs_space_high'] = env.observation_space.high

            # DataFrame.append was removed in pandas 2.0; append via concat instead
            log_df = pd.concat([log_df, pd.DataFrame([log_dict])],
                               ignore_index=True)

        if args.n_envs == 1:
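            # End-of-episode bookkeeping (per-episode stats, optional CSV dump,
            # counter resets); only meaningful with a single environment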

            if done and args.verbose > 0:
                # NOTE: for env using VecNormalize, the mean reward
                # is a normalized reward when `--norm_reward` flag is passed
                # print(f"Episode Reward: {episode_reward:.2f}") # commented by Pierre
                # print("Episode Length", ep_len)  # commented by Pierre
                episode_rewards.append(episode_reward)
                episode_lengths.append(ep_len)
                episode_nb += 1

                if "widowx" in env_id:
                    # append the last element of the episode success list when
                    # episode is done
                    success_list_50 = calc_success_list(
                        ep_success_list_50, success_list_50)
                    success_list_20 = calc_success_list(
                        ep_success_list_20, success_list_20)
                    success_list_10 = calc_success_list(
                        ep_success_list_10, success_list_10)
                    success_list_5 = calc_success_list(
                        ep_success_list_5, success_list_5)
                    success_list_2 = calc_success_list(
                        ep_success_list_2, success_list_2)
                    success_list_1 = calc_success_list(
                        ep_success_list_1, success_list_1)
                    success_list_05 = calc_success_list(
                        ep_success_list_05, success_list_05)

                    # If the episode ends successfully after starting from an
                    # unsuccessful step, compute the reach time
                    reachtime_list_50 = calc_reach_time(ep_success_list_50)
                    reachtime_list_20 = calc_reach_time(ep_success_list_20)
                    reachtime_list_10 = calc_reach_time(ep_success_list_10)
                    reachtime_list_5 = calc_reach_time(ep_success_list_5)
                    reachtime_list_2 = calc_reach_time(ep_success_list_2)
                    reachtime_list_1 = calc_reach_time(ep_success_list_1)
                    reachtime_list_05 = calc_reach_time(ep_success_list_05)

                if args.log_info:
                    log_df = log_df[log_dict.keys()]  # sort columns

                    # Add the estimated rate of change of the tip-to-target
                    # distance (a proxy for tip velocity) and its derivative;
                    # per the documentation, the simulation runs at 240 Hz
                    # (one timestep = 1/240 s)
                    log_df['est_vel'] = log_df['new_distance'].diff() * 240
                    log_df.loc[0, 'est_vel'] = 0    # initial velocity is 0
                    log_df['est_acc'] = log_df['est_vel'].diff() * 240
                    log_df.loc[0, 'est_acc'] = 0    # initial acceleration is 0

                    log_df.to_csv(f"{log_path}/res_episode_{episode_nb}.csv",
                                  index=False)  # slow
                    # log_df.to_pickle(f"{log_path}/res_episode_{episode_nb}.pkl")  # fast

                # Reset for the new episode
                episode_reward = 0.0
                ep_len = 0
                state = None
                ep_success_list_50 = []
                ep_success_list_20 = []
                ep_success_list_10 = []
                ep_success_list_5 = []
                ep_success_list_2 = []
                ep_success_list_1 = []
                ep_success_list_05 = []

            # When using HER, also reset once the goal is achieved
            # (alternatively, wait for the end of the episode before resetting)
            if done and infos[0].get("is_success") is not None:
                if args.verbose > 1:
                    print("Success?", infos[0].get("is_success", False))
                obs = env.reset()
                successes.append(infos[0].get("is_success", False))
                episode_reward, ep_len = 0.0, 0

    if args.verbose > 0 and len(successes) > 0:
        print(f"Success rate: {100 * np.mean(successes):.2f}%")

    if args.verbose > 0 and len(episode_lengths) > 0:
        print(
            f"Mean episode length: {np.mean(episode_lengths):.2f} +/- {np.std(episode_lengths):.2f}")

    if args.verbose > 0 and len(episode_rewards) > 0:
        print(
            f"Mean reward: {np.mean(episode_rewards):.2f} +/- {np.std(episode_rewards):.2f}")

        if "widowx" in env_id:
            SR_mean_50, RT_mean_50 = calc_mean_successratio_reachtime(
                success_threshold_50, success_list_50, reachtime_list_50)
            SR_mean_20, RT_mean_20 = calc_mean_successratio_reachtime(
                success_threshold_20, success_list_20, reachtime_list_20)
            SR_mean_10, RT_mean_10 = calc_mean_successratio_reachtime(
                success_threshold_10, success_list_10, reachtime_list_10)
            SR_mean_5, RT_mean_5 = calc_mean_successratio_reachtime(
                success_threshold_5, success_list_5, reachtime_list_5)
            SR_mean_2, RT_mean_2 = calc_mean_successratio_reachtime(
                success_threshold_2, success_list_2, reachtime_list_2)
            SR_mean_1, RT_mean_1 = calc_mean_successratio_reachtime(
                success_threshold_1, success_list_1, reachtime_list_1)
            SR_mean_05, RT_mean_05 = calc_mean_successratio_reachtime(
                success_threshold_05, success_list_05, reachtime_list_05)

            # log metrics to stats.csv
            d = {
                "Eval mean reward": np.mean(episode_rewards),
                "Eval std": np.std(episode_rewards),
                "success ratio 50mm": SR_mean_50,
                "Average reach time 50mm": RT_mean_50,
                "success ratio 20mm": SR_mean_20,
                "Average reach time 20mm": RT_mean_20,
                "success ratio 10mm": SR_mean_10,
                "Average reach time 10mm": RT_mean_10,
                "success ratio 5mm": SR_mean_5,
                "Average reach time 5mm": RT_mean_5,
                "success ratio 2mm": SR_mean_2,
                "Average reach time 2mm": RT_mean_2,
                "success ratio 1mm": SR_mean_1,
                "Average reach time 1mm": RT_mean_1,
                "success ratio 0.5mm": SR_mean_05,
                "Average reach time 0.5mm": RT_mean_05
            }

            # print("path:", log_path)
            df = pd.DataFrame(d, index=[0])
            df.to_csv(log_path + "/stats.csv", index=False)

    # Workaround for https://github.com/openai/gym/issues/893
    if args.render:
        if args.n_envs == 1 and "Bullet" not in env_id and isinstance(
                env, VecEnv):
            # DummyVecEnv
            # Unwrap env
            while isinstance(env, VecEnvWrapper):
                env = env.venv
            if isinstance(env, DummyVecEnv):
                env.envs[0].env.close()
            else:
                env.close()
        else:
            # SubprocVecEnv
            env.close()