Example #1
def enjoy(stats_path,
          model_path,
          dataset,
          body_id,
          algo,
          n_timesteps=200,
          test_time=3,
          render=False,
          seed=0):
    dataset_name, env_id, train_files, train_params, train_names, test_files, test_params, test_names = load_dataset.load_dataset(
        dataset, seed=0, shuffle=False, train_proportion=1)
    set_random_seed(seed * 128 + 127)
    hyperparams, stats_path = get_saved_hyperparams(stats_path,
                                                    norm_reward=False,
                                                    test_mode=True)
    env_kwargs = {
        "xml": train_files[body_id],
        "param": train_params[body_id],
        "max_episode_steps": n_timesteps + 1,
        "render": render,
    }
    env = create_test_env(
        env_id,
        n_envs=1,
        stats_path=stats_path,
        seed=seed,
        log_dir="tmp/",
        should_render=False,
        hyperparams=hyperparams,
        env_kwargs=env_kwargs,
    )
    kwargs = dict(seed=seed)
    model = ALGOS[algo].load(model_path, env=env, **kwargs)
    obs = env.reset()
    state = None
    episode_reward = 0.0
    episode_rewards, episode_lengths = [], []
    ep_len = 0

    body_x_record = []
    for _run in range(test_time):
        body_x = 0
        for _step in range(n_timesteps):
            action, state = model.predict(obs, state=state, deterministic=True)
            if isinstance(env.action_space, gym.spaces.Box):
                action = np.clip(action, env.action_space.low,
                                 env.action_space.high)
            body_x = env.envs[0].robot.body_xyz[0]
            obs, reward, done, infos = env.step(action)
            episode_reward += reward[0]
            ep_len += 1
            if render:
                sleep(0.01)
            if done:
                break
        body_x_record.append(body_x)
        obs = env.reset()
    body_x_record = np.array(body_x_record)
    env.close()
    return body_x_record
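A hypothetical driver for the enjoy() helper above, assuming it sits in a module that already imports the zoo utilities it calls (load_dataset, get_saved_hyperparams, create_test_env, ALGOS); the dataset name, paths, and body index are placeholders, not values from the original project.

# Hypothetical usage sketch; every path and name below is a placeholder.
import numpy as np

body_x_record = enjoy(
    stats_path="logs/ppo2/walker_1/stats",        # folder holding saved VecNormalize stats (assumed)
    model_path="logs/ppo2/walker_1/best_model.zip",
    dataset="walker_morphologies",                # dataset name understood by load_dataset (assumed)
    body_id=0,                                    # index into the training morphologies
    algo="ppo2",
    n_timesteps=200,
    test_time=3,
    render=False,
    seed=0,
)
print("Final body x position per test run:", body_x_record)
print("Mean:", np.mean(body_x_record))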
Example #2
def get_scores(args, folder, policy_file, env_name, algo, stats_path, hyperparams, n_evals):
    """
    """
    env = create_test_env(
        env_name,
        n_envs=10,
        stats_path=stats_path,
        seed=args.seed,
        log_dir=folder + "/../Logs",
        should_render=not args.no_render,
        hyperparams=hyperparams,
        env_kwargs={},
    )
    fields = policy_file.split('.')


    # Override the pickled schedules so models saved under an older Python version
    # can still be loaded for evaluation (originally guarded by a Python >= 3.8 check).
    custom_objects = {
        "learning_rate": 0.0,
        "lr_schedule": lambda _: 0.0,
        "exploration_schedule": lambda _: 0.0,
        "clip_range": lambda _: 0.0,
    }

    Space.__setstate__ = patch_setstate
    model = ALGOS[algo].load(folder + "/" + fields[0], custom_objects=custom_objects)
    policy = model.policy
    episode_rewards, _ = evaluate_policy(policy, env, n_eval_episodes=n_evals, return_episode_rewards=True)
    scores = np.array(episode_rewards)
    return scores
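A hypothetical call to get_scores() above, assuming the surrounding module provides the helpers it relies on (create_test_env, ALGOS, patch_setstate); the folder layout, file names, and evaluation count are placeholder assumptions, and args only needs the fields the function reads (seed, no_render).

# Hypothetical usage sketch; paths and file names are placeholders.
import argparse

args = argparse.Namespace(seed=0, no_render=True)
scores = get_scores(
    args,
    folder="logs/ppo/CartPole-v1_1",
    policy_file="best_model.zip",    # split on '.'; the base name is what gets loaded
    env_name="CartPole-v1",
    algo="ppo",
    stats_path="logs/ppo/CartPole-v1_1/CartPole-v1",
    hyperparams={},
    n_evals=10,
)
print(f"Mean score over {len(scores)} episodes: {scores.mean():.2f}")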
Example #3
    def __init__(self,
                 env_id,
                 video_folder,
                 hyperparams,
                 params_path,
                 video_length,
                 deterministic=True,
                 name_prefix='video',
                 interval=10000,
                 env_params={},
                 seed=0):
        """
        :param env_id: environment id
        :param video_folder: (str) Where to save videos
        :param video_length: (int)  Length of recorded videos
        :param name_prefix: (str) Prefix to the video name
        :param interval:
        """
        #ToDo: finish the doc string

        self.best_mean_reward = -np.inf
        self.n_steps = 0
        self.video_folder = video_folder
        self.interval = interval
        self.video_length = video_length
        self.deterministic = deterministic
        self.is_atari = 'NoFrameskip' in env_id
        self.env_id = env_id

        test_path = os.path.join(video_folder, 'video')

        env = create_test_env(env_id,
                              n_envs=1,
                              stats_path=params_path,
                              seed=seed,
                              hyperparams=hyperparams,
                              env_params=env_params)
        env.reset()

        self.env = VecVideoRecorder(env,
                                    test_path,
                                    record_video_trigger=lambda x: x == 0,
                                    video_length=video_length,
                                    name_prefix=name_prefix)
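For reference, a minimal standalone sketch of the recording setup this callback wires up, using stable_baselines3's VecVideoRecorder directly; a random policy stands in for a trained model, and the env id and output folder are placeholders.

# Minimal sketch: wrap a single CartPole env and record one clip of video_length steps.
import gym
from stable_baselines3.common.vec_env import DummyVecEnv, VecVideoRecorder

video_length = 500
env = DummyVecEnv([lambda: gym.make("CartPole-v1")])
env = VecVideoRecorder(
    env,
    "videos/",                                    # output folder (placeholder)
    record_video_trigger=lambda step: step == 0,  # start recording at the first step
    video_length=video_length,
    name_prefix="video",
)
obs = env.reset()
for _ in range(video_length):
    obs, rewards, dones, infos = env.step([env.action_space.sample()])
env.close()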
Example #4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env',
                        help='environment ID',
                        type=str,
                        default='CartPole-v1')
    parser.add_argument('-f',
                        '--folder',
                        help='Log folder',
                        type=str,
                        default='trained_agents')
    parser.add_argument('--algo',
                        help='RL Algorithm',
                        default='ppo2',
                        type=str,
                        required=False,
                        choices=list(ALGOS.keys()))
    parser.add_argument('-n',
                        '--n-timesteps',
                        help='number of timesteps',
                        default=1000,
                        type=int)
    parser.add_argument('--n-envs',
                        help='number of environments',
                        default=1,
                        type=int)
    parser.add_argument(
        '--exp-id',
        help='Experiment ID (default: -1, no exp folder, 0: latest)',
        default=-1,
        type=int)
    parser.add_argument('--verbose',
                        help='Verbose mode (0: no output, 1: INFO)',
                        default=1,
                        type=int)
    parser.add_argument(
        '--no-render',
        action='store_true',
        default=False,
        help='Do not render the environment (useful for tests)')
    parser.add_argument('--deterministic',
                        action='store_true',
                        default=False,
                        help='Use deterministic actions')
    parser.add_argument('--stochastic',
                        action='store_true',
                        default=False,
                        help='Use stochastic actions (for DDPG/DQN/SAC)')
    parser.add_argument(
        '--norm-reward',
        action='store_true',
        default=False,
        help='Normalize reward if applicable (trained with VecNormalize)')
    parser.add_argument('--seed',
                        help='Random generator seed',
                        type=int,
                        default=0)
    parser.add_argument('--reward-log',
                        help='Where to log reward',
                        default='',
                        type=str)
    parser.add_argument(
        '--gym-packages',
        type=str,
        nargs='+',
        default=[],
        help=
        'Additional external Gym environment package modules to import (e.g. gym_minigrid)'
    )
    args = parser.parse_args()

    # Going through custom gym packages to let them register in the global registry
    for env_module in args.gym_packages:
        importlib.import_module(env_module)

    env_id = args.env
    algo = args.algo
    folder = args.folder

    if args.exp_id == 0:
        args.exp_id = get_latest_run_id(os.path.join(folder, algo), env_id)
        print('Loading latest experiment, id={}'.format(args.exp_id))

    # Sanity checks
    if args.exp_id > 0:
        log_path = os.path.join(folder, algo,
                                '{}_{}'.format(env_id, args.exp_id))
    else:
        log_path = os.path.join(folder, algo)

    assert os.path.isdir(log_path), "The {} folder was not found".format(
        log_path)

    model_path = find_saved_model(algo, log_path, env_id)

    if algo in ['dqn', 'ddpg', 'sac', 'td3']:
        args.n_envs = 1

    set_global_seeds(args.seed)

    is_atari = 'NoFrameskip' in env_id

    stats_path = os.path.join(log_path, env_id)
    hyperparams, stats_path = get_saved_hyperparams(
        stats_path, norm_reward=args.norm_reward, test_mode=True)

    log_dir = args.reward_log if args.reward_log != '' else None

    env = create_test_env(env_id,
                          n_envs=args.n_envs,
                          is_atari=is_atari,
                          stats_path=stats_path,
                          seed=args.seed,
                          log_dir=log_dir,
                          should_render=not args.no_render,
                          hyperparams=hyperparams)

    # ACER raises errors because the environment passed must have
    # the same number of environments as the model was trained on.
    load_env = None if algo == 'acer' else env
    model = ALGOS[algo].load(model_path, env=load_env)

    obs = env.reset()

    # Force deterministic actions for DQN, DDPG, SAC and HER (which wraps one of them)
    deterministic = args.deterministic or algo in [
        'dqn', 'ddpg', 'sac', 'her', 'td3'
    ] and not args.stochastic

    episode_reward = 0.0
    episode_rewards = []
    ep_len = 0
    # For HER, monitor success rate
    successes = []
    for _ in range(args.n_timesteps):
        action, _ = model.predict(obs, deterministic=deterministic)
        # Random Agent
        # action = [env.action_space.sample()]
        # Clip Action to avoid out of bound errors
        if isinstance(env.action_space, gym.spaces.Box):
            action = np.clip(action, env.action_space.low,
                             env.action_space.high)
        obs, reward, done, infos = env.step(action)
        if not args.no_render:
            env.render('human')

        episode_reward += reward[0]
        ep_len += 1

        if args.n_envs == 1:
            # For atari the return reward is not the atari score
            # so we have to get it from the infos dict
            if is_atari and infos is not None and args.verbose >= 1:
                episode_infos = infos[0].get('episode')
                if episode_infos is not None:
                    print("Atari Episode Score: {:.2f}".format(
                        episode_infos['r']))
                    print("Atari Episode Length", episode_infos['l'])

            if done and not is_atari and args.verbose > 0:
                # NOTE: for env using VecNormalize, the mean reward
                # is a normalized reward when `--norm_reward` flag is passed
                print("Episode Reward: {:.2f}".format(episode_reward))
                print("Episode Length", ep_len)
                episode_rewards.append(episode_reward)
                episode_reward = 0.0
                ep_len = 0

            # Reset also when the goal is achieved when using HER
            if done or infos[0].get('is_success', False):
                if args.algo == 'her' and args.verbose > 1:
                    print("Success?", infos[0].get('is_success', False))
                # Alternatively, you can add a check to wait for the end of the episode
                # if done:
                obs = env.reset()
                if args.algo == 'her':
                    successes.append(infos[0].get('is_success', False))
                    episode_reward, ep_len = 0.0, 0

    if args.verbose > 0 and len(successes) > 0:
        print("Success rate: {:.2f}%".format(100 * np.mean(successes)))

    if args.verbose > 0 and len(episode_rewards) > 0:
        print("Mean reward: {:.2f}".format(np.mean(episode_rewards)))

    # Workaround for https://github.com/openai/gym/issues/893
    if not args.no_render:
        if args.n_envs == 1 and 'Bullet' not in env_id and not is_atari and isinstance(
                env, VecEnv):
            # DummyVecEnv
            # Unwrap env
            while isinstance(env, VecNormalize) or isinstance(
                    env, VecFrameStack):
                env = env.venv
            env.envs[0].env.close()
        else:
            # SubprocVecEnv
            env.close()
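The scripts in these examples repeat the same predict, clip, step sequence; below is a hedged sketch of that pattern factored into a small helper (a generic refactoring suggestion, not part of the original code).

# Sketch: predict an action, clip it to Box bounds if needed, then step the vec env.
import gym
import numpy as np

def clipped_step(model, env, obs, deterministic=True):
    action, _ = model.predict(obs, deterministic=deterministic)
    if isinstance(env.action_space, gym.spaces.Box):
        action = np.clip(action, env.action_space.low, env.action_space.high)
    return env.step(action)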
Example #5
def main():
    seed = 0
    num_samples = 20
    parser = argparse.ArgumentParser()
    parser.add_argument('--env',
                        help='environment ID',
                        type=str,
                        default='CartPole-v1')
    parser.add_argument('-f',
                        '--folder',
                        help='Log folder',
                        type=str,
                        default='rl-baselines-zoo/trained_agents')
    parser.add_argument('--algo',
                        help='RL Algorithm',
                        default='dqn',
                        type=str,
                        required=False,
                        choices=list(ALGOS.keys()))
    parser.add_argument('-n',
                        '--n-timesteps',
                        help='number of timesteps',
                        default=2000,
                        type=int)
    parser.add_argument('--n-envs',
                        help='number of environments',
                        default=1,
                        type=int)
    parser.add_argument(
        '--exp-id',
        help='Experiment ID (default: -1, no exp folder, 0: latest)',
        default=-1,
        type=int)
    parser.add_argument('--verbose',
                        help='Verbose mode (0: no output, 1: INFO)',
                        default=1,
                        type=int)
    parser.add_argument(
        '--no-render',
        action='store_true',
        default=False,
        help='Do not render the environment (useful for tests)')
    parser.add_argument('--deterministic',
                        action='store_true',
                        default=False,
                        help='Use deterministic actions')
    parser.add_argument('--stochastic',
                        action='store_true',
                        default=False,
                        help='Use stochastic actions (for DDPG/DQN/SAC)')
    parser.add_argument(
        '--load-best',
        action='store_true',
        default=False,
        help='Load best model instead of last model if available')
    parser.add_argument(
        '--norm-reward',
        action='store_true',
        default=False,
        help='Normalize reward if applicable (trained with VecNormalize)')
    args = parser.parse_args()

    env_id = args.env
    algo = args.algo
    folder = args.folder

    if args.exp_id == 0:
        args.exp_id = get_latest_run_id(os.path.join(folder, algo), env_id)
        print('Loading latest experiment, id={}'.format(args.exp_id))

    # Sanity checks
    if args.exp_id > 0:
        log_path = os.path.join(folder, algo,
                                '{}_{}'.format(env_id, args.exp_id))
    else:
        log_path = os.path.join(folder, algo)

    assert os.path.isdir(log_path), "The {} folder was not found".format(
        log_path)

    model_path = find_saved_model(algo,
                                  log_path,
                                  env_id,
                                  load_best=args.load_best)

    if algo in ['dqn', 'ddpg', 'sac', 'td3']:
        args.n_envs = 1

    set_global_seeds(seed)

    is_atari = 'NoFrameskip' in env_id

    stats_path = os.path.join(log_path, env_id)
    hyperparams, stats_path = get_saved_hyperparams(
        stats_path, norm_reward=args.norm_reward, test_mode=True)

    log_dir = None

    env_kwargs = {}

    env = create_test_env(env_id,
                          n_envs=args.n_envs,
                          is_atari=is_atari,
                          stats_path=stats_path,
                          seed=seed,
                          log_dir=log_dir,
                          should_render=not args.no_render,
                          hyperparams=hyperparams,
                          env_kwargs=env_kwargs)

    # ACER raises errors because the environment passed must have
    # the same number of environments as the model was trained on.
    load_env = None if algo == 'acer' else env
    model = ALGOS[algo].load(model_path, env=load_env)

    env = gym.make('CartPole-v1')
    obs = env.reset()

    # Force deterministic actions for DQN, DDPG, SAC and HER (which wraps one of them)
    deterministic = args.deterministic or algo in [
        'dqn', 'ddpg', 'sac', 'her', 'td3'
    ] and not args.stochastic

    episode_reward = 0.0
    episode_rewards, episode_lengths = [], []
    ep_len = 0
    # For HER, monitor success rate
    successes = []
    state = None

    embedder = indicator_feature
    halfspaces = {}

    for i in range(num_samples):
        print("+" * 10)
        #sample random state to start in

    #TODO: maybe reset with random actions? How to make it realistic? Does it matter? Let's just try random for now to test weird edge cases.
        obs = env.reset(uniform=True)  #sample more uniformly than typical
        print("start state", obs)
        # input()
        #obs = env.reset_state(env.observation_space.sample())

        #rollout once for each action and compute feature counts
        start_state = obs.copy()

        fcount_vectors = []
        init_actions = []
        ##rollout code:
        for init_action in range(env.action_space.n):
            print("ACTION", init_action)
            obs = env.reset(start_state=start_state)
            print("init state", obs)
            env.render()
            # input()
            ep_ret = 0
            fcounts = embedder(start_state)
            #do initial action
            obs, r, done, info = env.step(init_action)  # take the chosen initial action

            fcounts += embedder(obs)  #TODO: discount??
            ep_ret += r
            #print(r, obs)
            if done:
                print("final state", obs)
                print("return", ep_ret)
                print("fcounts", fcounts)
                fcount_vectors.append(fcounts)
                init_actions.append(init_action)
                continue

            #run tester policy thereafter
            while True:

                #env.render()

                #TODO: sample within allowable range of angle and position
                action, state = model.predict(obs,
                                              state=state,
                                              deterministic=deterministic)
                # Random Agent
                # action = [env.action_space.sample()]
                # Clip Action to avoid out of bound errors
                if isinstance(env.action_space, gym.spaces.Box):
                    action = np.clip(action, env.action_space.low,
                                     env.action_space.high)
                #a = env.action_space.sample()
                #print(obs, action)
                obs, r, done, info = env.step(action)  # take the tester policy's action

                fcounts += embedder(obs)
                #print(obs)
                #print(done)
                ep_ret += r
                #print(r, obs)
                if done:
                    print("final state", obs)
                    print("return", ep_ret)
                    print("fcounts", fcounts)
                    fcount_vectors.append(fcounts)
                    init_actions.append(init_action)
                    break

        print("action {} over {} => fcount diff = {}".format(
            init_actions[0], init_actions[1],
            fcount_vectors[0] - fcount_vectors[1]))
        # Key on the sampled start state; `state` from model.predict() stays None for MLP policies.
        halfspaces[tuple(start_state), init_actions[0],
                   init_actions[1]] = fcount_vectors[0] - fcount_vectors[1]
        # input()
        #TODO: put this inside one of the value alignment verification classes to get sa_fcount_diffs and hopefully reuse that code
        #then visualize test cases

        # input()
    # for _ in range(args.n_timesteps):
    #     action, state = model.predict(obs, state=state, deterministic=deterministic)
    #     # Random Agent
    #     # action = [env.action_space.sample()]
    #     # Clip Action to avoid out of bound errors
    #     if isinstance(env.action_space, gym.spaces.Box):
    #         action = np.clip(action, env.action_space.low, env.action_space.high)
    #     obs, reward, done, infos = env.step(action)
    #     if not args.no_render:
    #         env.render('human')

    #     episode_reward += reward
    #     ep_len += 1

    #     if args.n_envs == 1:
    #         # For atari the return reward is not the atari score
    #         # so we have to get it from the infos dict
    #         if is_atari and infos is not None and args.verbose >= 1:
    #             episode_infos = infos.get('episode')
    #             if episode_infos is not None:
    #                 print("Atari Episode Score: {:.2f}".format(episode_infos['r']))
    #                 print("Atari Episode Length", episode_infos['l'])

    #         if done and not is_atari and args.verbose > 0:
    #             # NOTE: for env using VecNormalize, the mean reward
    #             # is a normalized reward when `--norm_reward` flag is passed
    #             print("Episode Reward: {:.2f}".format(episode_reward))
    #             print("Episode Length", ep_len)
    #             state = None
    #             episode_rewards.append(episode_reward)
    #             episode_lengths.append(ep_len)
    #             episode_reward = 0.0
    #             ep_len = 0

    #         # Reset also when the goal is achieved when using HER
    #         if done or infos.get('is_success', False):
    #             if args.algo == 'her' and args.verbose > 1:
    #                 print("Success?", infos[0].get('is_success', False))
    #             # Alternatively, you can add a check to wait for the end of the episode
    #             # if done:
    #             obs = env.reset()
    #             if args.algo == 'her':
    #                 successes.append(infos[0].get('is_success', False))
    #                 episode_reward, ep_len = 0.0, 0

    # if args.verbose > 0 and len(successes) > 0:
    #     print("Success rate: {:.2f}%".format(100 * np.mean(successes)))

    # if args.verbose > 0 and len(episode_rewards) > 0:
    #     print("Mean reward: {:.2f} +/- {:.2f}".format(np.mean(episode_rewards), np.std(episode_rewards)))

    # if args.verbose > 0 and len(episode_lengths) > 0:
    #     print("Mean episode length: {:.2f} +/- {:.2f}".format(np.mean(episode_lengths), np.std(episode_lengths)))

    # Workaround for https://github.com/openai/gym/issues/893
    if not args.no_render:
        if args.n_envs == 1 and 'Bullet' not in env_id and not is_atari and isinstance(
                env, VecEnv):
            # DummyVecEnv
            # Unwrap env
            while isinstance(env, VecNormalize) or isinstance(
                    env, VecFrameStack):
                env = env.venv
            env.envs[0].env.close()
        else:
            # SubprocVecEnv
            env.close()
Example #6
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env',
                        help='environment ID',
                        type=str,
                        default='CartPole-v1')
    parser.add_argument('-f',
                        '--folder',
                        help='Log folder',
                        type=str,
                        default='trained_agents')
    parser.add_argument('--algo',
                        help='RL Algorithm',
                        default='ppo2',
                        type=str,
                        required=False,
                        choices=list(ALGOS.keys()))
    parser.add_argument('-n',
                        '--n-timesteps',
                        help='number of timesteps',
                        default=1000,
                        type=int)
    parser.add_argument('--n-envs',
                        help='number of environments',
                        default=1,
                        type=int)
    parser.add_argument('--verbose',
                        help='Verbose mode (0: no output, 1: INFO)',
                        default=1,
                        type=int)
    parser.add_argument(
        '--no-render',
        action='store_true',
        default=False,
        help='Do not render the environment (useful for tests)')
    parser.add_argument('--deterministic',
                        action='store_true',
                        default=False,
                        help='Use deterministic actions')
    parser.add_argument(
        '--norm-reward',
        action='store_true',
        default=False,
        help='Normalize reward if applicable (trained with VecNormalize)')
    parser.add_argument('--seed',
                        help='Random generator seed',
                        type=int,
                        default=0)
    parser.add_argument('--reward-log',
                        help='Where to log reward',
                        default='',
                        type=str)

    args = parser.parse_args()

    env_id = args.env
    algo = args.algo
    folder = args.folder
    model_path = "{}/{}/{}.pkl".format(folder, algo, env_id)

    # Sanity checks
    assert os.path.isdir(folder + '/' +
                         algo), "The {}/{}/ folder was not found".format(
                             folder, algo)
    assert os.path.isfile(
        model_path), "No model found for {} on {}, path: {}".format(
            algo, env_id, model_path)

    if algo in ['dqn', 'ddpg']:
        args.n_envs = 1

    set_global_seeds(args.seed)

    is_atari = 'NoFrameskip' in env_id

    stats_path = "{}/{}/{}/".format(folder, algo, env_id)
    if not os.path.isdir(stats_path):
        stats_path = None

    log_dir = args.reward_log if args.reward_log != '' else None

    env = create_test_env(env_id,
                          n_envs=args.n_envs,
                          is_atari=is_atari,
                          stats_path=stats_path,
                          norm_reward=args.norm_reward,
                          seed=args.seed,
                          log_dir=log_dir,
                          should_render=not args.no_render)

    model = ALGOS[algo].load(model_path)

    obs = env.reset()

    # Force deterministic for DQN and DDPG
    deterministic = args.deterministic or algo in ['dqn', 'ddpg']

    running_reward = 0.0
    ep_len = 0
    for _ in range(args.n_timesteps):
        action, _ = model.predict(obs, deterministic=deterministic)
        # Random Agent
        # action = [env.action_space.sample()]
        # Clip Action to avoid out of bound errors
        if isinstance(env.action_space, gym.spaces.Box):
            action = np.clip(action, env.action_space.low,
                             env.action_space.high)
        obs, reward, done, infos = env.step(action)
        if not args.no_render:
            env.render('human')
        running_reward += reward[0]
        ep_len += 1

        if args.n_envs == 1:
            # For atari the return reward is not the atari score
            # so we have to get it from the infos dict
            if is_atari and infos is not None and args.verbose >= 1:
                episode_infos = infos[0].get('episode')
                if episode_infos is not None:
                    print("Atari Episode Score: {:.2f}".format(
                        episode_infos['r']))
                    print("Atari Episode Length", episode_infos['l'])

            if done and not is_atari and args.verbose >= 1:
                # NOTE: for env using VecNormalize, the mean reward
                # is a normalized reward when `--norm_reward` flag is passed
                print("Episode Reward: {:.2f}".format(running_reward))
                print("Episode Length", ep_len)
                running_reward = 0.0
                ep_len = 0

    # Workaround for https://github.com/openai/gym/issues/893
    if not args.no_render:
        if args.n_envs == 1 and 'Bullet' not in env_id and not is_atari:
            # DummyVecEnv
            # Unwrap env
            while isinstance(env, VecNormalize) or isinstance(
                    env, VecFrameStack):
                env = env.venv
            env.envs[0].env.close()
        else:
            # SubprocVecEnv
            env.close()
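The unwrap-and-close workaround for https://github.com/openai/gym/issues/893 also recurs across these examples; a sketch of it as a helper, assuming the stable-baselines (v2) wrappers these scripts import:

# Sketch: strip VecNormalize/VecFrameStack layers and close the underlying env directly.
from stable_baselines.common.vec_env import VecFrameStack, VecNormalize

def close_dummy_vec_env(env):
    while isinstance(env, (VecNormalize, VecFrameStack)):
        env = env.venv
    env.envs[0].env.close()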
Example #7
        normalize = hyperparams['normalize']
        if isinstance(normalize, str):
            normalize_kwargs = eval(normalize)
            normalize = True
        del hyperparams['normalize']

    # create the test env
    env_hyperparams = {
        'normalize': normalize,
        'n_stack': n_stack,
        'normalize_kwargs': normalize_kwargs
    }
    env = create_test_env(env_id,
                          n_envs=args.n_envs,
                          stats_path=params_path,
                          seed=seed,
                          hyperparams=env_hyperparams,
                          env_params=env_params,
                          should_render=False)

    # load the pretrained agent
    print("Loading pretrained agent")
    model = ALGOS[algo].load(trained_agent_path, env=env, verbose=1)

    # test the pretrained agent
    print("=" * 10, "TESTING", env_id, "=" * 10)
    if len(env_params):
        print("environment parameters")
        pprint(env_params)

    ep_rewards = evaluate_model(model,
Example #8
def main():  # noqa: C901
    parser = argparse.ArgumentParser()
    parser.add_argument("--env",
                        help="environment ID",
                        type=str,
                        default="CartPole-v1")
    parser.add_argument("-f",
                        "--folder",
                        help="Log folder",
                        type=str,
                        default="rl-trained-agents")
    parser.add_argument("--algo",
                        help="RL Algorithm",
                        default="ppo",
                        type=str,
                        required=False,
                        choices=list(ALGOS.keys()))
    parser.add_argument("-n",
                        "--n-timesteps",
                        help="number of timesteps",
                        default=1000,
                        type=int)
    parser.add_argument(
        "--num-threads",
        help="Number of threads for PyTorch (-1 to use default)",
        default=-1,
        type=int)
    parser.add_argument("--n-envs",
                        help="number of environments",
                        default=1,
                        type=int)
    parser.add_argument(
        "--exp-id",
        help="Experiment ID (default: 0: latest, -1: no exp folder)",
        default=0,
        type=int)
    parser.add_argument("--verbose",
                        help="Verbose mode (0: no output, 1: INFO)",
                        default=1,
                        type=int)
    parser.add_argument(
        "--no-render",
        action="store_true",
        default=False,
        help="Do not render the environment (useful for tests)")
    parser.add_argument("--deterministic",
                        action="store_true",
                        default=False,
                        help="Use deterministic actions")
    parser.add_argument(
        "--load-best",
        action="store_true",
        default=False,
        help="Load best model instead of last model if available")
    parser.add_argument(
        "--load-checkpoint",
        type=int,
        help="Load checkpoint instead of last model if available, "
        "you must pass the number of timesteps corresponding to it",
    )
    parser.add_argument("--stochastic",
                        action="store_true",
                        default=False,
                        help="Use stochastic actions")
    parser.add_argument(
        "--norm-reward",
        action="store_true",
        default=False,
        help="Normalize reward if applicable (trained with VecNormalize)")
    parser.add_argument("--seed",
                        help="Random generator seed",
                        type=int,
                        default=0)
    parser.add_argument("--reward-log",
                        help="Where to log reward",
                        default="",
                        type=str)
    parser.add_argument(
        "--gym-packages",
        type=str,
        nargs="+",
        default=[],
        help=
        "Additional external Gym environment package modules to import (e.g. gym_minigrid)",
    )
    parser.add_argument(
        "--env-kwargs",
        type=str,
        nargs="+",
        action=StoreDict,
        help="Optional keyword argument to pass to the env constructor")
    args = parser.parse_args()

    # Going through custom gym packages to let them register in the global registry
    for env_module in args.gym_packages:
        importlib.import_module(env_module)

    env_id = args.env
    algo = args.algo
    folder = args.folder

    if args.exp_id == 0:
        args.exp_id = get_latest_run_id(os.path.join(folder, algo), env_id)
        print(f"Loading latest experiment, id={args.exp_id}")

    # Sanity checks
    if args.exp_id > 0:
        log_path = os.path.join(folder, algo, f"{env_id}_{args.exp_id}")
    else:
        log_path = os.path.join(folder, algo)

    assert os.path.isdir(log_path), f"The {log_path} folder was not found"

    found = False
    for ext in ["zip"]:
        model_path = os.path.join(log_path, f"{env_id}.{ext}")
        found = os.path.isfile(model_path)
        if found:
            break

    if args.load_best:
        model_path = os.path.join(log_path, "best_model.zip")
        found = os.path.isfile(model_path)

    if args.load_checkpoint is not None:
        model_path = os.path.join(
            log_path, f"rl_model_{args.load_checkpoint}_steps.zip")
        found = os.path.isfile(model_path)

    if not found:
        raise ValueError(
            f"No model found for {algo} on {env_id}, path: {model_path}")

    off_policy_algos = ["qrdqn", "dqn", "ddpg", "sac", "her", "td3", "tqc"]

    if algo in off_policy_algos:
        args.n_envs = 1

    set_random_seed(args.seed)

    if args.num_threads > 0:
        if args.verbose > 1:
            print(f"Setting torch.num_threads to {args.num_threads}")
        th.set_num_threads(args.num_threads)

    is_atari = ExperimentManager.is_atari(env_id)

    stats_path = os.path.join(log_path, env_id)
    hyperparams, stats_path = get_saved_hyperparams(
        stats_path, norm_reward=args.norm_reward, test_mode=True)

    # load env_kwargs if existing
    env_kwargs = {}
    args_path = os.path.join(log_path, env_id, "args.yml")
    if os.path.isfile(args_path):
        with open(args_path, "r") as f:
            loaded_args = yaml.load(f, Loader=yaml.UnsafeLoader)  # pytype: disable=module-attr
            if loaded_args["env_kwargs"] is not None:
                env_kwargs = loaded_args["env_kwargs"]
    # overwrite with command line arguments
    if args.env_kwargs is not None:
        env_kwargs.update(args.env_kwargs)

    log_dir = args.reward_log if args.reward_log != "" else None

    print(env_kwargs)

    env = create_test_env(
        env_id,
        n_envs=args.n_envs,
        stats_path=stats_path,
        seed=args.seed,
        log_dir=log_dir,
        should_render=not args.no_render,
        hyperparams=hyperparams,
        env_kwargs=env_kwargs,
    )

    kwargs = dict(seed=args.seed)
    if algo in off_policy_algos:
        # Dummy buffer size as we don't need memory to enjoy the trained agent
        kwargs.update(dict(buffer_size=1))

    # Check if we are running python 3.8+
    # we need to patch saved model under python 3.6/3.7 to load them
    newer_python_version = sys.version_info.major == 3 and sys.version_info.minor >= 8

    custom_objects = {}
    if newer_python_version:
        custom_objects = {
            "learning_rate": 0.0,
            "lr_schedule": lambda _: 0.0,
            "clip_range": lambda _: 0.0,
        }

    model = ALGOS[algo].load(model_path,
                             env=env,
                             custom_objects=custom_objects,
                             **kwargs)

    obs = env.reset()

    # Deterministic by default except for atari games
    stochastic = args.stochastic or is_atari and not args.deterministic
    deterministic = not stochastic

    state = None
    episode_reward = 0.0
    episode_rewards, episode_lengths = [], []
    ep_len = 0
    # For HER, monitor success rate
    successes = []

    plt.figure(f"Enjoy {env_id}")
    plt.title(f"{env_id}", fontsize=14)

    plt.xlabel(f"Timesteps", fontsize=14)
    # plt.ylabel("Score", fontsize=14)

    observations = []
    rewards = []
    coatings = []  # per-step coating values pulled from the step info dict

    try:
        for _ in range(args.n_timesteps):
            action, state = model.predict(obs,
                                          state=state,
                                          deterministic=deterministic)
            obs, reward, done, info = env.step(action)
            if not args.no_render:
                env.render("human")

            episode_reward += reward[0]
            ep_len += 1

            observations.append(obs)
            rewards.append(reward)
            coatings.append(info[0].get("coating"))

            if args.n_envs == 1:
                # For atari the return reward is not the atari score
                # so we have to get it from the infos dict
                if is_atari and info is not None and args.verbose >= 1:
                    episode_infos = info[0].get("episode")
                    if episode_infos is not None:
                        print(f"Atari Episode Score: {episode_infos['r']:.2f}")
                        print("Atari Episode Length", episode_infos["l"])

                if done and not is_atari and args.verbose > 0:
                    # NOTE: for env using VecNormalize, the mean reward
                    # is a normalized reward when `--norm_reward` flag is passed
                    print(f"Episode Reward: {episode_reward:.2f}")
                    print("Episode Length", ep_len)
                    episode_rewards.append(episode_reward)
                    episode_lengths.append(ep_len)
                    episode_reward = 0.0
                    ep_len = 0
                    state = None

                # Reset also when the goal is achieved when using HER
                if done and info[0].get("is_success") is not None:
                    if args.verbose > 1:
                        print("Success?", info[0].get("is_success", False))

                    if info[0].get("is_success") is not None:
                        successes.append(info[0].get("is_success", False))
                        episode_reward, ep_len = 0.0, 0

    except KeyboardInterrupt:
        pass

    if args.verbose > 0 and len(successes) > 0:
        print(f"Success rate: {100 * np.mean(successes):.2f}%")

    if args.verbose > 0 and len(episode_rewards) > 0:
        print(f"{len(episode_rewards)} Episodes")
        print(
            f"Mean reward: {np.mean(episode_rewards):.2f} +/- {np.std(episode_rewards):.2f}"
        )

    if args.verbose > 0 and len(episode_lengths) > 0:
        print(
            f"Mean episode length: {np.mean(episode_lengths):.2f} +/- {np.std(episode_lengths):.2f}"
        )

    env.close()

    # Sum the rewards: 'gesamt' counts only the positive rewards, 'gesamt_mit' counts all of them.
    gesamt = 0
    gesamt_mit = 0
    for el in rewards:
        if el > 0:
            gesamt += el
        gesamt_mit += el
    print(f"Total reward (positive only): {gesamt}")
    print(f"Total reward (including negative): {gesamt_mit}")

    plt.plot(np.arange(len(observations)),
             rewards,
             label="reward",
             linewidth=1)
    plt.plot(np.arange(len(observations)),
             [obs[0][3] * 202 + 8 for obs in observations],
             label="coating_dist",
             linewidth=1)
    plt.plot(np.arange(len(observations)),
             [obs[0][1] * 202 + 8 for obs in observations],
             label="coating_targets",
             linewidth=1)
    plt.plot(np.arange(len(observations)),
             coatings,
             label="coating_real",
             linewidth=1)
    plt.plot(np.arange(len(observations)),
             [obs[0][4] * 700 for obs in observations],
             label="pressure",
             linewidth=1)
    plt.legend()
    plt.show()
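The custom_objects workaround above (also seen in Example #2) can be used on its own when a model saved under Python 3.6/3.7 is loaded from Python 3.8+; a minimal sketch with a placeholder path and stable_baselines3's PPO standing in for ALGOS[algo]:

# Sketch: override pickled schedules so an older zip loads under a newer Python.
import sys
from stable_baselines3 import PPO

custom_objects = {}
if sys.version_info >= (3, 8):
    custom_objects = {
        "learning_rate": 0.0,
        "lr_schedule": lambda _: 0.0,
        "clip_range": lambda _: 0.0,
    }
model = PPO.load("path/to/model.zip", custom_objects=custom_objects)  # placeholder path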
Example #9
def main():  # noqa: C901
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--env",
        help="environment ID",
        type=str,
        default="CartPole-v1")
    parser.add_argument(
        "-f",
        "--log-folder",
        help="Log folder",
        type=str,
        default="rl-trained-agents")
    parser.add_argument(
        "--algo",
        help="RL Algorithm",
        default="ppo",
        type=str,
        required=False,
        choices=list(ALGOS.keys()))
    parser.add_argument(
        "-n",
        "--n-eval-steps",
        help="Number of evaluation timesteps",
        default=1000,
        type=int)
    parser.add_argument(
        "--num-threads",
        help="Number of threads for PyTorch (-1 to use default)",
        default=-1,
        type=int)
    parser.add_argument(
        "--n-envs",
        help="number of environments",
        default=1,
        type=int)
    parser.add_argument(
        "--exp-id",
        help="Experiment ID (default: 0: latest, -1: no exp folder)",
        default=0,
        type=int)
    parser.add_argument(
        "--verbose",
        help="Verbose mode (0: no output, 1: INFO)",
        default=1,
        type=int)
    parser.add_argument(
        '--render',
        help="1: Render environment, 0: don't render",
        type=int,
        choices=[0, 1],
        default=0)
    parser.add_argument(
        '--deterministic',
        help="1: Use deterministic actions, 0: Use stochastic actions",
        type=int,
        choices=[0, 1],
        default=0)
    parser.add_argument(
        "--load-best",
        action="store_true",
        default=False,
        help="Load best model instead of last model if available")
    parser.add_argument(
        "--load-checkpoint",
        type=int,
        help="Load checkpoint instead of last model if available, "
        "you must pass the number of timesteps corresponding to it",
    )
    parser.add_argument(
        "--stochastic",
        action="store_true",
        default=False,
        help="Use stochastic actions (for DDPG/DQN/SAC)")
    parser.add_argument(
        "--norm-reward",
        action="store_true",
        default=False,
        help="Normalize reward if applicable (trained with VecNormalize)")
    parser.add_argument(
        "--seed",
        help="Random generator seed",
        type=int,
        default=0)
    parser.add_argument(
        "--reward-log",
        help="Where to log reward",
        default="",
        type=str)
    parser.add_argument(
        "--gym-packages",
        type=str,
        nargs="+",
        default=[],
        help="Additional external Gym environemnt package modules to import (e.g. gym_minigrid)")
    parser.add_argument(
        "--env-kwargs",
        type=str,
        nargs="+",
        action=StoreDict,
        help="Optional keyword argument to pass to the env constructor")
    parser.add_argument(
        '--log-info',
        help="1: Log information at each evaluation steps and save, 0: don't log",
        type=int,
        choices=[0, 1],
        default=0)
    parser.add_argument(
        "--plot-dim",
        help="Plot end effector and goal position in real time (0: Don't plot, 2: 2D (default), 3: 3D)",
        type=int,
        default=0,
        choices=[0, 2, 3])
    args = parser.parse_args()

    #################################

    # Prepare log if needed
    if args.log_info:
        log_df = pd.DataFrame()
        log_dict = OrderedDict()

    # Prepare plot if needed
    if args.plot_dim == 2:
        fig, (ax1, ax2) = plt.subplots(2, 1, sharey=True, figsize=(5, 10))
    elif args.plot_dim == 3:
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')  # gca(projection=...) was removed in newer Matplotlib

    # Going through custom gym packages to let them register 
    # in the global registry
    for env_module in args.gym_packages:
        importlib.import_module(env_module)

    env_id = args.env
    algo = args.algo
    folder = args.log_folder

    if args.exp_id == 0:
        args.exp_id = get_latest_run_id(os.path.join(folder, algo), env_id)
        print(f"Loading latest experiment, id={args.exp_id}")

    # Sanity checks
    if args.exp_id > 0:
        log_path = os.path.join(folder, algo, f"{env_id}_{args.exp_id}")
    else:
        log_path = os.path.join(folder, algo)

    assert os.path.isdir(log_path), f"The {log_path} folder was not found"

    found = False
    for ext in ["zip"]:
        model_path = os.path.join(log_path, f"{env_id}.{ext}")
        found = os.path.isfile(model_path)
        if found:
            break

    if args.load_best:
        model_path = os.path.join(log_path, "best_model.zip")
        found = os.path.isfile(model_path)

    if args.load_checkpoint is not None:
        model_path = os.path.join(
            log_path, f"rl_model_{args.load_checkpoint}_steps.zip")
        found = os.path.isfile(model_path)

    if not found:
        raise ValueError(
            f"No model found for {algo} on {env_id}, path: {model_path}")

    off_policy_algos = ["dqn", "ddpg", "sac", "her", "td3", "tqc"]

    if algo in off_policy_algos:
        args.n_envs = 1

    set_random_seed(args.seed)

    if args.num_threads > 0:
        if args.verbose > 1:
            print(f"Setting torch.num_threads to {args.num_threads}")
        th.set_num_threads(args.num_threads)

    stats_path = os.path.join(log_path, env_id)
    hyperparams, stats_path = get_saved_hyperparams(
        stats_path, norm_reward=args.norm_reward, test_mode=True)

    # load env_kwargs if existing
    env_kwargs = {}
    args_path = os.path.join(log_path, env_id, "args.yml")
    if os.path.isfile(args_path):
        with open(args_path, "r") as f:
            # pytype: disable=module-attr
            loaded_args = yaml.load(f, Loader=yaml.UnsafeLoader)
            if loaded_args["env_kwargs"] is not None:
                env_kwargs = loaded_args["env_kwargs"]
    # overwrite with command line arguments
    if args.env_kwargs is not None:
        env_kwargs.update(args.env_kwargs)

    log_dir = args.reward_log if args.reward_log != "" else None

    env = create_test_env(
        env_id,
        n_envs=args.n_envs,
        stats_path=stats_path,
        seed=args.seed,
        log_dir=log_dir,
        should_render=args.render,
        hyperparams=hyperparams,
        env_kwargs=env_kwargs,
    )

    kwargs = dict(seed=args.seed)
    if algo in off_policy_algos:
        # Dummy buffer size as we don't need memory to enjoy the trained agent
        kwargs.update(dict(buffer_size=1))

    model = ALGOS[algo].load(model_path, env=env, **kwargs)

    obs = env.reset()

    # Force deterministic actions for DQN, DDPG, SAC and HER (which wraps one of them)
    deterministic = args.deterministic or algo in off_policy_algos and not args.stochastic

    state = None
    episode_reward = 0.0
    episode_rewards, episode_lengths = [], []
    ep_len = 0
    successes = []  # For HER, monitor success rate

    episode_nb = 0
    success_threshold_50 = 0.05
    success_threshold_20 = 0.02
    success_threshold_10 = 0.01
    success_threshold_5 = 0.005
    success_threshold_2 = 0.002
    success_threshold_1 = 0.001
    success_threshold_05 = 0.0005
    ep_success_list_50 = []
    ep_success_list_20 = []
    ep_success_list_10 = []
    ep_success_list_5 = []
    ep_success_list_2 = []
    ep_success_list_1 = []
    ep_success_list_05 = []
    success_list_50 = []
    success_list_20 = []
    success_list_10 = []
    success_list_5 = []
    success_list_2 = []
    success_list_1 = []
    success_list_05 = []

    # Moved render flag outside the loop (Pierre)
    if args.render:
        env.render("human")

    for t in range(args.n_eval_steps):
        action, state = model.predict(
            obs, state=state, deterministic=deterministic)
        obs, reward, done, infos = env.step(action)

        # Slow down simulation when rendering (Pierre)
        if args.render:
            if "widowx" in env_id:
                time.sleep(1. / 30.)
            else:
                env.render()

        if "widowx" in env_id:
            # Update episode success list
            ep_success_list_50 = calc_ep_success(
                success_threshold_50, ep_success_list_50, infos)
            ep_success_list_20 = calc_ep_success(
                success_threshold_20, ep_success_list_20, infos)
            ep_success_list_10 = calc_ep_success(
                success_threshold_10, ep_success_list_10, infos)
            ep_success_list_5 = calc_ep_success(
                success_threshold_5, ep_success_list_5, infos)
            ep_success_list_2 = calc_ep_success(
                success_threshold_2, ep_success_list_2, infos)
            ep_success_list_1 = calc_ep_success(
                success_threshold_1, ep_success_list_1, infos)
            ep_success_list_05 = calc_ep_success(
                success_threshold_05, ep_success_list_05, infos)

        episode_reward += reward[0]
        ep_len += 1

        # Real time plot
        if args.plot_dim == 2:

            goal = infos[0]['goal_position']
            tip = infos[0]['tip_position']

            ax1.cla()
            ax1.plot(goal[0], goal[2], marker='o', color='g',
                     linestyle='', markersize=10, label="goal", alpha=0.5)
            ax1.plot(tip[0], tip[2], marker='x', color='r',
                     linestyle='', markersize=10, label="end effector", mew=3)

            circ_1_50 = plt.Circle(
                (goal[0],
                 goal[2]),
                radius=success_threshold_50,
                edgecolor='g',
                facecolor='w',
                linestyle='--',
                label="50 mm")
            circ_1_20 = plt.Circle(
                (goal[0],
                 goal[2]),
                radius=success_threshold_20,
                edgecolor='b',
                facecolor='w',
                linestyle='--',
                label="20 mm")
            circ_1_10 = plt.Circle(
                (goal[0],
                 goal[2]),
                radius=success_threshold_10,
                edgecolor='m',
                facecolor='w',
                linestyle='--',
                label="10 mm")
            circ_1_5 = plt.Circle(
                (goal[0],
                 goal[2]),
                radius=success_threshold_5,
                edgecolor='r',
                facecolor='w',
                linestyle='--',
                label="5 mm")
            ax1.add_patch(circ_1_50)
            ax1.add_patch(circ_1_20)
            ax1.add_patch(circ_1_10)
            ax1.add_patch(circ_1_5)

            ax1.set_xlim([-0.25, 0.25])
            ax1.set_ylim([0, 0.5])
            ax1.set_xlabel("x (m)", fontsize=15)
            ax1.set_ylabel("z (m)", fontsize=15)

            ax2.cla()
            ax2.plot(goal[1], goal[2], marker='o', color='g',
                     linestyle='', markersize=10, alpha=0.5)
            ax2.plot(
                tip[1],
                tip[2],
                marker='x',
                color='r',
                linestyle='',
                markersize=10,
                mew=3)

            circ_2_50 = plt.Circle(
                (goal[1],
                 goal[2]),
                radius=success_threshold_50,
                edgecolor='g',
                facecolor='w',
                linestyle='--')
            circ_2_20 = plt.Circle(
                (goal[1],
                 goal[2]),
                radius=success_threshold_20,
                edgecolor='b',
                facecolor='w',
                linestyle='--')
            circ_2_10 = plt.Circle(
                (goal[1],
                 goal[2]),
                radius=success_threshold_10,
                edgecolor='m',
                facecolor='w',
                linestyle='--')
            circ_2_5 = plt.Circle(
                (goal[1],
                 goal[2]),
                radius=success_threshold_5,
                edgecolor='r',
                facecolor='w',
                linestyle='--')
            ax2.add_patch(circ_2_50)
            ax2.add_patch(circ_2_20)
            ax2.add_patch(circ_2_10)
            ax2.add_patch(circ_2_5)

            ax2.set_xlim([-0.25, 0.25])
            ax2.set_ylim([0, 0.5])
            ax2.set_xlabel("y (m)", fontsize=15)
            ax2.set_ylabel("z (m)", fontsize=15)

            ax1.legend(loc='upper left', bbox_to_anchor=(
                0, 1.2), ncol=3, fancybox=True, shadow=True)

            fig.suptitle("timestep " + str(ep_len) + " | distance to target: " +
                         str(round(infos[0]['new_distance'] * 1000, 1)) + " mm")
            plt.pause(0.01)
            # plt.show()

        elif args.plot_dim == 3:

            goal = infos[0]['goal_position']
            tip = infos[0]['tip_position']

            ax.cla()
            ax.plot([goal[0]], [goal[1]], zs=[goal[2]], marker='o',
                    color='g', linestyle='', markersize=10, alpha=0.5)
            ax.plot([tip[0]], [tip[1]], zs=[tip[2]], marker='x',
                    color='r', linestyle='', markersize=10, mew=3)
            ax.set_xlim([-0.2, 0.2])
            ax.set_ylim([-0.2, 0.2])
            ax.set_zlim([0, 0.5])
            ax.set_xlabel("x (m)", fontsize=15)
            ax.set_ylabel("y (m)", fontsize=15)
            ax.set_zlabel("z (m)", fontsize=15)

            fig.suptitle("timestep " + str(ep_len) + " | distance to target: " +
                         str(round(infos[0]['new_distance'] * 1000, 1)) + " mm")
            plt.pause(0.01)
            # plt.show()

        if args.log_info:

            log_dict['episode'] = episode_nb
            log_dict['timestep'] = t
            log_dict['action_1'] = action[0][0]
            log_dict['action_2'] = action[0][1]
            log_dict['action_3'] = action[0][2]
            log_dict['action_4'] = action[0][3]
            log_dict['action_5'] = action[0][4]
            log_dict['action_6'] = action[0][5]
            log_dict['old_joint_pos_1'] = infos[0]['old_joint_pos'][0]
            log_dict['old_joint_pos_2'] = infos[0]['old_joint_pos'][1]
            log_dict['old_joint_pos_3'] = infos[0]['old_joint_pos'][2]
            log_dict['old_joint_pos_4'] = infos[0]['old_joint_pos'][3]
            log_dict['old_joint_pos_5'] = infos[0]['old_joint_pos'][4]
            log_dict['old_joint_pos_6'] = infos[0]['old_joint_pos'][5]
            log_dict['new_joint_pos_1'] = infos[0]['new_joint_pos'][0]
            log_dict['new_joint_pos_2'] = infos[0]['new_joint_pos'][1]
            log_dict['new_joint_pos_3'] = infos[0]['new_joint_pos'][2]
            log_dict['new_joint_pos_4'] = infos[0]['new_joint_pos'][3]
            log_dict['new_joint_pos_5'] = infos[0]['new_joint_pos'][4]
            log_dict['new_joint_pos_6'] = infos[0]['new_joint_pos'][5]
            log_dict['joint_vel_1'] = infos[0]['joint_vel'][0]
            log_dict['joint_vel_2'] = infos[0]['joint_vel'][1]
            log_dict['joint_vel_3'] = infos[0]['joint_vel'][2]
            log_dict['joint_vel_4'] = infos[0]['joint_vel'][3]
            log_dict['joint_vel_5'] = infos[0]['joint_vel'][4]
            log_dict['joint_vel_6'] = infos[0]['joint_vel'][5]
            log_dict['joint1_min'] = -3.1
            log_dict['joint1_max'] = 3.1
            log_dict['joint2_min'] = -1.571
            log_dict['joint2_max'] = 1.571
            log_dict['joint3_min'] = -1.571
            log_dict['joint3_max'] = 1.571
            log_dict['joint4_min'] = -1.745
            log_dict['joint4_max'] = 1.745
            log_dict['joint5_min'] = -2.617
            log_dict['joint5_max'] = 2.617
            log_dict['joint6_min'] = 0.003
            log_dict['joint6_max'] = 0.03
            log_dict['action_low1'] = env.action_space.low[0]
            log_dict['action_low2'] = env.action_space.low[1]
            log_dict['action_low3'] = env.action_space.low[2]
            log_dict['action_low4'] = env.action_space.low[3]
            log_dict['action_low5'] = env.action_space.low[4]
            log_dict['action_low6'] = env.action_space.low[5]
            log_dict['action_high1'] = env.action_space.high[0]
            log_dict['action_high2'] = env.action_space.high[1]
            log_dict['action_high3'] = env.action_space.high[2]
            log_dict['action_high4'] = env.action_space.high[3]
            log_dict['action_high5'] = env.action_space.high[4]
            log_dict['action_high6'] = env.action_space.high[5]
            log_dict['reward'] = reward[0]
            log_dict['return'] = episode_reward
            log_dict['new_distance'] = infos[0]['new_distance']
            log_dict['old_distance'] = infos[0]['old_distance']
            log_dict['target_x'] = infos[0]['goal_position'][0]
            log_dict['target_y'] = infos[0]['goal_position'][1]
            log_dict['target_z'] = infos[0]['goal_position'][2]
            log_dict['tip_y'] = infos[0]['tip_position'][1]
            log_dict['tip_x'] = infos[0]['tip_position'][0]
            log_dict['tip_z'] = infos[0]['tip_position'][2]
            log_dict['done'] = done[0]
            # log_dict['obs'] = obs
            # log_dict['obs_space_low'] = env.observation_space.low
            # log_dict['obs_space_high'] = env.observation_space.high

            log_df = pd.concat([log_df, pd.DataFrame([log_dict])],
                               ignore_index=True)  # DataFrame.append was removed in pandas 2.0

        if args.n_envs == 1:

            if done and args.verbose > 0:
                # NOTE: for env using VecNormalize, the mean reward
                # is a normalized reward when `--norm_reward` flag is passed
                # print(f"Episode Reward: {episode_reward:.2f}") # commented by Pierre
                # print("Episode Length", ep_len)  # commented by Pierre
                episode_rewards.append(episode_reward)
                episode_lengths.append(ep_len)
                episode_nb += 1

                if "widowx" in env_id:
                    # append the last element of the episode success list when
                    # episode is done
                    success_list_50 = calc_success_list(
                        ep_success_list_50, success_list_50)
                    success_list_20 = calc_success_list(
                        ep_success_list_20, success_list_20)
                    success_list_10 = calc_success_list(
                        ep_success_list_10, success_list_10)
                    success_list_5 = calc_success_list(
                        ep_success_list_5, success_list_5)
                    success_list_2 = calc_success_list(
                        ep_success_list_2, success_list_2)
                    success_list_1 = calc_success_list(
                        ep_success_list_1, success_list_1)
                    success_list_05 = calc_success_list(
                        ep_success_list_05, success_list_05)

                    # If the episode is successful and it starts from an
                    # unsuccessful step, calculate reach time
                    reachtime_list_50 = calc_reach_time(ep_success_list_50)
                    reachtime_list_20 = calc_reach_time(ep_success_list_20)
                    reachtime_list_10 = calc_reach_time(ep_success_list_10)
                    reachtime_list_5 = calc_reach_time(ep_success_list_5)
                    reachtime_list_2 = calc_reach_time(ep_success_list_2)
                    reachtime_list_1 = calc_reach_time(ep_success_list_1)
                    reachtime_list_05 = calc_reach_time(ep_success_list_05)

                if args.log_info:
                    log_df = log_df[log_dict.keys()]  # sort columns

                    # add estimated tip velocity and acceleration (per the
                    # documentation, the simulation runs at 240 Hz, so one
                    # timestep corresponds to 1/240 s)
                    log_df['est_vel'] = log_df['new_distance'].diff() * 240
                    log_df.loc[0, 'est_vel'] = 0    # initial velocity is 0
                    log_df['est_acc'] = log_df['est_vel'].diff() * 240
                    log_df.loc[0, 'est_acc'] = 0    # initial acceleration is 0

                    log_df.to_csv(
                        log_path +
                        "/res_episode_" +
                        str(episode_nb) +
                        ".csv",
                        index=False)  # slow
                    # log_df.to_pickle(log_path + "/res_episode_" + str(episode_nb) + ".pkl")  # faster

                # Reset for the new episode
                episode_reward = 0.0
                ep_len = 0
                state = None
                ep_success_list_50 = []
                ep_success_list_20 = []
                ep_success_list_10 = []
                ep_success_list_5 = []
                ep_success_list_2 = []
                ep_success_list_1 = []
                ep_success_list_05 = []

            # Reset also when the goal is achieved when using HER
            if done and infos[0].get("is_success") is not None:
                if args.verbose > 1:
                    print("Success?", infos[0].get("is_success", False))
                # Alternatively, you can add a check to wait for the end of the
                # episode
                if done:
                    obs = env.reset()
                if infos[0].get("is_success") is not None:
                    successes.append(infos[0].get("is_success", False))
                    episode_reward, ep_len = 0.0, 0

    if args.verbose > 0 and len(successes) > 0:
        print(f"Success rate: {100 * np.mean(successes):.2f}%")

    if args.verbose > 0 and len(episode_lengths) > 0:
        print(
            f"Mean episode length: {np.mean(episode_lengths):.2f} +/- {np.std(episode_lengths):.2f}")

    if args.verbose > 0 and len(episode_rewards) > 0:
        print(
            f"Mean reward: {np.mean(episode_rewards):.2f} +/- {np.std(episode_rewards):.2f}")

        if "widowx" in env_id:
            SR_mean_50, RT_mean_50 = calc_mean_successratio_reachtime(
                success_threshold_50, success_list_50, reachtime_list_50)
            SR_mean_20, RT_mean_20 = calc_mean_successratio_reachtime(
                success_threshold_20, success_list_20, reachtime_list_20)
            SR_mean_10, RT_mean_10 = calc_mean_successratio_reachtime(
                success_threshold_10, success_list_10, reachtime_list_10)
            SR_mean_5, RT_mean_5 = calc_mean_successratio_reachtime(
                success_threshold_5, success_list_5, reachtime_list_5)
            SR_mean_2, RT_mean_2 = calc_mean_successratio_reachtime(
                success_threshold_2, success_list_2, reachtime_list_2)
            SR_mean_1, RT_mean_1 = calc_mean_successratio_reachtime(
                success_threshold_1, success_list_1, reachtime_list_1)
            SR_mean_05, RT_mean_05 = calc_mean_successratio_reachtime(
                success_threshold_05, success_list_05, reachtime_list_05)

            # log metrics to stats.csv
            d = {
                "Eval mean reward": np.mean(episode_rewards),
                "Eval std": np.std(episode_rewards),
                "success ratio 50mm": SR_mean_50,
                "Average reach time 50mm": RT_mean_50,
                "success ratio 20mm": SR_mean_20,
                "Average reach time 20mm": RT_mean_20,
                "success ratio 10mm": SR_mean_10,
                "Average reach time 10mm": RT_mean_10,
                "success ratio 5mm": SR_mean_5,
                "Average reach time 5mm": RT_mean_5,
                "success ratio 2mm": SR_mean_2,
                "Average reach time 2mm": RT_mean_2,
                "success ratio 1mm": SR_mean_1,
                "Average reach time 1mm": RT_mean_1,
                "success ratio 0.5mm": SR_mean_05,
                "Average reach time 0.5mm": RT_mean_05
            }

            # print("path:", log_path)
            df = pd.DataFrame(d, index=[0])
            df.to_csv(log_path + "/stats.csv", index=False)

    # Workaround for https://github.com/openai/gym/issues/893
    if args.render:
        if args.n_envs == 1 and "Bullet" not in env_id and isinstance(
                env, VecEnv):
            # DummyVecEnv
            # Unwrap env
            while isinstance(env, VecEnvWrapper):
                env = env.venv
            if isinstance(env, DummyVecEnv):
                env.envs[0].env.close()
            else:
                env.close()
        else:
            # SubprocVecEnv
            env.close()
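The metric helpers called above (calc_success_list, calc_reach_time, calc_mean_successratio_reachtime) are not shown in this excerpt. Below is a minimal sketch that is consistent with the inline logic used in the later example (Beispiel #12); the exact signatures and how the reach-time list is accumulated in the real repository are assumptions.

import numpy as np

def calc_success_list(ep_success_list, success_list):
    # Append the final per-step success flag of the finished episode
    # (assumption, mirroring "append the last element of the episode success list").
    if len(ep_success_list) > 0:
        success_list.append(ep_success_list[-1])
    return success_list

def calc_reach_time(ep_success_list, reachtime_list=None):
    # If the episode ends successful but did not start successful, record the index
    # of the first successful step (assumption, mirroring the inline reach-time logic
    # of Beispiel #12). Cross-episode accumulation is not visible in this excerpt.
    if reachtime_list is None:
        reachtime_list = []
    if ep_success_list and ep_success_list[-1] and not ep_success_list[0]:
        reachtime_list.append(ep_success_list.index(1))
    return reachtime_list

def calc_mean_successratio_reachtime(threshold, success_list, reachtime_list):
    # Average the per-episode success flags and reach times and report them,
    # matching the "success threshold | success ratio | Average reach time" print format.
    sr_mean = float(np.mean(success_list)) if len(success_list) > 0 else 0.0
    rt_mean = float(np.mean(reachtime_list)) if len(reachtime_list) > 0 else float("nan")
    print("success threshold: {} | success ratio: {:.2f} | Average reach time: {:.2f}".format(
        threshold, sr_mean, rt_mean))
    return sr_mean, rt_mean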
Beispiel #10
0
def main():  # noqa: C901
    parser = argparse.ArgumentParser()
    parser.add_argument("--env",
                        help="environment ID",
                        type=str,
                        default="CartPole-v1")
    parser.add_argument("-f",
                        "--folder",
                        help="Log folder",
                        type=str,
                        default="rl-trained-agents")
    parser.add_argument("--algo",
                        help="RL Algorithm",
                        default="ppo",
                        type=str,
                        required=False,
                        choices=list(ALGOS.keys()))
    parser.add_argument("-n",
                        "--n-timesteps",
                        help="number of timesteps",
                        default=300,
                        type=int)
    parser.add_argument(
        "--num-threads",
        help="Number of threads for PyTorch (-1 to use default)",
        default=-1,
        type=int)
    parser.add_argument("--n-envs",
                        help="number of environments",
                        default=1,
                        type=int)
    parser.add_argument(
        "--exp-id",
        help="Experiment ID (default: 0: latest, -1: no exp folder)",
        default=0,
        type=int)
    parser.add_argument("--verbose",
                        help="Verbose mode (0: no output, 1: INFO)",
                        default=1,
                        type=int)
    parser.add_argument(
        "--no-render",
        action="store_true",
        default=False,
        help="Do not render the environment (useful for tests)")
    parser.add_argument("--deterministic",
                        action="store_true",
                        default=False,
                        help="Use deterministic actions")
    parser.add_argument(
        "--load-best",
        action="store_true",
        default=False,
        help="Load best model instead of last model if available")
    parser.add_argument(
        "--load-checkpoint",
        type=int,
        help="Load checkpoint instead of last model if available, "
        "you must pass the number of timesteps corresponding to it",
    )
    parser.add_argument("--stochastic",
                        action="store_true",
                        default=False,
                        help="Use stochastic actions")
    parser.add_argument(
        "--norm-reward",
        action="store_true",
        default=False,
        help="Normalize reward if applicable (trained with VecNormalize)")
    parser.add_argument("--seed",
                        help="Random generator seed",
                        type=int,
                        default=0)
    parser.add_argument("--reward-log",
                        help="Where to log reward",
                        default="",
                        type=str)
    parser.add_argument(
        "--gym-packages",
        type=str,
        nargs="+",
        default=[],
        help=
        "Additional external Gym environemnt package modules to import (e.g. gym_minigrid)",
    )
    parser.add_argument(
        "--env-kwargs",
        type=str,
        nargs="+",
        action=StoreDict,
        help="Optional keyword argument to pass to the env constructor")
    args = parser.parse_args()

    # Going through custom gym packages to let them register in the global registry
    for env_module in args.gym_packages:
        importlib.import_module(env_module)

    env_id = args.env
    algo = args.algo
    folder = args.folder

    if args.exp_id == 0:
        args.exp_id = get_latest_run_id(os.path.join(folder, algo), env_id)
        print(f"Loading latest experiment, id={args.exp_id}")

    # Sanity checks
    if args.exp_id > 0:
        log_path = os.path.join(folder, algo, f"{env_id}_{args.exp_id}")
    else:
        log_path = os.path.join(folder, algo)

    assert os.path.isdir(log_path), f"The {log_path} folder was not found"

    found = False
    for ext in ["zip"]:
        model_path = os.path.join(log_path, f"{env_id}.{ext}")
        found = os.path.isfile(model_path)
        if found:
            break

    if args.load_best:
        model_path = os.path.join(log_path, "best_model.zip")
        found = os.path.isfile(model_path)

    if args.load_checkpoint is not None:
        model_path = os.path.join(
            log_path, f"rl_model_{args.load_checkpoint}_steps.zip")
        found = os.path.isfile(model_path)

    if not found:
        raise ValueError(
            f"No model found for {algo} on {env_id}, path: {model_path}")

    off_policy_algos = ["qrdqn", "dqn", "ddpg", "sac", "her", "td3", "tqc"]

    if algo in off_policy_algos:
        args.n_envs = 1

    set_random_seed(args.seed)

    if args.num_threads > 0:
        if args.verbose > 1:
            print(f"Setting torch.num_threads to {args.num_threads}")
        th.set_num_threads(args.num_threads)

    is_atari = ExperimentManager.is_atari(env_id)

    stats_path = os.path.join(log_path, env_id)
    hyperparams, stats_path = get_saved_hyperparams(
        stats_path, norm_reward=args.norm_reward, test_mode=True)

    # load env_kwargs if existing
    env_kwargs = {}
    args_path = os.path.join(log_path, env_id, "args.yml")
    if os.path.isfile(args_path):
        with open(args_path, "r") as f:
            loaded_args = yaml.load(f, Loader=yaml.UnsafeLoader)  # pytype: disable=module-attr
            if loaded_args["env_kwargs"] is not None:
                env_kwargs = loaded_args["env_kwargs"]
    # overwrite with command line arguments
    if args.env_kwargs is not None:
        env_kwargs.update(args.env_kwargs)

    log_dir = args.reward_log if args.reward_log != "" else None

    env = create_test_env(
        env_id,
        n_envs=args.n_envs,
        stats_path=stats_path,
        seed=args.seed,
        log_dir=log_dir,
        should_render=not args.no_render,
        hyperparams=hyperparams,
        env_kwargs=env_kwargs,
    )

    kwargs = dict(seed=args.seed)
    if algo in off_policy_algos:
        # Dummy buffer size as we don't need memory to enjoy the trained agent
        kwargs.update(dict(buffer_size=1))

    model = ALGOS[algo].load(model_path, env=env, **kwargs)

    obs = env.reset()

    # The following 3 variables are used to create GIFs (the frames stacked into the GIF are those acquired by the vision sensor)
    images = []
    obs = env.reset()
    img = env.render(mode='rgb')

    # Deterministic by default except for atari games
    stochastic = args.stochastic or is_atari and not args.deterministic
    deterministic = not stochastic

    state = None
    episode_reward = 0.0
    episode_rewards, episode_lengths = [], []
    ep_len = 0
    successes = []

    for _ in range(args.n_timesteps):
        images.append(img)
        action, state = model.predict(obs,
                                      state=state,
                                      deterministic=deterministic)
        obs, reward, done, infos = env.step(action)
        img = model.env.render(mode='rgb')
        if not args.no_render:
            env.render("human")

        episode_reward += reward[0]
        ep_len += 1

        if args.n_envs == 1:
            # For atari the return reward is not the atari score
            # so we have to get it from the infos dict
            if is_atari and infos is not None and args.verbose >= 1:
                episode_infos = infos[0].get('episode')
                if episode_infos is not None:
                    print("Atari Episode Score: {:.2f}".format(
                        episode_infos['r']))
                    print("Atari Episode Length", episode_infos['l'])

            if done and not is_atari and args.verbose > 0:
                # NOTE: for env using VecNormalize, the mean reward
                # is a normalized reward when `--norm_reward` flag is passed
                print("Episode Reward: {:.2f}".format(episode_reward))
                print("Episode Length", ep_len)
                state = None
                episode_rewards.append(episode_reward)
                episode_lengths.append(ep_len)
                episode_reward = 0.0
                ep_len = 0

            # Reset also when the goal is achieved when using HER
            if done:
                print("Success?", infos[0].get('is_success', False))
                # Alternatively, you can add a check to wait for the end of the episode
                if done:
                    obs = env.reset()
                    successes.append(infos[0].get('is_success', False))
                    episode_reward = 0.0
                    ep_len = 0

    # Create a GIF and save the rewards and successes to a CSV file:
    imageio.mimsave(
        'dVRL_10ep.gif',
        [np.array(img) for i, img in enumerate(images) if i % 2 == 0],
        fps=5)
    savetxt('ep_rewards.csv', episode_rewards, delimiter=',')
    savetxt('ep_success.csv', successes, delimiter=',')

    if args.verbose > 0 and len(successes) > 0:
        print(f"Success rate: {100 * np.mean(successes):.2f}%")

    if args.verbose > 0 and len(episode_rewards) > 0:
        print(
            f"Mean reward: {np.mean(episode_rewards):.2f} +/- {np.std(episode_rewards):.2f}"
        )

    if args.verbose > 0 and len(episode_lengths) > 0:
        print(
            f"Mean episode length: {np.mean(episode_lengths):.2f} +/- {np.std(episode_lengths):.2f}"
        )

    # Workaround for https://github.com/openai/gym/issues/893
    if not args.no_render:
        if args.n_envs == 1 and "Bullet" not in env_id and not is_atari and isinstance(
                env, VecEnv):
            # DummyVecEnv
            # Unwrap env
            while isinstance(env, VecEnvWrapper):
                env = env.venv
            if isinstance(env, DummyVecEnv):
                env.envs[0].env.close()
            else:
                env.close()
        else:
            # SubprocVecEnv
            env.close()
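get_latest_run_id, used when --exp-id 0 is passed above, comes from the zoo's utility module and is not shown in this excerpt. A plausible minimal sketch, assuming the "<env_id>_<id>" experiment-folder naming used in this script; the real helper may differ in details.

import glob
import os

def get_latest_run_id(log_path, env_id):
    # Return the highest numeric suffix among folders named "<env_id>_<id>"
    # inside log_path, or 0 if none exist (sketch).
    max_run_id = 0
    for path in glob.glob(os.path.join(log_path, "{}_[0-9]*".format(env_id))):
        run_id = path.split("_")[-1]
        if run_id.isdigit():
            max_run_id = max(max_run_id, int(run_id))
    return max_run_id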
Beispiel #11
0
def main():  # noqa: C901
    parser = argparse.ArgumentParser()
    parser.add_argument('--env',
                        help='environment ID',
                        type=str,
                        default='CartPole-v1')
    parser.add_argument('-f',
                        '--folder',
                        help='Log folder',
                        type=str,
                        default='rl-trained-agents')
    parser.add_argument('--algo',
                        help='RL Algorithm',
                        default='ppo',
                        type=str,
                        required=False,
                        choices=list(ALGOS.keys()))
    parser.add_argument('-n',
                        '--n-timesteps',
                        help='number of timesteps',
                        default=1000,
                        type=int)
    parser.add_argument(
        '--num-threads',
        help='Number of threads for PyTorch (-1 to use default)',
        default=-1,
        type=int)
    parser.add_argument('--n-envs',
                        help='number of environments',
                        default=1,
                        type=int)
    parser.add_argument(
        '--exp-id',
        help='Experiment ID (default: 0: latest, -1: no exp folder)',
        default=0,
        type=int)
    parser.add_argument('--verbose',
                        help='Verbose mode (0: no output, 1: INFO)',
                        default=1,
                        type=int)
    parser.add_argument(
        '--no-render',
        action='store_true',
        default=False,
        help='Do not render the environment (useful for tests)')
    parser.add_argument('--deterministic',
                        action='store_true',
                        default=False,
                        help='Use deterministic actions')
    parser.add_argument(
        '--load-best',
        action='store_true',
        default=False,
        help='Load best model instead of last model if available')
    parser.add_argument(
        '--load-checkpoint',
        type=int,
        help='Load checkpoint instead of last model if available, '
        'you must pass the number of timesteps corresponding to it')
    parser.add_argument('--stochastic',
                        action='store_true',
                        default=False,
                        help='Use stochastic actions (for DDPG/DQN/SAC)')
    parser.add_argument(
        '--norm-reward',
        action='store_true',
        default=False,
        help='Normalize reward if applicable (trained with VecNormalize)')
    parser.add_argument('--seed',
                        help='Random generator seed',
                        type=int,
                        default=0)
    parser.add_argument('--reward-log',
                        help='Where to log reward',
                        default='',
                        type=str)
    parser.add_argument(
        '--gym-packages',
        type=str,
        nargs='+',
        default=[],
        help=
        'Additional external Gym environment package modules to import (e.g. gym_minigrid)'
    )
    parser.add_argument(
        '--env-kwargs',
        type=str,
        nargs='+',
        action=StoreDict,
        help='Optional keyword argument to pass to the env constructor')
    args = parser.parse_args()

    # Going through custom gym packages to let them register in the global registry
    for env_module in args.gym_packages:
        importlib.import_module(env_module)

    env_id = args.env
    algo = args.algo
    folder = args.folder

    if args.exp_id == 0:
        args.exp_id = get_latest_run_id(os.path.join(folder, algo), env_id)
        print('Loading latest experiment, id={}'.format(args.exp_id))

    # Sanity checks
    if args.exp_id > 0:
        log_path = os.path.join(folder, algo,
                                '{}_{}'.format(env_id, args.exp_id))
    else:
        log_path = os.path.join(folder, algo)

    assert os.path.isdir(log_path), f"The {log_path} folder was not found"

    found = False
    for ext in ['zip']:
        model_path = os.path.join(log_path, f'{env_id}.{ext}')
        found = os.path.isfile(model_path)
        if found:
            break

    if args.load_best:
        model_path = os.path.join(log_path, "best_model.zip")
        found = os.path.isfile(model_path)

    if args.load_checkpoint is not None:
        model_path = os.path.join(
            log_path, f"rl_model_{args.load_checkpoint}_steps.zip")
        found = os.path.isfile(model_path)

    if not found:
        raise ValueError(
            f"No model found for {algo} on {env_id}, path: {model_path}")

    if algo in ['dqn', 'ddpg', 'sac', 'td3']:
        args.n_envs = 1

    set_random_seed(args.seed)

    if args.num_threads > 0:
        if args.verbose > 1:
            print(f"Setting torch.num_threads to {args.num_threads}")
        th.set_num_threads(args.num_threads)

    is_atari = 'NoFrameskip' in env_id

    stats_path = os.path.join(log_path, env_id)
    hyperparams, stats_path = get_saved_hyperparams(
        stats_path, norm_reward=args.norm_reward, test_mode=True)
    env_kwargs = {} if args.env_kwargs is None else args.env_kwargs

    log_dir = args.reward_log if args.reward_log != '' else None

    env = create_test_env(env_id,
                          n_envs=args.n_envs,
                          stats_path=stats_path,
                          seed=args.seed,
                          log_dir=log_dir,
                          should_render=not args.no_render,
                          hyperparams=hyperparams,
                          env_kwargs=env_kwargs)

    model = ALGOS[algo].load(model_path, env=env)

    obs = env.reset()

    # Force deterministic for DQN, DDPG, SAC and HER (that is a wrapper around)
    deterministic = args.deterministic or algo in [
        'dqn', 'ddpg', 'sac', 'her', 'td3'
    ] and not args.stochastic

    state = None
    episode_reward = 0.0
    episode_rewards, episode_lengths = [], []
    ep_len = 0
    # For HER, monitor success rate
    successes = []
    for _ in range(args.n_timesteps):
        action, state = model.predict(obs,
                                      state=state,
                                      deterministic=deterministic)
        # Random Agent
        # action = [env.action_space.sample()]
        # Clip Action to avoid out of bound errors
        if isinstance(env.action_space, gym.spaces.Box):
            action = np.clip(action, env.action_space.low,
                             env.action_space.high)
        obs, reward, done, infos = env.step(action)
        if not args.no_render:
            env.render('human')

        episode_reward += reward[0]
        ep_len += 1

        if args.n_envs == 1:
            # For atari the return reward is not the atari score
            # so we have to get it from the infos dict
            if is_atari and infos is not None and args.verbose >= 1:
                episode_infos = infos[0].get('episode')
                if episode_infos is not None:
                    print(f"Atari Episode Score: {episode_infos['r']:.2f}")
                    print("Atari Episode Length", episode_infos['l'])

            if done and not is_atari and args.verbose > 0:
                # NOTE: for env using VecNormalize, the mean reward
                # is a normalized reward when `--norm_reward` flag is passed
                print(f"Episode Reward: {episode_reward:.2f}")
                print("Episode Length", ep_len)
                episode_rewards.append(episode_reward)
                episode_lengths.append(ep_len)
                episode_reward = 0.0
                ep_len = 0
                state = None

            # Reset also when the goal is achieved when using HER
            if done and infos[0].get('is_success') is not None:
                if args.verbose > 1:
                    print("Success?", infos[0].get('is_success', False))
                # Alternatively, you can add a check to wait for the end of the episode
                if done:
                    obs = env.reset()
                if infos[0].get('is_success') is not None:
                    successes.append(infos[0].get('is_success', False))
                    episode_reward, ep_len = 0.0, 0

    if args.verbose > 0 and len(successes) > 0:
        print("Success rate: {:.2f}%".format(100 * np.mean(successes)))

    if args.verbose > 0 and len(episode_rewards) > 0:
        print("Mean reward: {:.2f} +/- {:.2f}".format(np.mean(episode_rewards),
                                                      np.std(episode_rewards)))

    if args.verbose > 0 and len(episode_lengths) > 0:
        print("Mean episode length: {:.2f} +/- {:.2f}".format(
            np.mean(episode_lengths), np.std(episode_lengths)))

    # Workaround for https://github.com/openai/gym/issues/893
    if not args.no_render:
        if (args.n_envs == 1 and 'Bullet' not in env_id and not is_atari
                and isinstance(env, VecEnv)):
            # DummyVecEnv
            # Unwrap env
            while isinstance(env, VecEnvWrapper):
                env = env.venv
            if isinstance(env, DummyVecEnv):
                env.envs[0].env.close()
            else:
                env.close()
        else:
            # SubprocVecEnv
            env.close()
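StoreDict, passed as the action for --env-kwargs above, is an argparse helper from the zoo utilities that converts "key:value" tokens into a dict. A minimal sketch follows; evaluating each value with eval is an assumption about the real implementation.

import argparse

class StoreDict(argparse.Action):
    # Turn command-line tokens such as "render:True n_targets:3" into
    # {"render": True, "n_targets": 3} and store the dict on the namespace.
    def __call__(self, parser, namespace, values, option_string=None):
        arg_dict = {}
        for token in values:
            key, _, raw_value = token.partition(":")
            arg_dict[key] = eval(raw_value)  # assumption: values are Python literals
        setattr(namespace, self.dest, arg_dict)

With nargs='+', a call such as --env-kwargs render:True n_targets:3 would then yield env_kwargs == {'render': True, 'n_targets': 3}.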
Beispiel #12
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env',
                        help='environment ID',
                        type=str,
                        default='CartPole-v1')
    parser.add_argument('-f',
                        '--folder',
                        help='Log folder',
                        type=str,
                        default='trained_agents')
    parser.add_argument('--algo',
                        help='RL Algorithm',
                        default='ppo2',
                        type=str,
                        required=False,
                        choices=list(ALGOS.keys()))
    parser.add_argument('-n',
                        '--n-timesteps',
                        help='number of timesteps',
                        default=1000,
                        type=int)
    parser.add_argument('--n-envs',
                        help='number of environments',
                        default=1,
                        type=int)
    parser.add_argument(
        '--exp-id',
        help='Experiment ID (default: -1, no exp folder, 0: latest)',
        default=-1,
        type=int)
    parser.add_argument('--verbose',
                        help='Verbose mode (0: no output, 1: INFO)',
                        default=1,
                        type=int)
    parser.add_argument(
        '--no-render',
        action='store_true',
        default=False,
        help='Do not render the environment (useful for tests)')
    parser.add_argument('--deterministic',
                        action='store_true',
                        default=False,
                        help='Use deterministic actions')
    parser.add_argument('--stochastic',
                        action='store_true',
                        default=False,
                        help='Use stochastic actions (for DDPG/DQN/SAC)')
    parser.add_argument(
        '--load-best',
        action='store_true',
        default=False,
        help='Load best model instead of last model if available')
    parser.add_argument(
        '--norm-reward',
        action='store_true',
        default=False,
        help='Normalize reward if applicable (trained with VecNormalize)')
    parser.add_argument('--seed',
                        help='Random generator seed',
                        type=int,
                        default=0)
    parser.add_argument('--reward-log',
                        help='Where to log reward',
                        default='',
                        type=str)
    parser.add_argument(
        '--gym-packages',
        type=str,
        nargs='+',
        default=[],
        help=
        'Additional external Gym environment package modules to import (e.g. gym_minigrid)'
    )
    parser.add_argument(
        '--env-kwargs',
        type=str,
        nargs='+',
        action=StoreDict,
        help='Optional keyword argument to pass to the env constructor')
    parser.add_argument('--render-pybullet',
                        help='Slow down Pybullet simulation to render',
                        default=False)  # added by Pierre
    parser.add_argument('--random-pol', help='Random policy',
                        default=False)  # added by Pierre
    parser.add_argument(
        '--log-dir-random',
        help='Log directory of the random policy')  # added by Pierre
    args = parser.parse_args()

    # Going through custom gym packages to let them register in the global registry
    for env_module in args.gym_packages:
        importlib.import_module(env_module)

    env_id = args.env
    algo = args.algo
    folder = args.folder

    if args.exp_id == 0:
        args.exp_id = get_latest_run_id(os.path.join(folder, algo), env_id)
        print('Loading latest experiment, id={}'.format(args.exp_id))

    # Sanity checks
    if args.exp_id > 0:
        log_path = os.path.join(folder, algo,
                                '{}_{}'.format(env_id, args.exp_id))
    else:
        log_path = os.path.join(folder, algo)

    assert os.path.isdir(log_path), "The {} folder was not found".format(
        log_path)

    if not args.random_pol:  # added by Pierre
        model_path = find_saved_model(algo,
                                      log_path,
                                      env_id,
                                      load_best=args.load_best)

    if algo in ['dqn', 'ddpg', 'sac', 'td3']:
        args.n_envs = 1

    set_global_seeds(args.seed)

    is_atari = 'NoFrameskip' in env_id

    stats_path = os.path.join(log_path, env_id)
    hyperparams, stats_path = get_saved_hyperparams(
        stats_path, norm_reward=args.norm_reward, test_mode=True)

    log_dir = args.reward_log if args.reward_log != '' else None

    env_kwargs = {} if args.env_kwargs is None else args.env_kwargs

    env = create_test_env(env_id,
                          n_envs=args.n_envs,
                          is_atari=is_atari,
                          stats_path=stats_path,
                          seed=args.seed,
                          log_dir=log_dir,
                          should_render=not args.no_render,
                          hyperparams=hyperparams,
                          env_kwargs=env_kwargs)

    # ACER raises errors because the environment passed must have
    # the same number of environments as the model was trained on.
    load_env = None if algo == 'acer' else env

    if not args.random_pol:  # added by Pierre
        model = ALGOS[algo].load(model_path, env=load_env)

    obs = env.reset()

    # Force deterministic for DQN, DDPG, SAC and HER (that is a wrapper around)
    deterministic = args.deterministic or algo in [
        'dqn', 'ddpg', 'sac', 'her', 'td3'
    ] and not args.stochastic

    # INITIALISE METRICS
    episode_reward = 0.0
    episode_rewards, episode_lengths = [], []
    ep_len = 0
    success_threshold_50 = 0.05
    success_list_50, reachtime_list_50, episode_success_list_50 = [], [], []
    success_threshold_20 = 0.02
    success_list_20, reachtime_list_20, episode_success_list_20 = [], [], []
    success_threshold_10 = 0.01
    success_list_10, reachtime_list_10, episode_success_list_10 = [], [], []
    success_threshold_5 = 0.005
    success_list_5, reachtime_list_5, episode_success_list_5 = [], [], []

    # For HER, monitor success rate
    successes = []
    state = None

    for _ in range(args.n_timesteps):

        # Added by Pierre
        if args.random_pol:
            action = [env.action_space.sample()]  # Random Agent
        else:
            action, state = model.predict(obs,
                                          state=state,
                                          deterministic=deterministic)

        # Clip Action to avoid out of bound errors
        if isinstance(env.action_space, gym.spaces.Box):
            action = np.clip(action, env.action_space.low,
                             env.action_space.high)
        obs, reward, done, infos = env.step(action)

        # if args.render_pybullet:
        #     time.sleep(1./30.)     # added by Pierre (slow down Pybullet for rendering)

        # added by Pierre
        if infos[0]['dist_ft_t'] <= success_threshold_50:
            episode_success_list_50.append(1)
        else:
            episode_success_list_50.append(0)

        if infos[0]['dist_ft_t'] <= success_threshold_20:
            episode_success_list_20.append(1)
        else:
            episode_success_list_20.append(0)

        if infos[0]['dist_ft_t'] <= success_threshold_10:
            episode_success_list_10.append(1)
        else:
            episode_success_list_10.append(0)

        if infos[0]['dist_ft_t'] <= success_threshold_5:
            episode_success_list_5.append(1)
        else:
            episode_success_list_5.append(0)

        if not args.no_render:
            env.render('human')
            # env.render(mode="human")

        episode_reward += reward[0]
        ep_len += 1

        if args.n_envs == 1:
            # For atari the return reward is not the atari score
            # so we have to get it from the infos dict
            if is_atari and infos is not None and args.verbose >= 1:
                episode_infos = infos[0].get('episode')
                if episode_infos is not None:
                    print("Atari Episode Score: {:.2f}".format(
                        episode_infos['r']))
                    print("Atari Episode Length", episode_infos['l'])

            if done and not is_atari and args.verbose > 0:
                # NOTE: for env using VecNormalize, the mean reward
                # is a normalized reward when `--norm_reward` flag is passed
                print("Episode Reward: {:.2f}".format(episode_reward))
                print("Episode Length", ep_len)
                episode_rewards.append(episode_reward)
                episode_lengths.append(ep_len)

                # Pierre: append the last element of the episode success list when episode is done
                success_list_50.append(episode_success_list_50[-1])
                success_list_20.append(episode_success_list_20[-1])
                success_list_10.append(episode_success_list_10[-1])
                success_list_5.append(episode_success_list_5[-1])

                # If the episode is successful and it starts from an unsuccessful
                # step, record the reach time (index of the first successful step)
                if episode_success_list_50[-1] and not episode_success_list_50[0]:
                    reachtime_list_50.append(episode_success_list_50.index(1))

                if episode_success_list_20[-1] and not episode_success_list_20[0]:
                    reachtime_list_20.append(episode_success_list_20.index(1))

                if episode_success_list_10[-1] and not episode_success_list_10[0]:
                    reachtime_list_10.append(episode_success_list_10.index(1))

                if episode_success_list_5[-1] and not episode_success_list_5[0]:
                    reachtime_list_5.append(episode_success_list_5.index(1))

                # RESET FOR NEW EPISODE
                state = None
                episode_reward = 0.0
                ep_len = 0
                episode_success_list_50 = []
                episode_success_list_20 = []
                episode_success_list_10 = []
                episode_success_list_5 = []

            # Reset also when the goal is achieved when using HER
            if done or infos[0].get('is_success', False):
                if args.algo == 'her' and args.verbose > 1:
                    print("Success?", infos[0].get('is_success', False))
                # Alternatively, you can add a check to wait for the end of the episode
                # if done:
                obs = env.reset()
                if args.algo == 'her':
                    successes.append(infos[0].get('is_success', False))
                    episode_reward, ep_len = 0.0, 0

    if args.verbose > 0 and len(successes) > 0:
        print("Success rate: {:.2f}%".format(100 * np.mean(successes)))

    if args.verbose > 0 and len(episode_rewards) > 0:
        print("Mean reward: {:.2f} +/- {:.2f}".format(np.mean(episode_rewards),
                                                      np.std(episode_rewards)))
        print(
            "success threshold: {} | success ratio: {:.2f} | Average reach time: {:.2f}"
            .format(success_threshold_50, np.mean(success_list_50),
                    np.mean(reachtime_list_50)))
        print(
            "success threshold: {} | success ratio: {:.2f} | Average reach time: {:.2f}"
            .format(success_threshold_20, np.mean(success_list_20),
                    np.mean(reachtime_list_20)))
        print(
            "success threshold: {} | success ratio: {:.2f} | Average reach time: {:.2f}"
            .format(success_threshold_10, np.mean(success_list_10),
                    np.mean(reachtime_list_10)))
        print(
            "success threshold: {} | success ratio: {:.2f} | Average reach time: {:.2f}"
            .format(success_threshold_5, np.mean(success_list_5),
                    np.mean(reachtime_list_5)))

    if args.verbose > 0 and len(episode_lengths) > 0:
        print("Mean episode length: {:.2f} +/- {:.2f}".format(
            np.mean(episode_lengths), np.std(episode_lengths)))

    # added by Pierre
    print("path:", log_path)
    d = {
        "Eval mean reward": np.mean(episode_rewards),
        "Eval std": np.std(episode_rewards),
        "success ratio 50mm": np.mean(success_list_50),
        "Average reach time 50mm": np.mean(reachtime_list_50),
        "success ratio 20mm": np.mean(success_list_20),
        "Average reach time 20mm": np.mean(reachtime_list_20),
        "success ratio 10mm": np.mean(success_list_10),
        "Average reach time 10mm": np.mean(reachtime_list_10),
        "success ratio 5mm": np.mean(success_list_5),
        "Average reach time 5mm": np.mean(reachtime_list_5),
    }
    df = pd.DataFrame(d, index=[0])

    if args.random_pol:
        log_rand = args.log_dir_random
        df.to_csv(log_rand + "/stats.csv", index=False)
    else:
        df.to_csv(log_path + "/stats.csv", index=False)

    # Workaround for https://github.com/openai/gym/issues/893
    if not args.no_render:
        if args.n_envs == 1 and 'Bullet' not in env_id and not is_atari and isinstance(
                env, VecEnv):
            # DummyVecEnv
            # Unwrap env
            while isinstance(env, VecNormalize) or isinstance(
                    env, VecFrameStack):
                env = env.venv
            env.envs[0].env.close()
        else:
            # SubprocVecEnv
            env.close()
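find_saved_model is imported from the older zoo utilities and is not defined in this excerpt. A minimal sketch that mirrors the inline model lookup performed by the newer examples above (Beispiel #10/#11: check "<env_id>.zip" and optionally "best_model.zip"); the exact behaviour of the real helper, including the ".pkl" fallback, is an assumption.

import os

def find_saved_model(algo, log_path, env_id, load_best=False):
    # Look for a trained model file in log_path (sketch).
    model_path = None
    for ext in ["pkl", "zip"]:
        candidate = os.path.join(log_path, "{}.{}".format(env_id, ext))
        if os.path.isfile(candidate):
            model_path = candidate
    if load_best and os.path.isfile(os.path.join(log_path, "best_model.zip")):
        model_path = os.path.join(log_path, "best_model.zip")
    if model_path is None:
        raise ValueError("No model found for {} on {}, path: {}".format(algo, env_id, log_path))
    return model_path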
def rollout_halfspaces(env_id='CartPole-v1', algo='dqn', num_samples=20, precision=0.0001, render=False):
    seed = 0
    folder = 'rl-baselines-zoo/trained_agents'
    n_envs = 1
    no_render = False
    deterministic = True
    stochastic = False
    norm_reward = False

    
    log_path = os.path.join(folder, algo)


    assert os.path.isdir(log_path), "The {} folder was not found".format(log_path)

    model_path = find_saved_model(algo, log_path, env_id, load_best=False)

    
    set_global_seeds(seed)

    is_atari = 'NoFrameskip' in env_id

    stats_path = os.path.join(log_path, env_id)
    hyperparams, stats_path = get_saved_hyperparams(stats_path, norm_reward=norm_reward, test_mode=True)

    log_dir = None

    env_kwargs = {}

    
    env = create_test_env(env_id, n_envs=n_envs, is_atari=is_atari,
                          stats_path=stats_path, seed=seed, log_dir=log_dir,
                          should_render=not no_render,
                          hyperparams=hyperparams, env_kwargs=env_kwargs)

    # ACER raises errors because the environment passed must have
    # the same number of environments as the model was trained on.
    load_env = None if algo == 'acer' else env
    model = ALGOS[algo].load(model_path, env=load_env)

    env = gym.make('CartPole-v1')
    obs = env.reset()

    # Force deterministic for DQN, DDPG, SAC and HER (that is a wrapper around)
    deterministic = deterministic or algo in ['dqn', 'ddpg', 'sac', 'her', 'td3'] and not stochastic

    episode_reward = 0.0
    episode_rewards, episode_lengths = [], []
    ep_len = 0
    # For HER, monitor success rate
    successes = []
    state = None

    embedder = indicator_feature
    halfspaces = {}
    
    for i in range(num_samples):
        print("+"*10)
        #sample random state to start in
        
        #TODO: maybe reset with random actions? How to make it realistic? Does it matter. Let's just try random for now to test weird edge cases.
        obs = env.reset(uniform=True) #sample more uniformly than typical
        start_state = obs.copy()
        print("start state", obs)

        #find out the "near optimal" action for this state to compare other actions to
        opt_action, _ = model.predict(obs, state=state, deterministic=deterministic)
        #take this action
        print("TEACHER ACTION", opt_action)
        obs = env.reset(start_state=start_state)
        print("init state", obs)
        if render:
            env.render()
        # input()
        ep_ret = 0
        fcounts = embedder(start_state)
        #do initial action
        obs, r, done, info = env.step(opt_action)  # take the teacher's (near-optimal) action

        fcounts += embedder(obs)  #TODO: discount??
        ep_ret += r
        #print(r, obs)
        if done:
            #sample again, since started with terminal state
            continue



        #run tester policy thereafter
        while True:

            #env.render()

            #TODO: sample within allowable range of angle and position
            action, state = model.predict(obs, state=state, deterministic=deterministic)
            # Random Agent
            # action = [env.action_space.sample()]
            # Clip Action to avoid out of bound errors
            if isinstance(env.action_space, gym.spaces.Box):
                action = np.clip(action, env.action_space.low, env.action_space.high)
            #a = env.action_space.sample()
            #print(obs, action)
            obs, r, done, info = env.step(action)  # step with the tester policy's action
            
            fcounts += embedder(obs)
            #print(obs)
            #print(done)
            ep_ret += r
            #print(r, obs)
            if done:
                print("final state", obs)
                print("return", ep_ret)
                print("fcounts", fcounts)
                opt_fcounts = fcounts
                break



        # input()
        #obs = env.reset_state(env.observation_space.sample())

        #rollout once for each action and compute feature counts
        
        
        fcount_vectors = []
        init_actions = []
        ##rollout code:
        for init_action in range(env.action_space.n):
            if init_action == opt_action:
                #don't need to roll this out since we already did
                continue
            print("ACTION", init_action)
            obs = env.reset(start_state=start_state)
            print("init state", obs)
            if render:
                env.render()
            # input()
            ep_ret = 0
            fcounts = embedder(start_state)
            #do initial action
            obs, r, done, info = env.step(init_action)  # take the candidate initial action

            fcounts += embedder(obs)  #TODO: discount??
            ep_ret += r
            #print(r, obs)
            if done:
                print("final state", obs)
                print("return", ep_ret)
                print("fcounts", fcounts)
                fcount_vectors.append(fcounts)
                init_actions.append(init_action)
                continue



            #run tester policy thereafter
            while True:

                #env.render()

                #TODO: sample within allowable range of angle and position
                action, state = model.predict(obs, state=state, deterministic=deterministic)
                # Random Agent
                # action = [env.action_space.sample()]
                # Clip Action to avoid out of bound errors
                if isinstance(env.action_space, gym.spaces.Box):
                    action = np.clip(action, env.action_space.low, env.action_space.high)
                #a = env.action_space.sample()
                #print(obs, action)
                obs, r, done, info = env.step(action)  # step with the tester policy's action
                
                fcounts += embedder(obs)
                #print(obs)
                #print(done)
                ep_ret += r
                #print(r, obs)
                if done:
                    print("final state", obs)
                    print("return", ep_ret)
                    print("fcounts", fcounts)
                    break

            normal_vector = opt_fcounts - fcounts
            print("action {} over {} => fcount diff = {}".format(opt_fcounts, init_action, normal_vector))
            if np.linalg.norm(normal_vector) > precision:
                halfspaces[tuple(start_state), init_action, opt_action] = normal_vector
        input()
        #TODO: put this inside one of the value alignment verification classes to get sa_fcount_diffs and hopefully reuse that code
        #then visualize test cases

        # input()
    # for _ in range(args.n_timesteps):
    #     action, state = model.predict(obs, state=state, deterministic=deterministic)
    #     # Random Agent
    #     # action = [env.action_space.sample()]
    #     # Clip Action to avoid out of bound errors
    #     if isinstance(env.action_space, gym.spaces.Box):
    #         action = np.clip(action, env.action_space.low, env.action_space.high)
    #     obs, reward, done, infos = env.step(action)
    #     if not args.no_render:
    #         env.render('human')

    #     episode_reward += reward
    #     ep_len += 1

    #     if args.n_envs == 1:
    #         # For atari the return reward is not the atari score
    #         # so we have to get it from the infos dict
    #         if is_atari and infos is not None and args.verbose >= 1:
    #             episode_infos = infos.get('episode')
    #             if episode_infos is not None:
    #                 print("Atari Episode Score: {:.2f}".format(episode_infos['r']))
    #                 print("Atari Episode Length", episode_infos['l'])

    #         if done and not is_atari and args.verbose > 0:
    #             # NOTE: for env using VecNormalize, the mean reward
    #             # is a normalized reward when `--norm_reward` flag is passed
    #             print("Episode Reward: {:.2f}".format(episode_reward))
    #             print("Episode Length", ep_len)
    #             state = None
    #             episode_rewards.append(episode_reward)
    #             episode_lengths.append(ep_len)
    #             episode_reward = 0.0
    #             ep_len = 0

    #         # Reset also when the goal is achieved when using HER
    #         if done or infos.get('is_success', False):
    #             if args.algo == 'her' and args.verbose > 1:
    #                 print("Success?", infos[0].get('is_success', False))
    #             # Alternatively, you can add a check to wait for the end of the episode
    #             # if done:
    #             obs = env.reset()
    #             if args.algo == 'her':
    #                 successes.append(infos[0].get('is_success', False))
    #                 episode_reward, ep_len = 0.0, 0

    # if args.verbose > 0 and len(successes) > 0:
    #     print("Success rate: {:.2f}%".format(100 * np.mean(successes)))

    # if args.verbose > 0 and len(episode_rewards) > 0:
    #     print("Mean reward: {:.2f} +/- {:.2f}".format(np.mean(episode_rewards), np.std(episode_rewards)))

    # if args.verbose > 0 and len(episode_lengths) > 0:
    #     print("Mean episode length: {:.2f} +/- {:.2f}".format(np.mean(episode_lengths), np.std(episode_lengths)))

    # Workaround for https://github.com/openai/gym/issues/893
    if not no_render:
        if n_envs == 1 and 'Bullet' not in env_id and not is_atari and isinstance(env, VecEnv):
            # DummyVecEnv
            # Unwrap env
            while isinstance(env, VecNormalize) or isinstance(env, VecFrameStack):
                env = env.venv
            env.envs[0].env.close()
        else:
            # SubprocVecEnv
            env.close()

    return halfspaces
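indicator_feature, used as the embedder in rollout_halfspaces, is not defined in this excerpt. As a purely illustrative placeholder, an indicator embedding for CartPole could discretize each observation dimension into bins and return a one-hot vector per dimension; the bin ranges and sizes below are assumptions and not the author's actual embedder.

import numpy as np

def indicator_feature(obs,
                      bins_per_dim=10,
                      low=np.array([-2.4, -3.0, -0.21, -3.0]),
                      high=np.array([2.4, 3.0, 0.21, 3.0])):
    # Illustrative only: map a 4-dimensional CartPole observation to a
    # (4 * bins_per_dim)-dimensional indicator vector with one active bin per dimension.
    obs = np.asarray(obs, dtype=np.float64).flatten()
    ratios = (np.clip(obs, low, high) - low) / (high - low)
    idx = np.minimum((ratios * bins_per_dim).astype(int), bins_per_dim - 1)
    feature = np.zeros(obs.shape[0] * bins_per_dim)
    feature[np.arange(obs.shape[0]) * bins_per_dim + idx] = 1.0
    return feature

Summing these per-step indicator vectors over an episode gives the feature counts accumulated in fcounts above.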
Beispiel #14
0
    f, _ = plot_results(results, average_group=False, shaded_std=False)
    f.savefig(os.path.join(save_path, 'results.png'),
              bbox_inches='tight',
              format='png')

if args.play > 0:
    test_path = os.path.join(save_path, 'test')
    env_hyperparams = {
        'normalize': normalize,
        'n_stack': n_stack,
        'normalize_kwargs': normalize_kwargs
    }

    env = create_test_env(env_id,
                          n_envs=1,
                          stats_path=params_path,
                          log_dir=test_path,
                          hyperparams=env_hyperparams,
                          env_params=env_params)
    env.reset()

    env = VecVideoRecorder(env,
                           test_path,
                           record_video_trigger=lambda x: x == 0,
                           video_length=args.play,
                           name_prefix="{}-{}-{}-final".format(
                               exp_name, args.algo, env_id))

    obs = env.reset()
    for _ in range(args.play + 1):
        # action = [env.action_space.sample()]
        action, _ = model.predict(obs, deterministic=True)
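The playback loop above is cut off by the excerpt right after model.predict. Presumably the remaining body steps the environment with the predicted action and, after the loop, closes the environment so the VecVideoRecorder writes the video to disk; a minimal sketch of that assumed continuation:

        obs, _, done, _ = env.step(action)  # assumed continuation; the VecEnv auto-resets on done
    env.close()  # closing the VecVideoRecorder also finalizes the recorded video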
Beispiel #15
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', help='environment ID', type=str, default='CartPole-v1')
    parser.add_argument('-f', '--folder', help='Log folder', type=str, default='trained_agents')
    parser.add_argument('--algo', help='RL Algorithm', default='ppo2',
                        type=str, required=False, choices=list(ALGOS.keys()))
    parser.add_argument('-n', '--n-timesteps', help='number of timesteps', default=1000,
                        type=int)
    parser.add_argument('--n-envs', help='number of environments', default=1,
                        type=int)
    parser.add_argument('--exp-id', help='Experiment ID (default: -1, no exp folder, 0: latest)', default=-1,
                        type=int)
    parser.add_argument('--verbose', help='Verbose mode (0: no output, 1: INFO)', default=1,
                        type=int)
    parser.add_argument('--no-render', action='store_true', default=False,
                        help='Do not render the environment (useful for tests)')
    parser.add_argument('--deterministic', action='store_true', default=False,
                        help='Use deterministic actions')
    parser.add_argument('--stochastic', action='store_true', default=False,
                        help='Use stochastic actions (for DDPG/DQN/SAC)')
    parser.add_argument('--load-best', action='store_true', default=False,
                        help='Load best model instead of last model if available')
    parser.add_argument('--norm-reward', action='store_true', default=False,
                        help='Normalize reward if applicable (trained with VecNormalize)')
    parser.add_argument('--seed', help='Random generator seed', type=int, default=0)
    parser.add_argument('--reward-log', help='Where to log reward', default='', type=str)
    parser.add_argument('--gym-packages', type=str, nargs='+', default=[], help='Additional external Gym environment package modules to import (e.g. gym_minigrid)')
    parser.add_argument('--render-pybullet', help='Slow down Pybullet simulation to render', default=False) # added by Pierre
    parser.add_argument('--random-pol', help='Random policy', default=False) # added by Pierre
    args = parser.parse_args()

    plot_bool = True
    plot_dim = 2
    log_bool = False

    if plot_bool:

        if plot_dim == 2:
            fig, (ax1, ax2) = plt.subplots(2, 1, sharey=True, figsize=(5, 10))
        elif plot_dim == 3:
            fig = plt.figure()
            ax = fig.add_subplot(projection='3d')  # fig.gca(projection=...) was removed in recent matplotlib

    if log_bool:
        output_df = pd.DataFrame()

    # Going through custom gym packages to let them register in the global registry
    for env_module in args.gym_packages:
        importlib.import_module(env_module)

    env_id = args.env
    algo = args.algo
    folder = args.folder

    if args.exp_id == 0:
        args.exp_id = get_latest_run_id(os.path.join(folder, algo), env_id)
        print('Loading latest experiment, id={}'.format(args.exp_id))

    # Sanity checks
    if args.exp_id > 0:
        log_path = os.path.join(folder, algo, '{}_{}'.format(env_id, args.exp_id))
    else:
        log_path = os.path.join(folder, algo)


    assert os.path.isdir(log_path), "The {} folder was not found".format(log_path)

    if not args.random_pol:  # added by Pierre
        model_path = find_saved_model(algo, log_path, env_id, load_best=args.load_best)

    if algo in ['dqn', 'ddpg', 'sac', 'td3']:
        args.n_envs = 1

    set_global_seeds(args.seed)

    is_atari = 'NoFrameskip' in env_id

    stats_path = os.path.join(log_path, env_id)
    hyperparams, stats_path = get_saved_hyperparams(stats_path, norm_reward=args.norm_reward, test_mode=True)

    log_dir = args.reward_log if args.reward_log != '' else None

    env = create_test_env(env_id, n_envs=args.n_envs, is_atari=is_atari,
                          stats_path=stats_path, seed=args.seed, log_dir=log_dir,
                          should_render=not args.no_render,
                          hyperparams=hyperparams)

    # ACER raises errors because the environment passed to load()
    # must have the same number of environments as the one used for training.
    load_env = None if algo == 'acer' else env
    if not args.random_pol:  # added by Pierre
        model = ALGOS[algo].load(model_path, env=load_env)

    # if not args.no_render:
        # env.render(mode="human")  # added by Pierre (to work with ReachingJaco-v1)
    
    obs = env.reset()

    # Force deterministic for DQN, DDPG, SAC and HER (which wraps an off-policy algorithm)
    deterministic = args.deterministic or (algo in ['dqn', 'ddpg', 'sac', 'her', 'td3'] and not args.stochastic)

    episode_reward = 0.0
    episode_rewards, episode_lengths = [], []
    ep_len = 0
    episode = 0

    # success_threshold_001 = 0.01
    # success_list_001, reachtime_list_001, episode_success_list_001 = [], [], []
    # success_threshold_0002 = 0.002
    # success_list_0002, reachtime_list_0002, episode_success_list_0002 = [], [], []
    # success_threshold_0001 = 0.001
    # success_list_0001, reachtime_list_0001, episode_success_list_0001 = [], [], []
    # success_threshold_00005 = 0.0005
    # success_list_00005, reachtime_list_00005, episode_success_list_00005 = [], [], []

    # changed for the paper
    success_threshold_50 = 0.05
    success_list_50, reachtime_list_50, episode_success_list_50 = [], [], []
    success_threshold_20 = 0.02
    success_list_20, reachtime_list_20, episode_success_list_20 = [], [], []
    success_threshold_10 = 0.01
    success_list_10, reachtime_list_10, episode_success_list_10 = [], [], []
    success_threshold_5 = 0.005
    success_list_5, reachtime_list_5, episode_success_list_5 = [], [], []


    # For HER, monitor success rate
    successes = []
    state = None
    
    for _ in range(args.n_timesteps):
        if args.random_pol:
            # Random Agent
            action = [env.action_space.sample()]
        else:
            action, state = model.predict(obs, state=state, deterministic=deterministic)
        
        # Clip Action to avoid out of bound errors
        if isinstance(env.action_space, gym.spaces.Box):
            action = np.clip(action, env.action_space.low, env.action_space.high)
        obs, reward, done, infos = env.step(action)

        if args.render_pybullet:
            time.sleep(1./30.)     # added by Pierre (slow down Pybullet for rendering)
        
        episode_success_list_50.append(int(infos[0]['total_distance'] <= success_threshold_50))
        episode_success_list_20.append(int(infos[0]['total_distance'] <= success_threshold_20))
        episode_success_list_10.append(int(infos[0]['total_distance'] <= success_threshold_10))
        episode_success_list_5.append(int(infos[0]['total_distance'] <= success_threshold_5))
        

        if plot_bool:
            goal = infos[0]['goal position']
            tip = infos[0]['tip position']

            if plot_dim == 2:
                ax1.cla()
                ax1.plot(goal[0], goal[2], marker='x', color='b', linestyle='', markersize=10, label="goal", mew=3)
                ax1.plot(tip[0], tip[2], marker='o', color='r', linestyle='', markersize=10, label="end effector")

                circ_1_50 = plt.Circle((goal[0], goal[2]), radius=success_threshold_50, edgecolor='g', facecolor='w', linestyle='--', label="50 mm")
                circ_1_20 = plt.Circle((goal[0], goal[2]), radius=success_threshold_20, edgecolor='b', facecolor='w', linestyle='--', label="20 mm")
                circ_1_10 = plt.Circle((goal[0], goal[2]), radius=success_threshold_10, edgecolor='m', facecolor='w', linestyle='--', label="10 mm")
                circ_1_5 = plt.Circle((goal[0], goal[2]), radius=success_threshold_5, edgecolor='r', facecolor='w', linestyle='--', label="5 mm")
                ax1.add_patch(circ_1_50)
                ax1.add_patch(circ_1_20)
                ax1.add_patch(circ_1_10)
                ax1.add_patch(circ_1_5)

                ax1.set_xlim([-0.25, 0.25])
                ax1.set_ylim([0, 0.5])
                ax1.set_xlabel("x (m)", fontsize=15)
                ax1.set_ylabel("z (m)", fontsize=15)

                ax2.cla()
                ax2.plot(goal[1], goal[2], marker='x', color='b', linestyle='', markersize=10, mew=3)
                ax2.plot(tip[1], tip[2], marker='o', color='r', linestyle='', markersize=10)

                circ_2_50 = plt.Circle((goal[1], goal[2]), radius=success_threshold_50, edgecolor='g', facecolor='w', linestyle='--')
                circ_2_20 = plt.Circle((goal[1], goal[2]), radius=success_threshold_20, edgecolor='b', facecolor='w', linestyle='--')
                circ_2_10 = plt.Circle((goal[1], goal[2]), radius=success_threshold_10, edgecolor='m', facecolor='w', linestyle='--')
                circ_2_5 = plt.Circle((goal[1], goal[2]), radius=success_threshold_5, edgecolor='r', facecolor='w', linestyle='--')
                ax2.add_patch(circ_2_50)
                ax2.add_patch(circ_2_20)
                ax2.add_patch(circ_2_10)
                ax2.add_patch(circ_2_5)

                ax2.set_xlim([-0.25, 0.25])
                ax2.set_ylim([0, 0.5])
                ax2.set_xlabel("y (m)", fontsize=15)
                ax2.set_ylabel("z (m)", fontsize=15)

                ax1.legend(loc='upper left', bbox_to_anchor=(0, 1.2), ncol=3, fancybox=True, shadow=True)

            elif plot_dim == 3:
                ax.cla()
                ax.plot([tip[0]], [tip[1]], zs=[tip[2]], marker='x', color='b')
                ax.plot([goal[0]], [goal[1]], zs=[goal[2]], marker='o', color='r', linestyle="None")
                ax.set_xlim([-0.2, 0.2])
                ax.set_ylim([-0.2, 0.2])
                ax.set_zlim([0, 0.5])
                ax.set_xlabel("x (m)", fontsize=15)
                ax.set_ylabel("y (m)", fontsize=15)
                ax.set_zlabel("z (m)", fontsize=15)

            fig.suptitle("timestep "+str(ep_len)+" | distance to target: "+str(round(infos[0]['total_distance']*1000, 1))+" mm")
            plt.pause(0.01)
            # plt.show()

        if log_bool:
            dict_log = infos[0]
            dict_log['action'] = action[0]
            dict_log['obs'] = obs[0]
            dict_log['reward'] = reward[0]
            dict_log['done'] = done[0]
            dict_log['timestep'] = ep_len
            dict_log['episode'] = episode
            output_df = pd.concat([output_df, pd.DataFrame([dict_log])], ignore_index=True)

        # if not args.no_render:
        #     env.render('human')

        episode_reward += reward[0]
        ep_len += 1

        if args.n_envs == 1:
            # For atari the return reward is not the atari score
            # so we have to get it from the infos dict
            if is_atari and infos is not None and args.verbose >= 1:
                episode_infos = infos[0].get('episode')
                if episode_infos is not None:
                    print("Atari Episode Score: {:.2f}".format(episode_infos['r']))
                    print("Atari Episode Length", episode_infos['l'])
            
            if done and not is_atari and args.verbose > 0:
                # NOTE: for env using VecNormalize, the mean reward
                # is a normalized reward when `--norm_reward` flag is passed
                print("Episode nb: {} | Episode Reward: {:.2f} | Episode Length: {}".format(episode, episode_reward, ep_len))
                # print("Episode Length", ep_len) # commented by Pierre
                state = None
                episode_rewards.append(episode_reward)
                episode_lengths.append(ep_len)

                # append the last element of the episode success list when episode is done
                success_list_50.append(episode_success_list_50[-1]) 
                success_list_20.append(episode_success_list_20[-1]) 
                success_list_10.append(episode_success_list_10[-1]) 
                success_list_5.append(episode_success_list_5[-1])  

                # If the episode ends successfully but did not start inside the
                # success threshold, record the reach time (first step inside the threshold)
                if episode_success_list_50[-1] and not episode_success_list_50[0]:
                    reachtime_list_50.append(episode_success_list_50.index(1))

                if episode_success_list_20[-1] and not episode_success_list_20[0]:
                    reachtime_list_20.append(episode_success_list_20.index(1))

                if episode_success_list_10[-1] and not episode_success_list_10[0]:
                    reachtime_list_10.append(episode_success_list_10.index(1))

                if episode_success_list_5[-1] and not episode_success_list_5[0]:
                    reachtime_list_5.append(episode_success_list_5.index(1))


                if log_bool:
                    # output_df.to_csv(log_path+"/res_episode_"+str(episode)+".csv", index=False)  # slow
                    output_df.to_pickle(log_path+"/res_episode_"+str(episode)+".pkl")

                # reset for new episode
                episode_reward = 0.0
                ep_len = 0
                episode_success_list_50 = []  
                episode_success_list_20 = []  
                episode_success_list_10 = []  
                episode_success_list_5 = []  
                episode += 1 

            # Reset also when the goal is achieved when using HER
            if done or infos[0].get('is_success', False):
                if args.algo == 'her' and args.verbose > 1:
                    print("Success?", infos[0].get('is_success', False))
                # Alternatively, you can add a check to wait for the end of the episode
                # if done:
                obs = env.reset()
                if args.algo == 'her':
                    successes.append(infos[0].get('is_success', False))
                    episode_reward, ep_len = 0.0, 0

    if args.verbose > 0 and len(successes) > 0:
        print("Success rate: {:.2f}%".format(100 * np.mean(successes)))

    if args.verbose > 0 and len(episode_rewards) > 0:
        print("Mean reward: {:.2f} +/- {:.2f}".format(np.mean(episode_rewards), np.std(episode_rewards)))
        print("success threshold: {} | success ratio: {:.2f} | Average reach time: {:.2f}".format(success_threshold_50, np.mean(success_list_50), np.mean(reachtime_list_50)))
        print("success threshold: {} | success ratio: {:.2f} | Average reach time: {:.2f}".format(success_threshold_20, np.mean(success_list_20), np.mean(reachtime_list_20)))
        print("success threshold: {} | success ratio: {:.2f} | Average reach time: {:.2f}".format(success_threshold_10, np.mean(success_list_10), np.mean(reachtime_list_10)))
        print("success threshold: {} | success ratio: {:.2f} | Average reach time: {:.2f}".format(success_threshold_5, np.mean(success_list_5), np.mean(reachtime_list_5)))

        # added by Pierre
        print("path:", log_path)
        d = {
            "Eval mean reward": np.mean(episode_rewards), 
            "Eval std": np.std(episode_rewards), 
            "success ratio 50mm": np.mean(success_list_50),
            "Average reach time 50mm": np.mean(reachtime_list_50),
            "success ratio 20mm": np.mean(success_list_20),
            "Average reach time 20mm": np.mean(reachtime_list_20),
            "success ratio 10mm": np.mean(success_list_10),
            "Average reach time 10mm": np.mean(reachtime_list_10),
            "success ratio 5mm": np.mean(success_list_5),
            "Average reach time 5mm": np.mean(reachtime_list_5),
            }
        df = pd.DataFrame(d, index=[0])

        if args.random_pol:
            df.to_csv("logs/random_policy_0.2M/"+env_id+"/stats.csv", index=False)  # make path naming more robust
        else:
            df.to_csv(log_path+"/stats.csv", index=False)


    if args.verbose > 0 and len(episode_lengths) > 0:
        print("Mean episode length: {:.2f} +/- {:.2f}".format(np.mean(episode_lengths), np.std(episode_lengths)))

    # Workaround for https://github.com/openai/gym/issues/893
    if not args.no_render:
        if args.n_envs == 1 and 'Bullet' not in env_id and not is_atari and isinstance(env, VecEnv):
            # DummyVecEnv
            # Unwrap env
            while isinstance(env, VecNormalize) or isinstance(env, VecFrameStack):
                env = env.venv
            env.envs[0].env.close()
        else:
            # SubprocVecEnv
            env.close()
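The per-threshold bookkeeping above repeats the same pattern four times. A minimal sketch of how the success ratio and reach time could be computed by one reusable helper is shown below; the function name compute_reach_stats is hypothetical, and it assumes the per-step success flags are stored as 0/1 integers exactly as in the loop above.

import numpy as np


def compute_reach_stats(episodes):
    """Hypothetical helper: `episodes` is a list of per-episode lists of 0/1 flags
    (one flag per step, 1 = within the success threshold).
    Returns (success_ratio, mean_reach_time)."""
    final_flags = [flags[-1] for flags in episodes if flags]
    # Reach time: first step inside the threshold, counted only for episodes
    # that start outside the threshold and end inside it
    reach_times = [flags.index(1) for flags in episodes
                   if flags and flags[-1] and not flags[0]]
    success_ratio = float(np.mean(final_flags)) if final_flags else float("nan")
    mean_reach_time = float(np.mean(reach_times)) if reach_times else float("nan")
    return success_ratio, mean_reach_time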
Beispiel #16
0
input_index = interpreter.get_input_details()[0]["index"]
output_index = interpreter.get_output_details()[0]["index"]

model_path = os.path.join(train_graph_dir, args.env + ".zip")
print("Loading model from", model_path)
log_dir = "/tmp"
hyperparams, stats_path = get_saved_hyperparams(model_path,
                                                norm_reward=False,
                                                test_mode=True)

print("Running ", args.algo, " on ", args.env)
set_global_seeds(0)
env = create_test_env(args.env,
                      n_envs=1,
                      is_atari=is_atari,
                      stats_path=stats_path,
                      seed=0,
                      log_dir=log_dir,
                      should_render=False,
                      hyperparams=hyperparams)

print("Evaluating converted model")
episode_rewards, lengths, norm_rewards = [], [], []
for i in range(100):
    obs, done = env.reset(), False
    episode_rew, norm_rew, length = 0, 0.0, 0
    while not done:
        interpreter.set_tensor(input_index, obs)
        interpreter.invoke()
        predictions = interpreter.get_tensor(output_index)
        action = predictions
        if isinstance(env.action_space, gym.spaces.Box):
            action = np.clip(action, env.action_space.low, env.action_space.high)
Beispiel #17
0
    log_path = os.path.join(folder, algo)

model_path = os.path.join(log_path, f"{env_id}.zip")
#model_path = os.path.join(log_path, "best_model.zip")

stats_path = os.path.join(log_path, env_id)
hyperparams, stats_path = get_saved_hyperparams(stats_path)


is_atari = "NoFrameskip" in env_id

env = create_test_env(
    env_id,
    n_envs=n_envs,
    stats_path=stats_path,
    seed=seed,
    log_dir=None,
    should_render=not args.no_render,
    hyperparams=hyperparams,
)

model = ALGOS[algo].load(model_path)

obs = env.reset()

# Note: apparently it renders by default
env = VecVideoRecorder(
    env,
    video_folder,
    record_video_trigger=lambda x: x == 0,
    video_length=video_length,
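The VecVideoRecorder call above is cut off. A complete recording setup, assuming the stable_baselines3 VecVideoRecorder class (the stable-baselines 2 wrapper has the same signature) and reusing the env, model, video_folder, algo and env_id variables from this example, typically looks like the sketch below; the length and name prefix are placeholders.

from stable_baselines3.common.vec_env import VecVideoRecorder

video_length = 1000  # placeholder
env = VecVideoRecorder(
    env,
    video_folder,                                 # e.g. "videos/"
    record_video_trigger=lambda step: step == 0,  # start recording immediately
    video_length=video_length,
    name_prefix=f"{algo}-{env_id}",
)

obs = env.reset()
for _ in range(video_length + 1):
    action, _ = model.predict(obs, deterministic=True)
    obs, _, _, _ = env.step(action)
# The video file is written when the recorder is closed
env.close()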
Beispiel #18
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env',
                        help='environment ID',
                        type=str,
                        default='CartPole-v1')
    parser.add_argument('-f',
                        '--folder',
                        help='Log folder',
                        type=str,
                        default='trained_agents')
    parser.add_argument('--algo',
                        help='RL Algorithm',
                        default='ppo2',
                        type=str,
                        required=False,
                        choices=list(ALGOS.keys()))
    # parser.add_argument('-n', '--n-timesteps', help='number of timesteps', default=1000,
    # type=int)
    parser.add_argument('-n',
                        '--n-episodes',
                        help='number of episodes to collect',
                        default=20,
                        type=int)
    parser.add_argument('--n-envs',
                        help='number of environments',
                        default=1,
                        type=int)
    parser.add_argument(
        '--exp-id',
        help='Experiment ID (default: -1, no exp folder, 0: latest)',
        default=-1,
        type=int)
    parser.add_argument('--verbose',
                        help='Verbose mode (0: no output, 1: INFO)',
                        default=1,
                        type=int)
    parser.add_argument(
        '--no-render',
        action='store_true',
        default=False,
        help='Do not render the environment (useful for tests)')
    # for deterministic (bool type)
    parser.add_argument('--deterministic',
                        dest='deterministic',
                        action='store_true')
    parser.add_argument('--no-deterministic',
                        dest='deterministic',
                        action='store_false')
    parser.set_defaults(deterministic=True)  # true by default
    # parser.add_argument('--deterministic', action='store_true', default=False,
    # help='Use deterministic actions')
    # parser.add_argument('--stochastic', action='store_true', default=False,
    # help='Use stochastic actions (for DDPG/DQN/SAC)')
    parser.add_argument(
        '--norm-reward',
        action='store_true',
        default=False,
        help='Normalize reward if applicable (trained with VecNormalize)')
    parser.add_argument('--seed',
                        help='Random generator seed',
                        type=int,
                        default=0)
    parser.add_argument('--reward-log',
                        help='Where to log reward',
                        default='',
                        type=str)
    parser.add_argument(
        '--gym-packages',
        type=str,
        nargs='+',
        default=[],
        help=
        'Additional external Gym environment package modules to import (e.g. gym_minigrid)'
    )
    args = parser.parse_args()

    # Going through custom gym packages to let them register in the global registry
    for env_module in args.gym_packages:
        importlib.import_module(env_module)

    env_id = args.env
    algo = args.algo
    folder = args.folder

    if args.exp_id == 0:
        args.exp_id = get_latest_run_id(os.path.join(folder, algo), env_id)
        print('Loading latest experiment, id={}'.format(args.exp_id))

    # Sanity checks
    if args.exp_id > 0:
        log_path = os.path.join(folder, algo,
                                '{}_{}'.format(env_id, args.exp_id))
    else:
        log_path = os.path.join(folder, algo)

    assert os.path.isdir(log_path), "The {} folder was not found".format(
        log_path)

    stats_path = os.path.join(log_path, env_id)
    hyperparams, stats_path = get_saved_hyperparams(
        stats_path, norm_reward=args.norm_reward, test_mode=True)
    log_dir = args.reward_log if args.reward_log != '' else None

    if algo in ['dqn', 'ddpg', 'sac', 'td3']:
        args.n_envs = 1
    set_global_seeds(args.seed)
    is_atari = 'NoFrameskip' in env_id

    env = create_test_env(env_id,
                          n_envs=args.n_envs,
                          is_atari=is_atari,
                          stats_path=stats_path,
                          seed=args.seed,
                          log_dir=log_dir,
                          should_render=not args.no_render,
                          hyperparams=hyperparams)

    model_path = find_saved_model(algo, log_path, env_id)
    model = ALGOS[algo].load(model_path, env=env)

    # Force deterministic for DQN, DDPG, SAC and HER (which wraps an off-policy algorithm)
    # deterministic = args.deterministic or algo in ['dqn', 'ddpg', 'sac', 'her', 'td3'] and not args.stochastic
    deterministic = args.deterministic

    save_dir = os.path.join("expert_trajs_by_info_deterministic_with_std",
                            algo)
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    runner(env,
           env_id,
           model,
           args.n_episodes,
           deterministic,
           save=True,
           save_dir=save_dir)
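The runner helper called above is not shown in this example. A minimal sketch of what such a trajectory-collection function might look like is given below; the body is an assumption, not the original implementation, and it only stores observations, actions and rewards per episode.

import os

import numpy as np


def runner(env, env_id, model, n_episodes, deterministic, save=False, save_dir=None):
    """Hypothetical sketch: roll out `n_episodes` episodes with a single vectorized
    env and optionally save each trajectory to an .npz file."""
    for ep in range(n_episodes):
        obs = env.reset()
        observations, actions, rewards = [], [], []
        done = False
        while not done:
            action, _ = model.predict(obs, deterministic=deterministic)
            next_obs, reward, dones, _ = env.step(action)
            observations.append(obs[0])
            actions.append(action[0])
            rewards.append(reward[0])
            obs = next_obs
            done = bool(dones[0])
        if save and save_dir is not None:
            np.savez(os.path.join(save_dir, f"{env_id}_episode_{ep}.npz"),
                     observations=np.array(observations),
                     actions=np.array(actions),
                     rewards=np.array(rewards))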
Beispiel #19
0
if algo in ['dqn', 'ddpg']:
    args.n_envs = 1

set_global_seeds(args.seed)

is_atari = 'NoFrameskip' in env_id

stats_path = "{}/{}/{}/".format(folder, algo, env_id)
if not os.path.isdir(stats_path):
    stats_path = None
using_vec_normalize = stats_path is not None
log_dir = args.reward_log if args.reward_log != '' else None

env = create_test_env(env_id, n_envs=args.n_envs, is_atari=is_atari,
                      stats_path=stats_path, norm_reward=args.norm_reward,
                      seed=args.seed, log_dir=log_dir, should_render=not args.no_render)

model = ALGOS[algo].load(model_path)

obs = env.reset()

# Force deterministic for DQN and DDPG
deterministic = args.deterministic or algo in ['dqn', 'ddpg']

running_reward = 0.0
ep_len = 0
for _ in range(args.n_timesteps):
    action, _ = model.predict(obs, deterministic=deterministic)
    # Random Agent
    # action = [env.action_space.sample()]
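This example is cut off here; based on the fuller evaluation loops elsewhere in this collection, the loop body typically continues roughly as sketched below (not the original code).

    # Clip continuous actions to avoid out-of-bound errors
    if isinstance(env.action_space, gym.spaces.Box):
        action = np.clip(action, env.action_space.low, env.action_space.high)
    obs, reward, done, _ = env.step(action)
    running_reward += reward[0]
    ep_len += 1
    if done[0] and args.verbose > 0:
        print("Episode Reward: {:.2f} | Episode Length: {}".format(running_reward, ep_len))
        running_reward, ep_len = 0.0, 0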
Beispiel #20
0
def main():  # noqa: C901
    parser = argparse.ArgumentParser()
    parser.add_argument("--env",
                        help="environment ID",
                        type=str,
                        default="Walker2DBulletEnv-v0")
    parser.add_argument("--algo",
                        help="RL Algorithm",
                        default="ppo",
                        type=str,
                        required=False,
                        choices=list(ALGOS.keys()))
    parser.add_argument("-n",
                        "--n-timesteps",
                        help="number of timesteps",
                        default=1000,
                        type=int)
    parser.add_argument(
        "--num-threads",
        help="Number of threads for PyTorch (-1 to use default)",
        default=-1,
        type=int)
    parser.add_argument("--n-envs",
                        help="number of environments",
                        default=1,
                        type=int)
    parser.add_argument(
        "--exp-id",
        help="Experiment ID (default: 0: latest, -1: no exp folder)",
        default=0,
        type=int)
    parser.add_argument("--verbose",
                        help="Verbose mode (0: no output, 1: INFO)",
                        default=1,
                        type=int)
    parser.add_argument(
        "--no-render",
        action="store_true",
        default=False,
        help="Do not render the environment (useful for tests)")
    parser.add_argument("--deterministic",
                        action="store_true",
                        default=True,
                        help="Use deterministic actions")
    parser.add_argument(
        "--load-best",
        action="store_true",
        default=False,
        help="Load best model instead of last model if available")

    parser.add_argument("--stochastic",
                        action="store_true",
                        default=False,
                        help="Use stochastic actions (for DDPG/DQN/SAC)")
    parser.add_argument(
        "--norm-reward",
        action="store_true",
        default=False,
        help="Normalize reward if applicable (trained with VecNormalize)")
    parser.add_argument("--seed",
                        help="Random generator seed",
                        type=int,
                        default=0)
    parser.add_argument("--reward-log",
                        help="Where to log reward",
                        default="",
                        type=str)
    parser.add_argument(
        "--gym-packages",
        type=str,
        nargs="+",
        default=[],
        help=
        "Additional external Gym environemnt package modules to import (e.g. gym_minigrid)",
    )
    parser.add_argument(
        "--env-kwargs",
        type=str,
        nargs="+",
        action=StoreDict,
        help="Optional keyword argument to pass to the env constructor")
    # === #
    parser.add_argument("--load-checkpoint",
                        type=str,
                        help="pass the path of zip file corresponding to it")
    parser.add_argument("-f",
                        "--folder",
                        help="Log folder",
                        type=str,
                        default="rl-trained-agents")
    parser.add_argument("--dataset", type=str, default="dataset/walker2d_v6")
    parser.add_argument("--body-id", type=int, default=0)
    args = parser.parse_args()

    dataset_name, env_id, train_files, train_params, train_names, test_files, test_params, test_names = load_dataset.load_dataset(
        args.dataset, seed=0, shuffle=False, train_proportion=1)

    # Going through custom gym packages to let them register in the global registry
    for env_module in args.gym_packages:
        importlib.import_module(env_module)

    # env_id = args.env
    algo = args.algo
    log_path = args.folder

    # if args.exp_id == 0:
    #     args.exp_id = get_latest_run_id(os.path.join(folder, algo), env_id)
    #     print(f"Loading latest experiment, id={args.exp_id}")

    # # Sanity checks
    # if args.exp_id > 0:
    #     log_path = os.path.join(folder, algo, f"{env_id}_{args.exp_id}")
    # else:
    #     log_path = os.path.join(folder, algo)

    # assert os.path.isdir(log_path), f"The {log_path} folder was not found"

    # found = False
    # for ext in ["zip"]:
    #     model_path = os.path.join(log_path, f"{env_id}.{ext}")
    #     found = os.path.isfile(model_path)
    #     if found:
    #         break

    # if args.load_best:
    #     model_path = os.path.join(log_path, "best_model.zip")
    #     found = os.path.isfile(model_path)

    # if args.load_checkpoint is not None:
    #     model_path = os.path.join(log_path, f"rl_model_{args.load_checkpoint}_steps.zip")
    #     found = os.path.isfile(model_path)

    # if not found:
    #     raise ValueError(f"No model found for {algo} on {env_id}, path: {model_path}")

    model_path = args.load_checkpoint

    if algo in ["dqn", "ddpg", "sac", "td3", "tqc"]:
        args.n_envs = 1

    set_random_seed(args.seed)

    if args.num_threads > 0:
        if args.verbose > 1:
            print(f"Setting torch.num_threads to {args.num_threads}")
        th.set_num_threads(args.num_threads)

    is_atari = "NoFrameskip" in env_id

    stats_path = os.path.join(log_path, env_id)
    hyperparams, stats_path = get_saved_hyperparams(
        stats_path, norm_reward=args.norm_reward, test_mode=True)

    # load env_kwargs if existing
    # env_kwargs = {}
    # args_path = os.path.join(log_path, env_id, "args.yml")
    # if os.path.isfile(args_path):
    #     with open(args_path, "r") as f:
    #         loaded_args = yaml.load(f, Loader=yaml.UnsafeLoader)  # pytype: disable=module-attr
    #         if loaded_args["env_kwargs"] is not None:
    #             env_kwargs = loaded_args["env_kwargs"]
    # # overwrite with command line arguments
    # if args.env_kwargs is not None:
    #     env_kwargs.update(args.env_kwargs)

    args.watch_eval = True

    env_kwargs = {
        "xml": train_files[args.body_id],
        "param": train_params[args.body_id],
        "render": args.watch_eval,
    }
    log_dir = args.reward_log if args.reward_log != "" else None

    env = create_test_env(
        env_id,
        n_envs=args.n_envs,
        stats_path=stats_path,
        seed=args.seed,
        log_dir=log_dir,
        should_render=not args.no_render,
        hyperparams=hyperparams,
        env_kwargs=env_kwargs,
    )

    kwargs = dict(seed=args.seed)
    if algo in ["dqn", "ddpg", "sac", "her", "td3", "tqc"]:
        # Dummy buffer size as we don't need memory to enjoy the trained agent
        kwargs.update(dict(buffer_size=1))

    model = ALGOS[algo].load(model_path, env=env, **kwargs)

    obs = env.reset()

    # Force deterministic for DQN, DDPG, SAC and HER (which wraps an off-policy algorithm)
    deterministic = args.deterministic or (algo in [
        "dqn", "ddpg", "sac", "her", "td3", "tqc"
    ] and not args.stochastic)

    state = None
    episode_reward = 0.0
    episode_rewards, episode_lengths = [], []
    ep_len = 0
    # For HER, monitor success rate
    successes = []
    for _ in range(args.n_timesteps):
        action, state = model.predict(obs,
                                      state=state,
                                      deterministic=deterministic)
        # Random Agent
        # action = [env.action_space.sample()]
        # Clip Action to avoid out of bound errors
        if isinstance(env.action_space, gym.spaces.Box):
            action = np.clip(action, env.action_space.low,
                             env.action_space.high)
        obs, reward, done, infos = env.step(action)
        sleep(0.01)
        if not args.no_render:
            env.render("human")

        episode_reward += reward[0]
        ep_len += 1

        if args.n_envs == 1:
            # For atari the return reward is not the atari score
            # so we have to get it from the infos dict
            if is_atari and infos is not None and args.verbose >= 1:
                episode_infos = infos[0].get("episode")
                if episode_infos is not None:
                    print(f"Atari Episode Score: {episode_infos['r']:.2f}")
                    print("Atari Episode Length", episode_infos["l"])

            if done and not is_atari and args.verbose > 0:
                # NOTE: for env using VecNormalize, the mean reward
                # is a normalized reward when `--norm_reward` flag is passed
                print(f"Episode Reward: {episode_reward:.2f}")
                print("Episode Length", ep_len)
                episode_rewards.append(episode_reward)
                episode_lengths.append(ep_len)
                episode_reward = 0.0
                ep_len = 0
                state = None

            # Reset also when the goal is achieved when using HER
            if done and infos[0].get("is_success") is not None:
                if args.verbose > 1:
                    print("Success?", infos[0].get("is_success", False))
                # Alternatively, you can add a check to wait for the end of the episode
                if done:
                    obs = env.reset()
                if infos[0].get("is_success") is not None:
                    successes.append(infos[0].get("is_success", False))
                    episode_reward, ep_len = 0.0, 0

    if args.verbose > 0 and len(successes) > 0:
        print("Success rate: {:.2f}%".format(100 * np.mean(successes)))

    if args.verbose > 0 and len(episode_rewards) > 0:
        print("Mean reward: {:.2f} +/- {:.2f}".format(np.mean(episode_rewards),
                                                      np.std(episode_rewards)))

    if args.verbose > 0 and len(episode_lengths) > 0:
        print("Mean episode length: {:.2f} +/- {:.2f}".format(
            np.mean(episode_lengths), np.std(episode_lengths)))

    # Workaround for https://github.com/openai/gym/issues/893
    if not args.no_render:
        if args.n_envs == 1 and "Bullet" not in env_id and not is_atari and isinstance(
                env, VecEnv):
            # DummyVecEnv
            # Unwrap env
            while isinstance(env, VecEnvWrapper):
                env = env.venv
            if isinstance(env, DummyVecEnv):
                env.envs[0].env.close()
            else:
                env.close()
        else:
            # SubprocVecEnv
            env.close()
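The shutdown workaround for https://github.com/openai/gym/issues/893 appears in several of these examples. A small reusable sketch, assuming the stable_baselines3 vectorized-env classes and a hypothetical helper name close_test_env, could look like this:

from stable_baselines3.common.vec_env import DummyVecEnv, VecEnv, VecEnvWrapper


def close_test_env(env, env_id, is_atari, n_envs, no_render):
    """Hypothetical helper wrapping the gym#893 shutdown workaround used above."""
    if no_render:
        env.close()
        return
    if n_envs == 1 and "Bullet" not in env_id and not is_atari and isinstance(env, VecEnv):
        # DummyVecEnv: unwrap VecNormalize/VecFrameStack/... and close the raw env
        while isinstance(env, VecEnvWrapper):
            env = env.venv
        if isinstance(env, DummyVecEnv):
            env.envs[0].env.close()
        else:
            env.close()
    else:
        # SubprocVecEnv: closing the VecEnv shuts down the worker processes
        env.close()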
Beispiel #21
0
def main():  # noqa: C901
    parser = argparse.ArgumentParser()
    parser.add_argument("-tb",
                        "--tensorboard-log",
                        help="Tensorboard log dir",
                        default="",
                        type=str)
    parser.add_argument("--env",
                        help="environment ID",
                        type=str,
                        default="CartPole-v1")
    parser.add_argument("-f",
                        "--folder",
                        help="Log folder",
                        type=str,
                        default="rl-trained-agents")
    parser.add_argument("--algo",
                        help="RL Algorithm",
                        default="ppo",
                        type=str,
                        required=False,
                        choices=list(ALGOS.keys()))
    parser.add_argument("-n",
                        "--n-timesteps",
                        help="number of timesteps",
                        default=1000,
                        type=int)
    parser.add_argument(
        "--num-threads",
        help="Number of threads for PyTorch (-1 to use default)",
        default=-1,
        type=int)
    parser.add_argument("--n-envs",
                        help="number of environments",
                        default=1,
                        type=int)
    # parser.add_argument("--exp-id", help="Experiment ID (default: 0: latest, -1: no exp folder)", default=0, type=int)
    parser.add_argument(
        "--exp-id",
        help="Experiment ID (default: 0: latest, -1: no exp folder)",
        default='0',
        type=str)
    parser.add_argument("--verbose",
                        help="Verbose mode (0: no output, 1: INFO)",
                        default=1,
                        type=int)
    parser.add_argument(
        "--no-render",
        action="store_true",
        default=False,
        help="Do not render the environment (useful for tests)")
    parser.add_argument(
        "--render-mode",
        default='step',
        help="Whether to render at each step or at the end of an episode")
    parser.add_argument("--deterministic",
                        action="store_true",
                        default=False,
                        help="Use deterministic actions")
    parser.add_argument(
        "--load-best",
        action="store_true",
        default=False,
        help="Load best model instead of last model if available")
    parser.add_argument(
        "--load-checkpoint",
        type=int,
        help="Load checkpoint instead of last model if available, "
        "you must pass the number of timesteps corresponding to it",
    )
    parser.add_argument("--stochastic",
                        action="store_true",
                        default=False,
                        help="Use stochastic actions")
    parser.add_argument(
        "--norm-reward",
        action="store_true",
        default=False,
        help="Normalize reward if applicable (trained with VecNormalize)")
    parser.add_argument("--seed",
                        help="Random generator seed",
                        type=int,
                        default=0)
    parser.add_argument("--info-freq",
                        help="Frequency on which info valuers are logged",
                        type=int,
                        default=10)
    parser.add_argument("--reward-log",
                        help="Where to log reward",
                        default="",
                        type=str)
    parser.add_argument(
        "--gym-packages",
        type=str,
        nargs="+",
        default=[],
        help=
        "Additional external Gym environemnt package modules to import (e.g. gym_minigrid)",
    )
    parser.add_argument(
        "--env-kwargs",
        type=str,
        nargs="+",
        action=StoreDict,
        help="Optional keyword argument to pass to the env constructor")
    args = parser.parse_args()

    # Going through custom gym packages to let them register in the global registry
    for env_module in args.gym_packages:
        importlib.import_module(env_module)

    env_id = args.env
    algo = args.algo
    folder = args.folder

    if args.exp_id == '0':
        args.exp_id = get_latest_run_id(os.path.join(folder, algo), env_id)
        print(f"Loading latest experiment, id={args.exp_id}")

    # Sanity checks
    if args.exp_id != '0' and args.exp_id != '-1':
        log_path = os.path.join(folder, algo, f"{env_id}_{args.exp_id}")
    else:
        log_path = os.path.join(folder, algo)

    assert os.path.isdir(log_path), f"The {log_path} folder was not found"

    found = False
    for ext in ["zip"]:
        model_path = os.path.join(log_path, f"{env_id}.{ext}")
        found = os.path.isfile(model_path)
        if found:
            break

    if args.load_best:
        model_path = os.path.join(log_path, "best_model.zip")
        found = os.path.isfile(model_path)

    if args.load_checkpoint is not None:
        model_path = os.path.join(
            log_path, f"rl_model_{args.load_checkpoint}_steps.zip")
        found = os.path.isfile(model_path)

    if not found:
        raise ValueError(
            f"No model found for {algo} on {env_id}, path: {model_path}")
    else:
        print(f"Loading model for {algo} on {env_id}, path: {model_path}")

    off_policy_algos = ["qrdqn", "dqn", "ddpg", "sac", "her", "td3", "tqc"]

    if algo in off_policy_algos:
        args.n_envs = 1

    set_random_seed(args.seed)

    if args.num_threads > 0:
        if args.verbose > 1:
            print(f"Setting torch.num_threads to {args.num_threads}")
        th.set_num_threads(args.num_threads)

    is_atari = ExperimentManager.is_atari(env_id)

    stats_path = os.path.join(log_path, env_id)
    hyperparams, stats_path = get_saved_hyperparams(
        stats_path, norm_reward=args.norm_reward, test_mode=True)

    # load env_kwargs if existing
    env_kwargs = {}
    args_path = os.path.join(log_path, env_id, "args.yml")
    if os.path.isfile(args_path):
        with open(args_path, "r") as f:
            loaded_args = yaml.load(f, Loader=yaml.UnsafeLoader)  # pytype: disable=module-attr
            if loaded_args["env_kwargs"] is not None:
                env_kwargs = loaded_args["env_kwargs"]
    # overwrite with command line arguments
    if args.env_kwargs is not None:
        env_kwargs.update(args.env_kwargs)

    log_dir = args.reward_log if args.reward_log != "" else None

    env = create_test_env(
        env_id,
        n_envs=args.n_envs,
        stats_path=stats_path,
        seed=args.seed,
        log_dir=log_dir,
        should_render=not args.no_render,
        hyperparams=hyperparams,
        env_kwargs=env_kwargs,
    )

    kwargs = dict(seed=args.seed)
    if algo in off_policy_algos:
        # Dummy buffer size as we don't need memory to enjoy the trained agent
        kwargs.update(dict(buffer_size=1))

    # Check if we are running python 3.8+
    # we need to patch saved model under python 3.6/3.7 to load them
    newer_python_version = sys.version_info.major == 3 and sys.version_info.minor >= 8

    custom_objects = {}
    if newer_python_version:
        custom_objects = {
            "learning_rate": 0.0,
            "lr_schedule": lambda _: 0.0,
            "clip_range": lambda _: 0.0,
        }

    model = ALGOS[algo].load(model_path,
                             env=env,
                             custom_objects=custom_objects,
                             **kwargs)

    # tb_path = ''
    # for i in range(0,100000,1):
    #     tb_path = os.path.join(args.tensorboard_log, env_id, algo.upper() + "_" + str(i))
    #     if not os.path.exists(tb_path):
    #         break
    # print("algo=",algo, "  logdir=", tb_path)
    # writer = SummaryWriter(log_dir=tb_path)

    obs = env.reset()

    # Deterministic by default except for atari games
    stochastic = args.stochastic or is_atari and not args.deterministic
    deterministic = not stochastic

    state = None
    episode_reward = 0.0
    episode_rewards, episode_lengths = [], []
    ep_len = 0
    ep_count = 0
    # For HER, monitor success rate
    successes = []
    sbcommon_utils.configure_logger(args.verbose,
                                    os.path.join(args.tensorboard_log, env_id),
                                    algo.upper(),
                                    reset_num_timesteps=True)
    xlsx_logpath = os.path.join(
        args.tensorboard_log,
        env_id) if logger.get_dir() is None else logger.get_dir()
    xlsx_logger = Xlsx_Logger(xlsx_logpath, env_id)
    with open(os.path.join(xlsx_logpath, 'args.yaml'), 'w') as file:
        yaml.dump(args, file)
    fig: plt.Figure = None
    info_freq = args.info_freq
    try:
        for step in range(args.n_timesteps):
            action, state = model.predict(obs,
                                          state=state,
                                          deterministic=deterministic)
            obs, reward, done, infos = env.step(action)
            episode_reward += reward[0]
            ep_len += 1

            if args.n_envs == 1:

                # log info variables to tensorboard
                if (step % info_freq == 0 or done) and type(infos[0]) is dict:
                    if not args.no_render:
                        if not done and args.render_mode == 'step':
                            fig = env.render("human")
                        elif done and args.render_mode == 'episode':
                            fig = env.envs[0].rendered_episode
                    xlsx_logger.set_step_ep(ep_count, step)
                    for key in infos[0]:
                        if key == 'episode' or key == 'terminal_observation' or key == 'render':
                            continue
                        val = infos[0].get(key)
                        logger.record("eval/" + key, val, exclude='stdout')
                        xlsx_logger.log(key, val)
                    if fig is not None:
                        log_fig = logger.Figure(fig, False)
                        logger.record("eval/figure", log_fig, exclude='stdout')
                        # writer.add_scalar("eval/"+key, val, step)
                    logger.dump(step=step)

                # For atari the return reward is not the atari score
                # so we have to get it from the infos dict
                if is_atari and infos is not None and args.verbose >= 1:
                    episode_infos = infos[0].get("episode")
                    if episode_infos is not None:
                        print(f"Atari Episode Score: {episode_infos['r']:.2f}")
                        print("Atari Episode Length", episode_infos["l"])

                if done and not is_atari and args.verbose > 0:
                    # NOTE: for env using VecNormalize, the mean reward
                    # is a normalized reward when `--norm_reward` flag is passed

                    print("Episode #{}, step#{}".format(ep_count, step))
                    print(f"  Episode Reward: {episode_reward:.2f}")
                    print("  Episode Length", ep_len)
                    episode_rewards.append(episode_reward)
                    logger.record("eval/ep_len", ep_len, exclude='stdout')
                    logger.record("eval/ep_reward",
                                  episode_reward,
                                  exclude='stdout')
                    xlsx_logger.log('ep_len', ep_len)
                    xlsx_logger.log('reward', episode_reward)
                    logger.dump(step=step)
                    episode_lengths.append(ep_len)
                    episode_reward = 0.0
                    ep_len = 0
                    ep_count += 1
                    state = None

                # Reset also when the goal is achieved when using HER
                if done and infos[0].get("is_success") is not None:
                    if args.verbose > 1:
                        print("Success?", infos[0].get("is_success", False))

                    if infos[0].get("is_success") is not None:
                        successes.append(infos[0].get("is_success", False))
                        episode_reward, ep_len = 0.0, 0
                        ep_count += 1

            # if (not args.no_render) and args.render_mode=='step':
            #     fig = env.render("human")
            # else:
            #     fig = None

    except KeyboardInterrupt:
        pass

    logger.dump(step=step)
    xlsx_logger.close()

    if args.verbose > 0 and len(successes) > 0:
        print(f"Success rate: {100 * np.mean(successes):.2f}%")

    if args.verbose > 0 and len(episode_rewards) > 0:
        print(f"{len(episode_rewards)} Episodes")
        print(
            f"Mean reward: {np.mean(episode_rewards):.2f} +/- {np.std(episode_rewards):.2f}"
        )

    if args.verbose > 0 and len(episode_lengths) > 0:
        print(
            f"Mean episode length: {np.mean(episode_lengths):.2f} +/- {np.std(episode_lengths):.2f}"
        )

    env.close()
    if args.num_threads > 0:
        th.set_num_threads(args.num_threads)
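The evaluation metrics above are streamed through the stable-baselines logger instead of only being printed. A minimal standalone sketch of that pattern, assuming the stable_baselines3.common.logger.configure API (paths and values are placeholders), looks like this:

from stable_baselines3.common.logger import configure

# Write records both to stdout and to a TensorBoard event file
eval_logger = configure("logs/eval_tb", ["stdout", "tensorboard"])

for step, (ep_len, ep_reward) in enumerate([(200, 1.5), (180, 2.1)]):
    eval_logger.record("eval/ep_len", ep_len)
    eval_logger.record("eval/ep_reward", ep_reward)
    eval_logger.dump(step=step)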

Beispiel #22
0
is_atari = 'NoFrameskip' in env_id

stats_path = os.path.join(log_path, env_id)
hyperparams, stats_path = get_saved_hyperparams(stats_path,
                                                norm_reward=norm_reward,
                                                test_mode=True)
env_kwargs = {} if env_kwargs is None else env_kwargs

log_dir = reward_log if reward_log != '' else None

env = create_test_env(env_id,
                      n_envs=n_envs,
                      stats_path=stats_path,
                      seed=seed,
                      log_dir=log_dir,
                      should_render=not no_render,
                      hyperparams=hyperparams,
                      env_kwargs=env_kwargs)

model = ALGOS[algo].load(model_path, env=env, device=device)

obs = env.reset()

rm = AtariRewardModel(env, device)
rm.load_state_dict(
    th.load(f"../reward-models/BreakoutNoFrameskip-v4-reward_model.pt"))
rm = rm.to(device)

random.seed(0)
np.random.seed(0)
Beispiel #23
0
        with open(args_path, "r") as f:
            # pytype: disable=module-attr
            loaded_args = yaml.load(f, Loader=yaml.UnsafeLoader)
            if loaded_args["env_kwargs"] is not None:
                env_kwargs = loaded_args["env_kwargs"]
    # overwrite with command line arguments
    if args.env_kwargs is not None:
        env_kwargs.update(args.env_kwargs)

    log_dir = args.reward_log if args.reward_log != "" else None

    env = create_test_env(
        env_id,
        n_envs=args.n_envs,
        stats_path=stats_path,
        seed=args.seed,
        log_dir=log_dir,
        should_render=args.render,
        hyperparams=hyperparams,
        env_kwargs=env_kwargs,
    )

    kwargs = dict(seed=args.seed)
    if algo in off_policy_algos:
        # Dummy buffer size as we don't need memory to enjoy the trained agent
        kwargs.update(dict(buffer_size=1))

    model = ALGOS[algo].load(model_path, env=env, **kwargs)

    obs = env.reset()

    # Force deterministic for DQN, DDPG, SAC and HER (which wraps an off-policy algorithm)