Example no. 1
 def _init():
     env = ObstacleTowerEnv('./ObstacleTower/obstacletower', worker_id=seed+cpu,
                            retro=True, config={'total-floors': 10}, greyscale=True, timeout_wait=600)
     env._flattener = ActionFlattener([2, 3, 2, 1])
     env._action_space = env._flattener.action_space
     env = Monitor(env, sub_dir)
     return env
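Overriding the flattener as above shrinks the action set; a minimal sketch of what it yields, assuming obstacle_tower_env's ActionFlattener (its action_lookup table is also referenced in a later example):

from obstacle_tower_env import ActionFlattener

flattener = ActionFlattener([2, 3, 2, 1])
print(flattener.action_space.n)    # 2 * 3 * 2 * 1 = 12 flattened discrete actions
print(flattener.action_lookup[0])  # each index maps back to a MultiDiscrete vector, e.g. [0, 0, 0, 0]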
Example no. 2
def instantiate_environment(path, train, evaluate, eval_seeds=[1001]):
    env = None
    if train:
        env = ObstacleTowerEnv(path,
                               worker_id=0,
                               retro=False,
                               realtime_mode=False,
                               greyscale=False,
                               config=train_env_reset_config)
    else:
        if evaluate:
            env = ObstacleTowerEnv(path,
                                   worker_id=0,
                                   retro=False,
                                   realtime_mode=False,
                                   greyscale=False,
                                   config=eval_env_reset_config)
            env = ObstacleTowerEvaluation(env, eval_seeds)
        else:  # play a single game
            env = ObstacleTowerEnv(path,
                                   worker_id=0,
                                   retro=False,
                                   realtime_mode=True,
                                   greyscale=False,
                                   config=eval_env_reset_config)

    return env
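A minimal usage sketch of the three branches above (the build path and the train_env_reset_config / eval_env_reset_config dicts are assumed to be defined elsewhere, as in the snippet); every branch uses worker_id=0, so only one environment can be open at a time on port 5005:

train_env = instantiate_environment('./ObstacleTower/obstacletower', train=True, evaluate=False)
train_env.close()
eval_env = instantiate_environment('./ObstacleTower/obstacletower', train=False, evaluate=True,
                                   eval_seeds=[1001, 1002])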
Example no. 3
    def __init__(
        self,
        environment_filename=None,
        docker_training=False,
        worker_id=0,
        retro=False,
        timeout_wait=3000,
        realtime_mode=False,
        num_actions=3,
        stack_size=4,
        mobilenet=False,
        gray_scale=False,
        floor=0,
        visual_theme=0
        ):
        '''
        Arguments:
          environment_filename: The file path to the Unity executable.  Does not require the extension.
          docker_training: Whether this is running within a docker environment and should use a virtual
            frame buffer (xvfb).
          worker_id: The index of the worker in the case where multiple environments are running.  Each
            environment reserves port (5005 + worker_id) for communication with the Unity executable.
          retro: Resize visual observation to 84x84 (int8) and flattens action space.
          timeout_wait: Time for python interface to wait for environment to connect.
          realtime_mode: Whether to render the environment window image and run environment at realtime.
        '''

        self._obstacle_tower_env = ObstacleTowerEnv(environment_filename,
                                                    docker_training,
                                                    worker_id,
                                                    retro,
                                                    timeout_wait,
                                                    realtime_mode)
        if floor != 0:
            self._obstacle_tower_env.floor(floor)
        self.start_floor = floor
        self.current_floor = floor

        self.mobilenet = mobilenet
        self.gray_scale = gray_scale
        self.retro = retro
        if mobilenet:
            self.state_size = [1280]
        elif gray_scale:
            self.state_size = [84, 84, 1]
        elif retro:
            self.state_size = [84, 84, 3]
        else:
            self.state_size = [168, 168, 3]

        self.stack_size = stack_size
        self.stack = [np.random.random(self.state_size).astype(np.float32) for _ in range(self.stack_size)]
        self.total_reward = 0
        self.current_reward = 0
        self.max_floor = 25
        self.visual_theme = visual_theme

        self.id = worker_id
Example no. 4
def make_env_all_params(rank, args):
    from time import sleep
    sleep_time = rank
    sleep_multiple = args.pause
    sleep(sleep_multiple * sleep_time)

    show_obs = rank == 0

    rank = args.port_offset + rank

    # handle port clashes
    if rank >= 35:
        rank += 1
    
    environment_path = args.environment_filename
    
    env = ObstacleTowerEnv(
        environment_path,
        worker_id=rank,
        timeout_wait=6000,
        retro=True,
        realtime_mode=False)
    if show_obs:
        env = RenderObservations(env)
    return env
Example no. 5
def create_env(env_filename,
               custom=True,
               large=False,
               custom_reward=True,
               skip_frames=0,
               docker=False,
               realtime=False,
               random_aug=0.,
               worker_id=0,
               device='cpu'):

    if custom:
        env = CustomObstacleTowerEnv(env_filename,
                                     mode='retro_large' if large else 'retro',
                                     custom_reward=custom_reward,
                                     docker_training=docker,
                                     realtime_mode=realtime,
                                     worker_id=worker_id,
                                     timeout_wait=60)
    else:
        env = ObstacleTowerEnv(env_filename,
                               docker_training=docker,
                               realtime_mode=realtime,
                               worker_id=worker_id,
                               timeout_wait=60)
    if skip_frames > 1:
        env = SkipFrames(env, skip=skip_frames)
    if random_aug > 0.:
        env = ToTorchTensorsWithAug(env, device=device, aug_prob=random_aug)
    else:
        env = ToTorchTensors(env, device=device)

    return env
Example no. 6
 def _thunk():
     env = ObstacleTowerEnv(env_directory,
                            worker_id=rank,
                            realtime_mode=True)
     env = Monitor(
         env,
         logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
     return env
Example no. 7
def make_obstacle_tower(worker_id=0):
    from obstacle_tower_env import ObstacleTowerEnv

    env = ObstacleTowerEnv(
        './obstacle-tower-env/ObstacleTower/ObstacleTower.x86_64',
        retro=True,
        realtime_mode=True,
        worker_id=worker_id)
    return env
Example no. 8
def obstacle_test_env_factory(index_worker=0,
                              rank=0,
                              frame_skip=0,
                              frame_stack=1,
                              realtime=False,
                              min_floor=0,
                              max_floor=50,
                              reduced_actions=True):
    """
    Create test Obstacle Tower Unity3D environment.
    Useful info_keywords 'floor', 'start', 'seed'.

    Parameters
    ----------
    frame_skip : int
        Return only every `frame_skip`-th observation.
    frame_stack : int
        Observations composed of last `frame_stack` frames stacked.
    min_floor : int
        Minimum floor the agent can be spawned in.
    max_floor : int
        Maximum floor the agent can be spawned in.
    reduced_actions : bool
        Whether or not to use the action wrapper to reduce the number of available actions.

    Returns
    -------
    env : gym.Env
        Train environment.
    """

    if 'DISPLAY' not in os.environ.keys():
        os.environ['DISPLAY'] = ':0'

    exe = os.path.join(os.path.dirname(obstacle_tower_env.__file__),
                       'ObstacleTower/obstacletower')

    env = ObstacleTowerEnv(environment_filename=exe,
                           retro=True,
                           worker_id=index_worker + rank +
                           np.random.randint(1, 10000),
                           greyscale=False,
                           docker_training=False,
                           realtime_mode=realtime)

    if reduced_actions:
        env = ReducedActionEnv(env)

    env = BasicObstacleEnvTest(env, max_floor=max_floor, min_floor=min_floor)

    if frame_skip > 0:
        env = FrameSkip(env, skip=frame_skip)

    if frame_stack > 1:
        env = FrameStack(env, k=frame_stack)

    return env
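A hedged usage sketch of the factory above, assuming the wrapper classes and the ObstacleTower build bundled with obstacle_tower_env are available as in the snippet:

env = obstacle_test_env_factory(frame_skip=2, frame_stack=4, realtime=False)
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())
env.close()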
Example no. 9
    def __init__(self,
                 environment_filename=None,
                 docker_training=False,
                 worker_id=0,
                 retro=False,
                 timeout_wait=30,
                 realtime_mode=False,
                 num_actions=3,
                 mobilenet=False,
                 gray_scale=False,
                 autoencoder=None,
                 floor=0):
        '''
        Arguments:
          environment_filename: The file path to the Unity executable.  Does not require the extension.
          docker_training: Whether this is running within a docker environment and should use a virtual
            frame buffer (xvfb).
          worker_id: The index of the worker in the case where multiple environments are running.  Each
            environment reserves port (5005 + worker_id) for communication with the Unity executable.
          retro: Resize visual observation to 84x84 (int8) and flattens action space.
          timeout_wait: Time for python interface to wait for environment to connect.
          realtime_mode: Whether to render the environment window image and run environment at realtime.
        '''

        self._obstacle_tower_env = ObstacleTowerEnv(environment_filename,
                                                    docker_training, worker_id,
                                                    retro, timeout_wait,
                                                    realtime_mode)
        if floor != 0:
            self._obstacle_tower_env.floor(floor)
        self._flattener = ActionFlattener([3, 3, 2, 3])
        self._action_space = self._flattener.action_space
        self.mobilenet = mobilenet
        self.gray_scale = gray_scale
        if mobilenet:
            self.image_module = WrappedKerasLayer(retro, self.mobilenet)
        self._done = False
        if autoencoder:
            print("Loading autoencoder from {}".format(autoencoder))
            self.autoencoder = build_autoencoder(autoencoder)
            print("Done.")
        else:
            self.autoencoder = None
Example no. 10
def make_env(env_id,
             rank,
             env_filename='./ObstacleTower/obstacletower',
             docker_training=False,
             realtime=False):
    env = ObstacleTowerEnv(env_filename,
                           docker_training=docker_training,
                           worker_id=rank,
                           realtime_mode=realtime)
    return env
Example no. 11
def create_otc_environment(environment_path=None):
    """Wraps an Obstacle Tower Gym environment with some basic preprocessing.

  Returns:
    An Obstacle Tower environment with some standard preprocessing.
  """
    assert environment_path is not None
    env = ObstacleTowerEnv(environment_path, 0, retro=True)
    env = OTCPreprocessing(env)
    return env
Example no. 12
def create_otc_environment(environment_path=None, docker_training=False):
  """Wraps an Obstacle Tower Gym environment with some basic preprocessing.

  Returns:
    An Obstacle Tower environment with some standard preprocessing.
  """
  assert environment_path is not None
  # config = {'agent-perspective': 0}  # optional reset config, unused here
  env = ObstacleTowerEnv(environment_path,
                         docker_training=docker_training,
                         retro=True,
                         realtime_mode=False)
  env = OTCPreprocessing(env)
  return env
Example no. 13
 def testing(self):
     from keepitpossible.common import action_table
     self.table_action = action_table.create_action_table()
     self.MODEL.load()
     done = False
     reward = 0.0
     env = ObstacleTowerEnv(environment_filename=self.SCHEDULE.ENV_PATH,
                            worker_id=self.SCHEDULE.N_WORKER + 1,
                            retro=False,
                            realtime_mode=True)
     obs = env.reset()
     previous_preprocessed_observation_image = obs[0]
     while not done:
         action = self.MODEL.choose_action(
             previous_preprocessed_observation_image)
         # Take the action, get the observation, floors cleared, and agent info
         for _ in self.table_action[int(action)]:
             observation, reward, done, info = env.step(_)
             print(
                 "Action_Chose: ",
                 action,
                 "Action: ",
                 _,
                 " Reward: ",
                 reward)
             if done:
                 break
         # Preprocess the data the model needs
         observation_image, keys, time_remaining = observation
         preprocessed_observation_image = observation_image
         previous_preprocessed_observation_image = preprocessed_observation_image
     env.close()
Example no. 14
    def __init__(self, result_queue, idx, save_dir, params):
        super(Worker, self).__init__()
        self.result_queue = result_queue
        self.worker_idx = idx
        self.save_dir = save_dir
        self.model_path = os.path.join(self.save_dir, 'model_a3c')

        self.env = ObstacleTowerEnv(params['env_path'],
                                    worker_id=self.worker_idx,
                                    retro=False,
                                    realtime_mode=False,
                                    greyscale=False,
                                    config=train_env_reset_config)

        self.action_size = params['action_size']
        self._action_lookup = params['action_lookup']
        self.input_shape = self.env.observation_space[0].shape  # (84, 84, 3)
        self._last_health = 99999.
        self._last_keys = 0

        self.global_model = params['global_model']
        # self.local_model = CNN(self.action_size, self.input_shape)
        self.local_model = CnnGru(self.action_size, self.input_shape)

        self.ac_ckpt = params['ckpt']
        self.ac_manager = params['ckpt_mgr']

        self.current_time = params['log_timestamp']
        train_log_dir = './logs/' + self.current_time + '/worker_' + str(
            self.worker_idx)
        self.worker_summary_writer = tf.summary.create_file_writer(
            train_log_dir)

        self.timesteps = params['timesteps']
        self.batch_size = params['batch_size']
        self.gamma = params['gamma']
        self.lr = params['lr']
        self.opt = params['optimizer']
        self.eps = np.finfo(np.float32).eps.item()
Example no. 15
 def __init__(self,
              envpath,
              wid,
              retro,
              realtime_mode,
              env_seed=0,
              env_floor=0):
     self.wid = wid
     self.env = ObstacleTowerEnv(environment_filename=envpath,
                                 worker_id=wid,
                                 retro=retro,
                                 realtime_mode=realtime_mode)
     self.kprun = GLOBAL_KPRUN
     self.tableAction = self.createActionTable()
     # Set the level
     self.env_seed = env_seed
     self.env_floor = env_floor
     self.step = 0
     self.summary = tf.Summary(value=[
         tf.Summary.Value(tag="Stage_reward " + str(self.wid),
                          simple_value=0)
     ])
     self.kprun.train_writer.add_summary(self.summary, 0)
Example no. 16
def create_otc_environment(environment_filename=None,
                           docker_training=False,
                           worker_id=0,
                           retro=True,
                           timeout_wait=30,
                           realtime_mode=False):
    env = ObstacleTowerEnv(environment_filename=environment_filename,
                           docker_training=docker_training,
                           worker_id=worker_id,
                           retro=retro,
                           timeout_wait=timeout_wait,
                           realtime_mode=realtime_mode)
    env = OtcPreprocessing(env)
    return env
Example no. 17
def parse_unity_environment(env_name):
    '''
    Generates a regym.environments.Task generated by creating a Unity Environment
    (mlagents-envs) and extracting data from the environment.

    :param env_name: Path to Unity Executable
    :returns: Task created from :param: env_name
    '''
    if 'obstacletower' not in env_name:
        raise ValueError('Only obstacletower environment currently supported')
    from obstacle_tower_env import ObstacleTowerEnv
    env = ObstacleTowerEnv(
        env_name, retro=True, realtime_mode=False
    )  # retro=True mode creates an observation space of an 84x84 (Box) image
    return parse_gym_environment(env)
Example no. 18
 def _thunk():
     env = ObstacleTowerEnv('../ObstacleTower/obstacletower',
                            retro=True,
                            worker_id=rank,
                            realtime_mode=show,
                            config={'total-floors': 20})
     env.seed(seed + rank % 8)
     env = bench.Monitor(env, None, allow_early_resets=True)
     env = OTWrapper(env)
     env = FrameStack(env, 4)
     return env
Example no. 19
def main():
    if len(sys.argv) != 2:
        sys.stderr.write('Usage: python record_tail.py <start_floor>\n')
        sys.exit(1)
    start_floor = int(sys.argv[1])
    viewer = EnvInteractor()
    env = ObstacleTowerEnv(os.environ['OBS_TOWER_PATH'],
                           worker_id=random.randrange(11, 20))
    while True:
        seed = select_seed(floor=start_floor)
        env.seed(seed)
        env.floor(start_floor)
        obs = env.reset()
        viewer.reset()
        record_episode(seed, env, viewer, obs, max_steps=MAX_STEPS)
Example no. 20
    def __init__(self,
                 env_path,
                 train,
                 evaluate,
                 policy_name='CnnPolicy',
                 save_dir='./model_files/',
                 eval_seeds=[],
                 reduced_action=False):
        self.save_dir = save_dir
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        self.model_path = os.path.join(self.save_dir, 'model_stable_ppo')
        self.log_dir = './logs/stable_ppo'
        self.policy_name = policy_name
        self.evaluate = evaluate
        print(env_path)
        if reduced_action:
            from obstacle_tower_env import ObstacleTowerEnv, ObstacleTowerEvaluation
            from models.common.constants import train_env_reset_config_industrial as train_env_reset_config
            from models.common.constants import eval_env_reset_config_industrial as eval_env_reset_config
        else:
            from models.stable_baselines.reduced_action_env import ObstacleTowerEnv, ObstacleTowerEvaluation
            from models.common.constants import train_env_reset_config, eval_env_reset_config

        if train:
            self.env = ObstacleTowerEnv(env_path,
                                        worker_id=0,
                                        retro=True,
                                        realtime_mode=False,
                                        config=train_env_reset_config)
        else:
            if evaluate:
                self.env = ObstacleTowerEnv(env_path,
                                            worker_id=0,
                                            retro=True,
                                            realtime_mode=False,
                                            config=eval_env_reset_config)
                self.env = ObstacleTowerEvaluation(self.env, eval_seeds)
            else:
                self.env = ObstacleTowerEnv(env_path,
                                            worker_id=0,
                                            retro=True,
                                            realtime_mode=True,
                                            config=eval_env_reset_config)
Example no. 21
    def __init__(self, params):
        # Gives common variables to all environments
        super().__init__()

        try:
            from obstacle_tower_env import ObstacleTowerEnv
        except ImportError:
            print(
                "Failed to import ObstacleTowerEnv, make sure you have Obstacle Tower installed!"
            )
            raise

        # Handle Parameters
        env_name = params['env_name']

        # Create GYM instance
        self.env = ObstacleTowerEnv('./ObstacleTower/obstacletower', retro=False)
        self.action_space = self.env.action_space
        self.observation_space = self.env.observation_space

        # Store for later
        self.disc = gym.spaces.discrete.Discrete

        # Define header
        #TODO: Check all open ai gym envs to see if action space works the same
        #       Work out num_classes based on action_space type
        if type(self.observation_space) == self.disc:
            self.out = [self.observation_space.n]
        else:
            self.out = list(self.observation_space.shape)

        self.header = header(env_name=env_name,
                             input_dim=self.action_space.n,
                             output_dim=self.out,
                             num_classes=2,
                             info="",
                             env_min_score=0.0,
                             env_max_score=200.0,
                             rl=True)
Example no. 22
def seed_hashes():
    mapping = {}
    while len(mapping) < 100:
        if os.path.exists('UnitySDK.log'):
            os.remove('UnitySDK.log')
        while True:
            try:
                env = ObstacleTowerEnv(os.environ['OBS_TOWER_PATH'],
                                       worker_id=random.randrange(1000))
                break
            except KeyboardInterrupt:
                sys.exit(1)
            except:
                pass
        env.seed(25)  # random argument
        obs = env.reset()
        env.close()
        with open('UnitySDK.log') as f:
            contents = next(l for l in f.readlines() if 'seed:' in l)
        seed = int(contents.split(': ')[-1])
        key = str(obs.flatten().tolist())
        mapping[key] = seed  # record the pair so the outer loop can terminate
        yield key, seed
    return mapping
Example no. 23
    def __init__(self,
                 env_path,
                 train,
                 evaluate,
                 policy_name='CnnPolicy',
                 save_dir='./model_files/',
                 eval_seeds=[]):
        self.save_dir = save_dir
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        self.model_path = os.path.join(self.save_dir, 'model_stable_a2c')
        self.log_dir = './logs/stable_a2c'
        self.policy_name = policy_name
        self.evaluate = evaluate

        if train:
            self.env = ObstacleTowerEnv(env_path,
                                        worker_id=0,
                                        retro=True,
                                        realtime_mode=False,
                                        config=train_env_reset_config)
        else:
            if evaluate:
                self.env = ObstacleTowerEnv(env_path,
                                            worker_id=0,
                                            retro=True,
                                            realtime_mode=False,
                                            config=eval_env_reset_config)
                self.env = ObstacleTowerEvaluation(self.env, eval_seeds)
            else:
                self.env = ObstacleTowerEnv(env_path,
                                            worker_id=0,
                                            retro=True,
                                            realtime_mode=True,
                                            config=eval_env_reset_config)
Example no. 24
    def __init__(self,
                 env_path,
                 train=False,
                 evaluate=False,
                 eval_seeds=[],
                 max_eps=100,
                 save_dir=None,
                 plot=False):
        if train:
            self.env = ObstacleTowerEnv(env_path,
                                        worker_id=0,
                                        retro=False,
                                        realtime_mode=False,
                                        config=train_env_reset_config)
        else:
            if evaluate:
                self.env = ObstacleTowerEnv(env_path,
                                            worker_id=0,
                                            retro=False,
                                            realtime_mode=False,
                                            config=eval_env_reset_config)
                self.env = ObstacleTowerEvaluation(self.env, eval_seeds)
            else:
                self.env = ObstacleTowerEnv(env_path,
                                            worker_id=0,
                                            retro=False,
                                            realtime_mode=True,
                                            config=eval_env_reset_config)
        self.max_episodes = max_eps
        self.global_moving_average_reward = 0
        self.save_dir = save_dir
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        self.plot = plot
        self.res_queue = Queue()
Example no. 25
import os

from obstacle_tower_env import ObstacleTowerEnv

env = ObstacleTowerEnv(os.environ['OBS_TOWER_PATH'], worker_id=0)

env.seed(72)
env.floor(12)
env.reset()
for action in [
        18, 18, 18, 18, 18, 18, 30, 24, 24, 21, 18, 18, 30, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 24, 18, 30, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 30, 30, 30, 30, 24, 24, 6, 6, 6, 6, 6,
        6, 6, 6, 30, 30, 30, 30, 30, 18, 24, 24, 24, 6, 6, 6, 6, 6, 6, 24, 18,
        24, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 18, 6, 6, 6, 6, 24, 24, 24, 18, 30, 18,
        18, 30, 18, 30, 30, 18, 18, 18, 18, 18, 18, 18, 18, 30, 24, 24, 30, 30,
        24, 24, 24, 30, 30, 30, 30, 30, 18, 18, 18, 18, 30, 30, 30, 30, 30, 30,
        30, 30, 30, 30, 30, 30, 30, 30, 24, 24, 24, 24, 24, 24, 24, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 24, 18, 18, 30, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 24, 18, 30, 18, 18, 18, 18, 30, 30, 30, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 30, 18, 18, 30, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 30, 24, 24, 24, 24, 24, 24, 24, 24, 18, 30, 18, 18, 18,
        18, 30, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 30,
        30, 30, 30, 30, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 30, 24, 21, 18, 24, 24, 24, 24, 18, 18, 18, 24, 18, 18, 18, 18,
        30, 18, 18, 24, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        24, 24, 24, 24, 24, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 30, 30, 30, 18, 18, 30, 30, 30, 30, 30, 30, 12, 12, 30, 30, 30,
        30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 18, 18, 18, 18, 18, 18, 18, 18,
Example no. 26
from obstacle_tower_env import ObstacleTowerEnv
import numpy as np
from matplotlib import pyplot as plt

# Realtime mode determines whether the environment window will render the scene,
# as well as whether the environment will run at realtime speed. Set this to `True`
# to visualize the agent behavior as you would in player mode.

env = ObstacleTowerEnv('./ObstacleTower/obstacletower', retro=False, realtime_mode=True)

# The environment provided has a MultiDiscrete action space, where the 4 dimensions are:

# 0. Movement (No-Op/Forward/Back)
# 1. Camera Rotation (No-Op/Counter-Clockwise/Clockwise)
# 2. Jump (No-Op/Jump)
# 3. Movement (No-Op/Right/Left)
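# For illustration (values chosen here, not taken from the original script): the vector
# [1, 0, 1, 0] means "move forward, no camera rotation, jump, no sideways movement",
# and is the kind of action env.step(...) expects once the environment has been reset.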

print(env.action_space.nvec)
print(env.observation_space)


#plt.imshow(obs[0])
#plt.show()
#print(env.unwrapped.get_action_meanings())


# tower 0, floor 10 = second room holds key
config = {'tower-seed': 0, 'starting-floor': 10, 'agent-perspective': 0, 'allowed-rooms': 1, 'allowed-modules': 0, 'allowed-floors': 0}
obs = env.reset(config=config)

action = env.action_space.sample()
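# A minimal continuation sketch (not part of the original snippet): step once with the
# sampled action and display the visual observation, mirroring the commented lines above.
obs, reward, done, info = env.step(action)
plt.imshow(obs[0])
plt.show()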
Example no. 27
def main():
    # Load parsed parameters
    parser = otc_arg_parser()
    args = parser.parse_args()

    #Challenge environment
    # if args.env == 'ObtRetro-v6':
    #     env = ObstacleTowerEnv(
    #         '/home/home/Data/Carmen/py_workspace/ObstacleTower_v3/ObstacleTower-v3.1/obstacletower.x86_64',
    #         timeout_wait=6000,
    #         retro=args.retro,
    #         realtime_mode=args.test)
    #     env = RetroWrapper(env, args.sample_normal)
    #     env = OTCPreprocessing(env, args.action_reduction)
    #     # if show_obs:
    #     #     env = RenderObservations(env)
    #     #     env = KeyboardControlWrapper(env)
    # else:
    env = ObstacleTowerEnv(
        '/home/home/Data/Carmen/py_workspace/ObstacleTower_v3/ObstacleTower-v3.1/obstacletower.x86_64',
        retro=args.retro,
        realtime_mode=args.test,
        timeout_wait=6000)

    #env = ObstacleTowerEnv('OBSTACLE_TOWER_PATH', retro=args.retro, realtime_mode=args.test, timeout_wait=6000)

    #Dict of actions created by the ObstacleTowerEnv Class of obstacle_tower_env library
    #print("ACTIONS:", env._flattener.action_lookup)

    print('FEATURES :', args.features)

    #Preprocess the environment (Grey Scales and action space reduction)
    env = OTCPreprocessing(env, args.action_reduction, args.features)
    env = DummyVecEnv([lambda: env])
    #env = VecEnv(1, env.observation_space, env.action_space)

    print("ACTION SPACE  ///////////:", env.action_space)
    print("OBSERVATION SPACE ///////////////:", env.observation_space)
    #env = make_vec_env(env, n_envs=4)

    ########Training########

    #Study of the impact of different values of the PPO params
    if args.study:
        params_test(MlpPolicy, env)

    #If no Study Mode
    else:
        #If no Test Mode
        if not args.test:

            random.seed(0)
            seed = 0

            if args.pretrained_model:

                t = 300000

                model = PPO2.load(args.pretrained_model,
                                  env=env,
                                  tensorboard_log=args.tensorboard_logdir)

            else:

                t = 0

                #If Generalized Advantage Estimator is used
                if args.use_gae:

                    model = PPO2(MlpPolicy,
                                 env,
                                 n_steps=args.num_steps,
                                 verbose=1,
                                 tensorboard_log=args.tensorboard_logdir,
                                 cliprange=args.clip_param,
                                 learning_rate=args.lr,
                                 ent_coef=args.entropy_coef,
                                 vf_coef=args.value_loss_coef,
                                 max_grad_norm=args.max_grad_norm,
                                 gamma=args.gamma,
                                 lam=args.gae_lambda,
                                 noptepochs=args.ppo_epoch,
                                 seed=seed)

                #If Generalized Advantage Estimator is not used
                else:

                    model = PPO2(MlpPolicy,
                                 env,
                                 n_steps=args.num_steps,
                                 verbose=1,
                                 tensorboard_log=args.tensorboard_logdir,
                                 cliprange=args.clip_param,
                                 learning_rate=args.lr,
                                 ent_coef=args.entropy_coef,
                                 vf_coef=args.value_loss_coef,
                                 max_grad_norm=args.max_grad_norm,
                                 gamma=args.gamma,
                                 noptepochs=args.ppo_epoch,
                                 seed=seed)
        else:

            model = PPO2.load(args.pretrained_model, env=env)

        #model.learn(total_timesteps=50000)
        #model.save("ObstacleTower_prueba")

        filename = 'argsparams.txt'
        os.makedirs(args.results_dir, exist_ok=True)
        myfile = open(args.results_dir + filename, 'a')
        myfile.write(
            'clip range: %f \n learning rate: %f \n entropy coefficient: %f \n value loss coefficient: %f \n '
            'max gradient norm: %f \n gamma: %f \n ppo epoch: %f \n' %
            (args.clip_param, args.lr, args.entropy_coef, args.value_loss_coef,
             args.max_grad_norm, args.gamma, args.ppo_epoch))
        myfile.close()

        if not args.test:
            while t < args.num_env_steps:
                #TRAIN MODEL
                if t == 0:
                    model.learn(total_timesteps=args.eval_interval)

                else:
                    model.learn(total_timesteps=args.eval_interval,
                                reset_num_timesteps=False)

                os.makedirs(GLOBAL_PATH, exist_ok=True)
                print("Saving in '" + GLOBAL_PATH + "'")
                model.save(GLOBAL_PATH + args.training_name + "_" +
                           str(int(t)).zfill(10))

                avg_reward, avg_floor = test(
                    t, model, env=env, global_path=args.results_dir)  # Test
                log('T = ' + str(t) + ' / ' + str(args.num_env_steps) +
                    ' | Avg. reward: ' + str(avg_reward) + ' | Avg. floor: ' +
                    str(avg_floor))

                t += args.eval_interval
        else:
            obs = env.reset()
            t = 0
            while t < args.num_env_steps:

                action, _states = model.predict(obs)
                obs, rewards, done, info = env.step(action)
                #print('action :', info)
                env.render('rgb_array')
Example no. 28
class Worker(threading.Thread):
    episode_count = 0
    mean_reward = 0
    best_score = 0
    global_steps = 0
    save_lock = threading.Lock()

    def __init__(self, result_queue, idx, save_dir, params):
        super(Worker, self).__init__()
        self.result_queue = result_queue
        self.worker_idx = idx
        self.save_dir = save_dir
        self.model_path = os.path.join(self.save_dir, 'model_a3c')

        self.env = ObstacleTowerEnv(params['env_path'],
                                    worker_id=self.worker_idx,
                                    retro=False,
                                    realtime_mode=False,
                                    greyscale=False,
                                    config=train_env_reset_config)

        self.action_size = params['action_size']
        self._action_lookup = params['action_lookup']
        self.input_shape = self.env.observation_space[0].shape  # (84, 84, 3)
        self._last_health = 99999.
        self._last_keys = 0

        self.global_model = params['global_model']
        # self.local_model = CNN(self.action_size, self.input_shape)
        self.local_model = CnnGru(self.action_size, self.input_shape)

        self.ac_ckpt = params['ckpt']
        self.ac_manager = params['ckpt_mgr']

        self.current_time = params['log_timestamp']
        train_log_dir = './logs/' + self.current_time + '/worker_' + str(
            self.worker_idx)
        self.worker_summary_writer = tf.summary.create_file_writer(
            train_log_dir)

        self.timesteps = params['timesteps']
        self.batch_size = params['batch_size']
        self.gamma = params['gamma']
        self.lr = params['lr']
        self.opt = params['optimizer']
        self.eps = np.finfo(np.float32).eps.item()

    def get_updated_reward(self, reward, new_health, new_keys, done):
        new_health = float(new_health)
        new_reward = 0.0
        if done:  # reset params when game is terminated
            self._last_health = 99999.
            self._last_keys = 0
        else:
            # opened a door, solved a puzzle, picked up a key
            if 0.1 <= reward < 1:
                new_reward += 0.5

            # crossing a floor - between [1, 4]
            if reward >= 1:
                new_reward += (new_health / 10000)

            # found time orb / crossed a floor
            if new_health > self._last_health:
                new_reward += 0.5

        return new_reward
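    # Worked illustration of the shaping above (hypothetical numbers): a raw reward of 0.1
    # (door / puzzle / key) becomes 0.5; crossing a floor with reward 1.0 and new_health 3000
    # adds 3000 / 10000 = 0.3; a health increase (time orb) adds another 0.5.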

    def log_worker_metrics(self, episode_reward, loss, step):
        with self.worker_summary_writer.as_default():
            with tf.name_scope('worker'):
                tf.summary.scalar('reward', episode_reward, step=step)
                tf.summary.scalar('loss', loss, step=step)
            self.worker_summary_writer.flush()

    def run(self):
        mem = Memory()
        ep_count = 0
        timestep = 0
        entropy_term = 0
        ep_reward = 0.
        ep_steps = 0
        ep_loss = 0.

        done = False
        obs = self.env.reset()
        state, self._last_keys, self._last_health, _ = obs

        while timestep <= self.timesteps:
            i = 0
            with tf.GradientTape() as tape:
                while i < self.batch_size:
                    # collect experience
                    # get action as per policy
                    state = tf.convert_to_tensor(state)
                    state = tf.expand_dims(state, axis=0)
                    action_probs, critic_value = self.local_model(
                        [state, float(self._last_health)], training=True)

                    entropy = -np.sum(action_probs * np.log(action_probs))
                    entropy_term += entropy

                    # choose most probable action
                    dist = tfp.distributions.Categorical(probs=action_probs,
                                                         dtype=tf.float32)
                    action_index = int(dist.sample().numpy())
                    action = self._action_lookup[action_index]

                    # perform action in game env
                    for _ in range(4):  # frame skipping; i (the batch counter) is advanced inside
                        obs, reward, done, _ = self.env.step(action)
                        state, new_keys, new_health, cur_floor = obs
                        reward = self.get_updated_reward(
                            reward, new_health, new_keys, done)
                        self._last_health = new_health
                        self._last_keys = new_keys
                        ep_reward += reward
                        ep_steps += 1
                        i += 1
                        timestep += 1

                    # store experience
                    mem.store(action_prob=tf.math.log(
                        action_probs[0, action_index]),
                              value=critic_value[0, 0],
                              reward=reward)

                    if done:
                        break

                # backpropagation
                total_loss = self.local_model.compute_loss(
                    mem, state, done, self.gamma, self.eps, entropy_term)
                ep_loss += total_loss
                Worker.global_steps += ep_steps

            grads = tape.gradient(total_loss,
                                  self.local_model.trainable_variables
                                  )  # calculate local gradients
            self.opt.apply_gradients(
                zip(grads, self.global_model.trainable_variables)
            )  # send local gradients to global model
            self.local_model.set_weights(self.global_model.get_weights(
            ))  # update local model with new weights
            mem.clear()

            if done:
                Worker.mean_reward = (Worker.mean_reward * Worker.episode_count
                                      + ep_reward) / (Worker.episode_count + 1)

                self.log_worker_metrics(ep_reward, ep_loss, ep_count)
                print(
                    "Episode: {} | Mean Reward: {:.3f} | Episode Reward: {:.3f} | Loss: {:.3f} | Steps: {} | Total Steps: {} | Worker: {}"
                    .format(Worker.episode_count, Worker.mean_reward,
                            ep_reward, ep_loss, ep_steps, Worker.global_steps,
                            self.worker_idx))
                self.result_queue.put((Worker.mean_reward, total_loss))
                Worker.episode_count += 1
                ep_count += 1

                obs = self.env.reset()
                state, _, _, _ = obs

                # use a lock to save local model and to print to prevent data races.
                if ep_reward > Worker.best_score:
                    with Worker.save_lock:
                        self.ac_manager.save()
                        print("Saved checkpoint for step {}".format(
                            int(self.ac_ckpt.step)))
                        self.ac_ckpt.step.assign_add(1)

                        keras.models.save_model(self.global_model,
                                                self.model_path)
                        print('\nSaved best model to: {}, episode score: {}\n'.
                              format(self.model_path, ep_reward))
                        Worker.best_score = ep_reward

                entropy_term = 0
                ep_reward = 0.
                ep_steps = 0
                ep_loss = 0.

        self.result_queue.put(None)
        self.env.close()
Example no. 29
TEST_STEPS = 2000
TRAINING_INTERVAL_STEPS = 10000
TOTAL_TRAINING_STEPS = 1e12
RESULTS_PATH = ("/home/home/Data/Carmen/py_workspace/ObstacleTower/results/" +
                datetime.now().strftime("%B-%d-%Y_%H_%M%p"))
TRAINING_NAME = "dqn_train_my_cnn"
AGENT_ALGORITHM = "DQN"  # DDPG, PPO2, TRPO, DQN
PRETRAINED_MODEL = "/home/home/Data/Carmen/py_workspace/ObstacleTower/results/July-30-2019_21_36PM_dqn_train_my_cnn/dqn_train_my_cnn_0000290000.pkl"
#PRETRAINED_MODEL = None
TEST_ONLY = True
NEURONAL_NETWORK = "CNN"  # CNN, MLP

config = {"visual-theme": 2, "dense-reward": 1}

env = ObstacleTowerEnv('./ObstacleTower/obstacletower',
                       retro=True,
                       realtime_mode=True,
                       config=config)
# env.seed(5)
input_shape = env.observation_space.shape
print("OBS", env.observation_space)
#print("OBS", env.observation_space[0])
print("INPUT SHAPE", input_shape)
env = DummyVecEnv([lambda: env])

# Create global experiments path
if not TEST_ONLY:
    global_path = RESULTS_PATH + "_" + TRAINING_NAME + "/"
else:
    global_path = RESULTS_PATH + "_" + TRAINING_NAME + "_test" + "/"

os.makedirs(global_path, exist_ok=True)
Example no. 30
            steps = []
    return episode_reward


def run_evaluation(env):
    while not env.done_grading():
        run_episode(env)
        env.reset()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('environment_filename',
                        default='./ObstacleTower/obstacletower',
                        nargs='?')
    parser.add_argument('--docker_training', action='store_true')
    parser.set_defaults(docker_training=False)
    args = parser.parse_args()
    env = ObstacleTowerEnv(args.environment_filename,
                           docker_training=args.docker_training,
                           realtime_mode=True)
    net = get_model()
    optimizer = tf.train.AdamOptimizer()
    if env.is_grading():
        episode_reward = run_evaluation(env)
    else:
        while True:
            episode_reward = run_episode(env)
            print("Episode reward: " + str(episode_reward))

    env.close()