def _init():
    env = ObstacleTowerEnv('./ObstacleTower/obstacletower',
                           worker_id=seed + cpu,
                           retro=True,
                           config={'total-floors': 10},
                           greyscale=True,
                           timeout_wait=600)
    env._flattener = ActionFlattener([2, 3, 2, 1])
    env._action_space = env._flattener.action_space
    env = Monitor(env, sub_dir)
    return env
def instantiate_environment(path, train, evaluate, eval_seeds=[1001]):
    env = None
    if train:
        env = ObstacleTowerEnv(path, worker_id=0, retro=False,
                               realtime_mode=False, greyscale=False,
                               config=train_env_reset_config)
    else:
        if evaluate:
            env = ObstacleTowerEnv(path, worker_id=0, retro=False,
                                   realtime_mode=False, greyscale=False,
                                   config=eval_env_reset_config)
            env = ObstacleTowerEvaluation(env, eval_seeds)
        else:
            # play a single game
            env = ObstacleTowerEnv(path, worker_id=0, retro=False,
                                   realtime_mode=True, greyscale=False,
                                   config=eval_env_reset_config)
    return env
def __init__(self,
             environment_filename=None,
             docker_training=False,
             worker_id=0,
             retro=False,
             timeout_wait=3000,
             realtime_mode=False,
             num_actions=3,
             stack_size=4,
             mobilenet=False,
             gray_scale=False,
             floor=0,
             visual_theme=0):
    '''
    Arguments:
      environment_filename: The file path to the Unity executable. Does not
        require the extension.
      docker_training: Whether this is running within a docker environment
        and should use a virtual frame buffer (xvfb).
      worker_id: The index of the worker in the case where multiple
        environments are running. Each environment reserves port
        (5005 + worker_id) for communication with the Unity executable.
      retro: Resize visual observation to 84x84 (int8) and flatten the
        action space.
      timeout_wait: Time for the Python interface to wait for the
        environment to connect.
      realtime_mode: Whether to render the environment window and run the
        environment at realtime speed.
    '''
    self._obstacle_tower_env = ObstacleTowerEnv(environment_filename,
                                                docker_training, worker_id,
                                                retro, timeout_wait,
                                                realtime_mode)
    if floor != 0:  # '!=' instead of 'is not': identity checks on int literals are a bug
        self._obstacle_tower_env.floor(floor)
    self.start_floor = floor
    self.current_floor = floor
    self.mobilenet = mobilenet
    self.gray_scale = gray_scale
    self.retro = retro
    if mobilenet:
        self.state_size = [1280]
    elif gray_scale:
        self.state_size = [84, 84, 1]
    elif retro:
        self.state_size = [84, 84, 3]
    else:
        self.state_size = [168, 168, 3]
    self.stack_size = stack_size
    self.stack = [np.random.random(self.state_size).astype(np.float32)
                  for _ in range(self.stack_size)]
    self.total_reward = 0
    self.current_reward = 0
    self.max_floor = 25
    self.visual_theme = visual_theme
    self.id = worker_id
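The docstring above notes that each environment instance reserves port 5005 + worker_id. A minimal sketch, assuming the standard ObstacleTowerEnv constructor and an illustrative executable path, of running two instances concurrently without a port clash:

from obstacle_tower_env import ObstacleTowerEnv

# Distinct worker_ids keep the two Unity processes on different ports:
# worker_id=0 -> port 5005, worker_id=1 -> port 5006.
env_a = ObstacleTowerEnv('./ObstacleTower/obstacletower', worker_id=0)
env_b = ObstacleTowerEnv('./ObstacleTower/obstacletower', worker_id=1)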
def make_env_all_params(rank, args):
    from time import sleep
    # Stagger start-up so concurrent workers do not race for ports.
    sleep_time = rank
    sleep_multiple = args.pause
    sleep(sleep_multiple * sleep_time)
    show_obs = rank == 0
    rank = args.port_offset + rank
    # handle port clashes
    if rank >= 35:
        rank += 1
    environment_path = args.environment_filename
    env = ObstacleTowerEnv(environment_path,
                           worker_id=rank,
                           timeout_wait=6000,
                           retro=True,
                           realtime_mode=False)
    if show_obs:
        env = RenderObservations(env)
    return env
def create_env(env_filename, custom=True, large=False, custom_reward=True,
               skip_frames=0, docker=False, realtime=False, random_aug=0.,
               worker_id=0, device='cpu'):
    if custom:
        env = CustomObstacleTowerEnv(env_filename,
                                     mode='retro_large' if large else 'retro',
                                     custom_reward=custom_reward,
                                     docker_training=docker,
                                     realtime_mode=realtime,
                                     worker_id=worker_id,
                                     timeout_wait=60)
    else:
        env = ObstacleTowerEnv(env_filename,
                               docker_training=docker,
                               realtime_mode=realtime,
                               worker_id=worker_id,
                               timeout_wait=60)
    if skip_frames > 1:
        env = SkipFrames(env, skip=skip_frames)
    if random_aug > 0.:
        env = ToTorchTensorsWithAug(env, device=device, aug_prob=random_aug)
    else:
        env = ToTorchTensors(env, device=device)
    return env
def _thunk():
    env = ObstacleTowerEnv(env_directory, worker_id=rank, realtime_mode=True)
    env = Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    return env
def make_obstacle_tower(worker_id=0):
    from obstacle_tower_env import ObstacleTowerEnv
    env = ObstacleTowerEnv(
        './obstacle-tower-env/ObstacleTower/ObstacleTower.x86_64',
        retro=True,
        realtime_mode=True,
        worker_id=worker_id)
    return env
def obstacle_test_env_factory(index_worker=0, rank=0, frame_skip=0,
                              frame_stack=1, realtime=False, min_floor=0,
                              max_floor=50, reduced_actions=True):
    """
    Create test Obstacle Tower Unity3D environment.
    Useful info_keywords: 'floor', 'start', 'seed'.

    Parameters
    ----------
    frame_skip : int
        Return only every `frame_skip`-th observation.
    frame_stack : int
        Observations composed of last `frame_stack` frames stacked.
    min_floor : int
        Minimum floor the agent can be spawned in.
    max_floor : int
        Maximum floor the agent can be spawned in.
    reduced_actions : bool
        Whether or not to use the action wrapper to reduce the number of
        available actions.

    Returns
    -------
    env : gym.Env
        Test environment.
    """
    if 'DISPLAY' not in os.environ.keys():
        os.environ['DISPLAY'] = ':0'
    exe = os.path.join(os.path.dirname(obstacle_tower_env.__file__),
                       'ObstacleTower/obstacletower')
    env = ObstacleTowerEnv(environment_filename=exe,
                           retro=True,
                           worker_id=index_worker + rank + np.random.randint(1, 10000),
                           greyscale=False,
                           docker_training=False,
                           realtime_mode=realtime)
    if reduced_actions:
        env = ReducedActionEnv(env)
    env = BasicObstacleEnvTest(env, max_floor=max_floor, min_floor=min_floor)
    if frame_skip > 0:
        env = FrameSkip(env, skip=frame_skip)
    if frame_stack > 1:
        env = FrameStack(env, k=frame_stack)
    return env
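A hedged usage sketch of the factory above; the floor range, skip, and stack values are illustrative, not from the source:

# Evaluate on floors 0-10 with a 2-step frame skip and 4-frame stacking;
# the factory randomizes the worker port internally.
env = obstacle_test_env_factory(frame_skip=2, frame_stack=4,
                                min_floor=0, max_floor=10,
                                reduced_actions=True)
obs = env.reset()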
def __init__(self, environment_filename=None, docker_training=False,
             worker_id=0, retro=False, timeout_wait=30, realtime_mode=False,
             num_actions=3, mobilenet=False, gray_scale=False,
             autoencoder=None, floor=0):
    '''
    Arguments:
      environment_filename: The file path to the Unity executable. Does not
        require the extension.
      docker_training: Whether this is running within a docker environment
        and should use a virtual frame buffer (xvfb).
      worker_id: The index of the worker in the case where multiple
        environments are running. Each environment reserves port
        (5005 + worker_id) for communication with the Unity executable.
      retro: Resize visual observation to 84x84 (int8) and flatten the
        action space.
      timeout_wait: Time for the Python interface to wait for the
        environment to connect.
      realtime_mode: Whether to render the environment window and run the
        environment at realtime speed.
    '''
    self._obstacle_tower_env = ObstacleTowerEnv(environment_filename,
                                                docker_training, worker_id,
                                                retro, timeout_wait,
                                                realtime_mode)
    if floor != 0:
        self._obstacle_tower_env.floor(floor)
    self._flattener = ActionFlattener([3, 3, 2, 3])
    self._action_space = self._flattener.action_space
    self.mobilenet = mobilenet
    self.gray_scale = gray_scale
    if mobilenet:
        self.image_module = WrappedKerasLayer(retro, self.mobilenet)
    self._done = False
    if autoencoder:
        print("Loading autoencoder from {}".format(autoencoder))
        self.autoencoder = build_autoencoder(autoencoder)
        print("Done.")
    else:
        self.autoencoder = None
def make_env(env_id, rank, env_filename='./ObstacleTower/obstacletower',
             docker_training=False, realtime=False):
    env = ObstacleTowerEnv(env_filename,
                           docker_training=docker_training,
                           worker_id=rank,
                           realtime_mode=realtime)
    return env
def create_otc_environment(environment_path=None):
    """Wraps an Obstacle Tower Gym environment with some basic preprocessing.

    Returns:
      An Obstacle Tower environment with some standard preprocessing.
    """
    assert environment_path is not None
    env = ObstacleTowerEnv(environment_path, 0, retro=True)
    env = OTCPreprocessing(env)
    return env
def create_otc_environment(environment_path=None, docker_training=False):
    """Wraps an Obstacle Tower Gym environment with some basic preprocessing.

    Returns:
      An Obstacle Tower environment with some standard preprocessing.
    """
    assert environment_path is not None
    # config = {'agent-perspective': 0}  # optional reset config, currently unused
    env = ObstacleTowerEnv(environment_path, retro=True, realtime_mode=False)
    env = OTCPreprocessing(env)
    return env
def testing(self):
    from keepitpossible.common import action_table
    self.table_action = action_table.create_action_table()
    self.MODEL.load()
    done = False
    reward = 0.0
    env = ObstacleTowerEnv(environment_filename=self.SCHEDULE.ENV_PATH,
                           worker_id=self.SCHEDULE.N_WORKER + 1,
                           retro=False,
                           realtime_mode=True)
    obs = env.reset()
    previous_preprocessed_observation_image = obs[0]
    while not done:
        action = self.MODEL.choose_action(
            previous_preprocessed_observation_image)
        # Take the action; get the scene observation, floors cleared,
        # and agent info.
        for _ in self.table_action[int(action)]:
            observation, reward, done, info = env.step(_)
            print("Action_Chose: ", action,
                  "Action: ", _,
                  " Reward: ", reward)
            if done:
                break
        # Preprocess the data the model needs.
        observation_image, keys, time_remaining = observation
        preprocessed_observation_image = observation_image
        previous_preprocessed_observation_image = preprocessed_observation_image
    env.close()
def __init__(self, result_queue, idx, save_dir, params):
    super(Worker, self).__init__()
    self.result_queue = result_queue
    self.worker_idx = idx
    self.save_dir = save_dir
    self.model_path = os.path.join(self.save_dir, 'model_a3c')
    self.env = ObstacleTowerEnv(params['env_path'],
                                worker_id=self.worker_idx,
                                retro=False,
                                realtime_mode=False,
                                greyscale=False,
                                config=train_env_reset_config)
    self.action_size = params['action_size']
    self._action_lookup = params['action_lookup']
    self.input_shape = self.env.observation_space[0].shape  # (84, 84, 3)
    self._last_health = 99999.
    self._last_keys = 0
    self.global_model = params['global_model']
    # self.local_model = CNN(self.action_size, self.input_shape)
    self.local_model = CnnGru(self.action_size, self.input_shape)
    self.ac_ckpt = params['ckpt']
    self.ac_manager = params['ckpt_mgr']
    self.current_time = params['log_timestamp']
    train_log_dir = './logs/' + self.current_time + '/worker_' + str(self.worker_idx)
    self.worker_summary_writer = tf.summary.create_file_writer(train_log_dir)
    self.timesteps = params['timesteps']
    self.batch_size = params['batch_size']
    self.gamma = params['gamma']
    self.lr = params['lr']
    self.opt = params['optimizer']
    self.eps = np.finfo(np.float32).eps.item()
def __init__(self, envpath, wid, retro, realtime_mode, env_seed=0, env_floor=0):
    self.wid = wid
    self.env = ObstacleTowerEnv(environment_filename=envpath,
                                worker_id=wid,
                                retro=retro,
                                realtime_mode=realtime_mode)
    self.kprun = GLOBAL_KPRUN
    self.tableAction = self.createActionTable()
    # Configure the level (seed and starting floor).
    self.env_seed = env_seed
    self.env_floor = env_floor
    self.step = 0
    self.summary = tf.Summary(value=[
        tf.Summary.Value(tag="Stage_reward " + str(self.wid), simple_value=0)
    ])
    self.kprun.train_writer.add_summary(self.summary, 0)
def create_otc_environment(environment_filename=None, docker_training=False,
                           worker_id=0, retro=True, timeout_wait=30,
                           realtime_mode=False):
    env = ObstacleTowerEnv(environment_filename=environment_filename,
                           docker_training=docker_training,
                           worker_id=worker_id,
                           retro=retro,
                           timeout_wait=timeout_wait,
                           realtime_mode=realtime_mode)
    env = OtcPreprocessing(env)
    return env
def parse_unity_environment(env_name):
    '''
    Generates a regym.environments.Task by creating a Unity environment
    (mlagents-envs) and extracting data from it.

    :param env_name: Path to Unity executable
    :returns: Task created from :param: env_name
    '''
    if 'obstacletower' not in env_name:
        raise ValueError('Only the obstacletower environment is currently supported')
    from obstacle_tower_env import ObstacleTowerEnv
    # retro=True mode creates an 84x84 (Box) image observation space.
    env = ObstacleTowerEnv(env_name, retro=True, realtime_mode=False)
    return parse_gym_environment(env)
def _thunk():
    env = ObstacleTowerEnv('../ObstacleTower/obstacletower',
                           retro=True,
                           worker_id=rank,
                           realtime_mode=show,
                           config={'total-floors': 20})
    env.seed(seed + rank % 8)
    env = bench.Monitor(env, None, allow_early_resets=True)
    env = OTWrapper(env)
    env = FrameStack(env, 4)
    return env
def main():
    if len(sys.argv) != 2:
        sys.stderr.write('Usage: python record_tail.py <start_floor>\n')
        sys.exit(1)
    start_floor = int(sys.argv[1])
    viewer = EnvInteractor()
    env = ObstacleTowerEnv(os.environ['OBS_TOWER_PATH'],
                           worker_id=random.randrange(11, 20))
    while True:
        seed = select_seed(floor=start_floor)
        env.seed(seed)
        env.floor(start_floor)
        obs = env.reset()
        viewer.reset()
        record_episode(seed, env, viewer, obs, max_steps=MAX_STEPS)
def __init__(self, env_path, train, evaluate, policy_name='CnnPolicy',
             save_dir='./model_files/', eval_seeds=[], reduced_action=False):
    self.save_dir = save_dir
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    self.model_path = os.path.join(self.save_dir, 'model_stable_ppo')
    self.log_dir = './logs/stable_ppo'
    self.policy_name = policy_name
    self.evaluate = evaluate
    print(env_path)
    if reduced_action:
        from obstacle_tower_env import ObstacleTowerEnv, ObstacleTowerEvaluation
        from models.common.constants import train_env_reset_config_industrial as train_env_reset_config
        from models.common.constants import eval_env_reset_config_industrial as eval_env_reset_config
    else:
        from models.stable_baselines.reduced_action_env import ObstacleTowerEnv, ObstacleTowerEvaluation
        from models.common.constants import train_env_reset_config, eval_env_reset_config
    if train:
        self.env = ObstacleTowerEnv(env_path, worker_id=0, retro=True,
                                    realtime_mode=False,
                                    config=train_env_reset_config)
    else:
        if evaluate:
            self.env = ObstacleTowerEnv(env_path, worker_id=0, retro=True,
                                        realtime_mode=False,
                                        config=eval_env_reset_config)
            self.env = ObstacleTowerEvaluation(self.env, eval_seeds)
        else:
            self.env = ObstacleTowerEnv(env_path, worker_id=0, retro=True,
                                        realtime_mode=True,
                                        config=eval_env_reset_config)
def __init__(self, params):
    # Gives common variables to all environments
    super().__init__()
    try:
        from obstacle_tower_env import ObstacleTowerEnv
    except ImportError:
        print("Failed to import ObstacleTowerEnv, make sure you have "
              "Obstacle Tower installed!")
    # Handle parameters
    env_name = params['env_name']
    # Create gym instance; store it on self, since self.env is used below
    self.env = ObstacleTowerEnv('./ObstacleTower/obstacletower', retro=False)
    self.action_space = self.env.action_space
    self.observation_space = self.env.observation_space
    # Store for later
    self.disc = gym.spaces.discrete.Discrete
    # Define header
    # TODO: Check all OpenAI Gym envs to see if the action space works the same.
    # Work out num_classes based on action_space type
    if type(self.observation_space) == self.disc:
        self.out = [self.observation_space.n]
    else:
        self.out = list(self.observation_space.shape)
    self.header = header(env_name=env_name,
                         input_dim=self.action_space.n,
                         output_dim=self.out,
                         num_classes=2,
                         info="",
                         env_min_score=0.0,
                         env_max_score=200.0,
                         rl=True)
def seed_hashes():
    mapping = {}
    while len(mapping) < 100:
        if os.path.exists('UnitySDK.log'):
            os.remove('UnitySDK.log')
        while True:
            try:
                env = ObstacleTowerEnv(os.environ['OBS_TOWER_PATH'],
                                       worker_id=random.randrange(1000))
                break
            except KeyboardInterrupt:
                sys.exit(1)
            except:
                pass
        env.seed(25)  # arbitrary argument
        obs = env.reset()
        env.close()
        with open('UnitySDK.log') as f:
            contents = next(l for l in f.readlines() if 'seed:' in l)
        seed = int(contents.split(': ')[-1])
        key = str(obs.flatten().tolist())
        # record the pair so the loop terminates after 100 unique hashes
        mapping[key] = seed
        yield key, seed
    return mapping
def __init__(self, env_path, train, evaluate, policy_name='CnnPolicy',
             save_dir='./model_files/', eval_seeds=[]):
    self.save_dir = save_dir
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    self.model_path = os.path.join(self.save_dir, 'model_stable_a2c')
    self.log_dir = './logs/stable_a2c'
    self.policy_name = policy_name
    self.evaluate = evaluate
    if train:
        self.env = ObstacleTowerEnv(env_path, worker_id=0, retro=True,
                                    realtime_mode=False,
                                    config=train_env_reset_config)
    else:
        if evaluate:
            self.env = ObstacleTowerEnv(env_path, worker_id=0, retro=True,
                                        realtime_mode=False,
                                        config=eval_env_reset_config)
            self.env = ObstacleTowerEvaluation(self.env, eval_seeds)
        else:
            self.env = ObstacleTowerEnv(env_path, worker_id=0, retro=True,
                                        realtime_mode=True,
                                        config=eval_env_reset_config)
def __init__(self, env_path, train=False, evaluate=False, eval_seeds=[],
             max_eps=100, save_dir=None, plot=False):
    if train:
        self.env = ObstacleTowerEnv(env_path, worker_id=0, retro=False,
                                    realtime_mode=False,
                                    config=train_env_reset_config)
    else:
        if evaluate:
            self.env = ObstacleTowerEnv(env_path, worker_id=0, retro=False,
                                        realtime_mode=False,
                                        config=eval_env_reset_config)
            self.env = ObstacleTowerEvaluation(self.env, eval_seeds)
        else:
            self.env = ObstacleTowerEnv(env_path, worker_id=0, retro=False,
                                        realtime_mode=True,
                                        config=eval_env_reset_config)
    self.max_episodes = max_eps
    self.global_moving_average_reward = 0
    self.save_dir = save_dir
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    self.plot = plot
    self.res_queue = Queue()
import os

from obstacle_tower_env import ObstacleTowerEnv

env = ObstacleTowerEnv(os.environ['OBS_TOWER_PATH'], worker_id=0)
env.seed(72)
env.floor(12)
env.reset()
# Replay a recorded action sequence; the tail of the list is truncated
# in the source.
for action in [18, 18, 18, 18, 18, 18, 30, 24, 24, 21, 18, 18, 30, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 24, 18, 30, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 30, 30, 30, 30, 24, 24, 6, 6, 6, 6, 6, 6, 6, 6, 30, 30, 30, 30, 30, 18, 24, 24, 24, 6, 6, 6, 6, 6, 6, 24, 18, 24, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 6, 6, 6, 6, 24, 24, 24, 18, 30, 18, 18, 30, 18, 30, 30, 18, 18, 18, 18, 18, 18, 18, 18, 30, 24, 24, 30, 30, 24, 24, 24, 30, 30, 30, 30, 30, 18, 18, 18, 18, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 24, 24, 24, 24, 24, 24, 24, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 24, 18, 18, 30, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 24, 18, 30, 18, 18, 18, 18, 30, 30, 30, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 30, 18, 18, 30, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 30, 24, 24, 24, 24, 24, 24, 24, 24, 18, 30, 18, 18, 18, 18, 30, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 30, 30, 30, 30, 30, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 30, 24, 21, 18, 24, 24, 24, 24, 18, 18, 18, 24, 18, 18, 18, 18, 30, 18, 18, 24, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 24, 24, 24, 24, 24, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 30, 30, 30, 18, 18, 30, 30, 30, 30, 30, 30, 12, 12, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 18, 18, 18, 18, 18, 18, 18, 18,
               # (action sequence truncated in source)
               ]:
    env.step(action)
from obstacle_tower_env import ObstacleTowerEnv
import numpy as np
from matplotlib import pyplot as plt

# Realtime mode determines whether the environment window will render the
# scene, as well as whether the environment will run at realtime speed. Set
# this to `True` to visualize the agent behavior as you would in player mode.
env = ObstacleTowerEnv('./ObstacleTower/obstacletower', retro=False,
                       realtime_mode=True)

# The environment provided has a MultiDiscrete action space, where the 4
# dimensions are:
# 0. Movement (No-Op/Forward/Back)
# 1. Camera Rotation (No-Op/Counter-Clockwise/Clockwise)
# 2. Jump (No-Op/Jump)
# 3. Movement (No-Op/Right/Left)
print(env.action_space.nvec)
print(env.observation_space)

# plt.imshow(obs[0])
# plt.show()
# print(env.unwrapped.get_action_meanings())

# tower 0, floor 10 = second room holds key
config = {'tower-seed': 0,
          'starting-floor': 10,
          'agent-perspective': 0,
          'allowed-rooms': 1,
          'allowed-modules': 0,
          'allowed-floors': 0}
obs = env.reset(config=config)
action = env.action_space.sample()
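Per the four action dimensions listed in the comments above, a non-retro action is a length-4 vector. A minimal sketch of stepping with an explicit "move forward while rotating the camera clockwise" action; the index values follow the comment's ordering:

import numpy as np

# [movement=forward, camera=clockwise, jump=no-op, lateral=no-op]
obs, reward, done, info = env.step(np.array([1, 2, 0, 0]))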
def main():
    # Load parse parameters
    parser = otc_arg_parser()
    args = parser.parse_args()

    # Challenge environment
    # if args.env == 'ObtRetro-v6':
    #     env = ObstacleTowerEnv(
    #         '/home/home/Data/Carmen/py_workspace/ObstacleTower_v3/ObstacleTower-v3.1/obstacletower.x86_64',
    #         timeout_wait=6000,
    #         retro=args.retro,
    #         realtime_mode=args.test)
    #     env = RetroWrapper(env, args.sample_normal)
    #     env = OTCPreprocessing(env, args.action_reduction)
    #     # if show_obs:
    #     #     env = RenderObservations(env)
    #     #     env = KeyboardControlWrapper(env)
    # else:
    env = ObstacleTowerEnv(
        '/home/home/Data/Carmen/py_workspace/ObstacleTower_v3/ObstacleTower-v3.1/obstacletower.x86_64',
        retro=args.retro,
        realtime_mode=args.test,
        timeout_wait=6000)
    # env = ObstacleTowerEnv('OBSTACLE_TOWER_PATH', retro=args.retro,
    #                        realtime_mode=args.test, timeout_wait=6000)

    # Dict of actions created by the ObstacleTowerEnv class of the
    # obstacle_tower_env library
    # print("ACTIONS:", env._flattener.action_lookup)
    print('FEATURES :', args.features)

    # Preprocess the environment (grayscale and action space reduction)
    env = OTCPreprocessing(env, args.action_reduction, args.features)
    env = DummyVecEnv([lambda: env])
    # env = VecEnv(1, env.observation_space, env.action_space)
    print("ACTION SPACE ///////////:", env.action_space)
    print("OBSERVATION SPACE ///////////////:", env.observation_space)
    # env = make_vec_env(env, n_envs=4)

    ######## Training ########
    # Study of the impact of different values of the PPO params
    if args.study:
        params_test(MlpPolicy, env)
    # If no Study Mode
    else:
        # If no Test Mode
        if not args.test:
            random.seed(0)
            seed = 0  # random.seed() returns None; pass the integer seed itself
            if args.pretrained_model:
                t = 300000
                model = PPO2.load(args.pretrained_model, env=env,
                                  tensorboard_log=args.tensorboard_logdir)
            else:
                t = 0
                # If the Generalized Advantage Estimator is used
                if args.use_gae:
                    model = PPO2(MlpPolicy, env,
                                 n_steps=args.num_steps,
                                 verbose=1,
                                 tensorboard_log=args.tensorboard_logdir,
                                 cliprange=args.clip_param,
                                 learning_rate=args.lr,
                                 ent_coef=args.entropy_coef,
                                 vf_coef=args.value_loss_coef,
                                 max_grad_norm=args.max_grad_norm,
                                 gamma=args.gamma,
                                 lam=args.gae_lambda,
                                 noptepochs=args.ppo_epoch,
                                 seed=seed)
                # If the Generalized Advantage Estimator is not used
                else:
                    model = PPO2(MlpPolicy, env,
                                 n_steps=args.num_steps,
                                 verbose=1,
                                 tensorboard_log=args.tensorboard_logdir,
                                 cliprange=args.clip_param,
                                 learning_rate=args.lr,
                                 ent_coef=args.entropy_coef,
                                 vf_coef=args.value_loss_coef,
                                 max_grad_norm=args.max_grad_norm,
                                 gamma=args.gamma,
                                 noptepochs=args.ppo_epoch,
                                 seed=seed)
        else:
            model = PPO2.load(args.pretrained_model, env=env)
        # model.learn(total_timesteps=50000)
        # model.save("ObstacleTower_prueba")

        filename = 'argsparams.txt'
        os.makedirs(args.results_dir, exist_ok=True)
        myfile = open(args.results_dir + filename, 'a')
        myfile.write(
            'clip range: %f \n learning rate: %f \n entropy coefficient: %f \n '
            'value loss coefficient: %f \n max gradient norm: %f \n gamma: %f \n '
            'ppo epochs: %f \n' %
            (args.clip_param, args.lr, args.entropy_coef, args.value_loss_coef,
             args.max_grad_norm, args.gamma, args.ppo_epoch))
        myfile.close()

        if not args.test:
            while t < args.num_env_steps:
                # Train model
                if t == 0:
                    model.learn(total_timesteps=args.eval_interval)
                else:
                    model.learn(total_timesteps=args.eval_interval,
                                reset_num_timesteps=False)
                os.makedirs(GLOBAL_PATH, exist_ok=True)
                print("Saving in '" + GLOBAL_PATH + "'")
                model.save(GLOBAL_PATH + args.training_name + "_" +
                           str(int(t)).zfill(10))
                # Test
                avg_reward, avg_floor = test(t, model, env=env,
                                             global_path=args.results_dir)
                log('T = ' + str(t) + ' / ' + str(args.num_env_steps) +
                    ' | Avg. reward: ' + str(avg_reward) +
                    ' | Avg. floor: ' + str(avg_floor))
                t += args.eval_interval
        else:
            obs = env.reset()
            t = 0
            while t < args.num_env_steps:
                action, _states = model.predict(obs)
                obs, rewards, done, info = env.step(action)
                # print('action :', info)
                env.render('rgb_array')
class Worker(threading.Thread):
    episode_count = 0
    mean_reward = 0
    best_score = 0
    global_steps = 0
    save_lock = threading.Lock()

    def __init__(self, result_queue, idx, save_dir, params):
        super(Worker, self).__init__()
        self.result_queue = result_queue
        self.worker_idx = idx
        self.save_dir = save_dir
        self.model_path = os.path.join(self.save_dir, 'model_a3c')
        self.env = ObstacleTowerEnv(params['env_path'],
                                    worker_id=self.worker_idx,
                                    retro=False,
                                    realtime_mode=False,
                                    greyscale=False,
                                    config=train_env_reset_config)
        self.action_size = params['action_size']
        self._action_lookup = params['action_lookup']
        self.input_shape = self.env.observation_space[0].shape  # (84, 84, 3)
        self._last_health = 99999.
        self._last_keys = 0
        self.global_model = params['global_model']
        # self.local_model = CNN(self.action_size, self.input_shape)
        self.local_model = CnnGru(self.action_size, self.input_shape)
        self.ac_ckpt = params['ckpt']
        self.ac_manager = params['ckpt_mgr']
        self.current_time = params['log_timestamp']
        train_log_dir = './logs/' + self.current_time + '/worker_' + str(self.worker_idx)
        self.worker_summary_writer = tf.summary.create_file_writer(train_log_dir)
        self.timesteps = params['timesteps']
        self.batch_size = params['batch_size']
        self.gamma = params['gamma']
        self.lr = params['lr']
        self.opt = params['optimizer']
        self.eps = np.finfo(np.float32).eps.item()

    def get_updated_reward(self, reward, new_health, new_keys, done):
        new_health = float(new_health)
        new_reward = 0.0
        if done:
            # reset params when the game is terminated
            self._last_health = 99999.
            self._last_keys = 0
        else:
            # opened a door, solved a puzzle, picked up a key
            if 0.1 <= reward < 1:
                new_reward += 0.5
            # crossing a floor - between [1, 4]
            if reward >= 1:
                new_reward += (new_health / 10000)
            # found time orb / crossed a floor
            if new_health > self._last_health:
                new_reward += 0.5
        return new_reward

    def log_worker_metrics(self, episode_reward, loss, step):
        with self.worker_summary_writer.as_default():
            with tf.name_scope('worker'):
                tf.summary.scalar('reward', episode_reward, step=step)
                tf.summary.scalar('loss', loss, step=step)
            self.worker_summary_writer.flush()

    def run(self):
        mem = Memory()
        ep_count = 0
        timestep = 0
        entropy_term = 0
        ep_reward = 0.
        ep_steps = 0
        ep_loss = 0.
        done = False
        obs = self.env.reset()
        state, self._last_keys, self._last_health, _ = obs

        while timestep <= self.timesteps:
            i = 0
            with tf.GradientTape() as tape:
                while i < self.batch_size:
                    # collect experience: get action as per policy
                    state = tf.convert_to_tensor(state)
                    state = tf.expand_dims(state, axis=0)
                    action_probs, critic_value = self.local_model(
                        [state, float(self._last_health)], training=True)
                    entropy = -np.sum(action_probs * np.log(action_probs))
                    entropy_term += entropy

                    # choose most probable action
                    dist = tfp.distributions.Categorical(probs=action_probs,
                                                         dtype=tf.float32)
                    action_index = int(dist.sample().numpy())
                    action = self._action_lookup[action_index]

                    # perform action in game env; a separate loop variable
                    # keeps the batch counter `i` from being clobbered
                    for _frame in range(4):  # frame skipping
                        obs, reward, done, _ = self.env.step(action)
                        state, new_keys, new_health, cur_floor = obs
                        reward = self.get_updated_reward(reward, new_health,
                                                         new_keys, done)
                        self._last_health = new_health
                        self._last_keys = new_keys
                        ep_reward += reward
                        ep_steps += 1
                        i += 1
                        timestep += 1
                        # store experience
                        mem.store(action_prob=tf.math.log(action_probs[0, action_index]),
                                  value=critic_value[0, 0],
                                  reward=reward)
                        if done:
                            break

                # backpropagation
                total_loss = self.local_model.compute_loss(
                    mem, state, done, self.gamma, self.eps, entropy_term)
                ep_loss += total_loss

            Worker.global_steps += ep_steps
            # calculate local gradients
            grads = tape.gradient(total_loss,
                                  self.local_model.trainable_variables)
            # send local gradients to the global model
            self.opt.apply_gradients(
                zip(grads, self.global_model.trainable_variables))
            # update the local model with the new global weights
            self.local_model.set_weights(self.global_model.get_weights())
            mem.clear()

            if done:
                Worker.mean_reward = (Worker.mean_reward * Worker.episode_count
                                      + ep_reward) / (Worker.episode_count + 1)
                self.log_worker_metrics(ep_reward, ep_loss, ep_count)
                print("Episode: {} | Mean Reward: {:.3f} | Episode Reward: {:.3f} | "
                      "Loss: {:.3f} | Steps: {} | Total Steps: {} | Worker: {}".format(
                          Worker.episode_count, Worker.mean_reward, ep_reward,
                          ep_loss, ep_steps, Worker.global_steps,
                          self.worker_idx))
                self.result_queue.put((Worker.mean_reward, total_loss))
                Worker.episode_count += 1
                ep_count += 1
                obs = self.env.reset()
                state, _, _, _ = obs

                # use a lock to save the local model and to print,
                # to prevent data races
                if ep_reward > Worker.best_score:
                    with Worker.save_lock:
                        self.ac_manager.save()
                        print("Saved checkpoint for step {}".format(
                            int(self.ac_ckpt.step)))
                        self.ac_ckpt.step.assign_add(1)
                        keras.models.save_model(self.global_model, self.model_path)
                        print('\nSaved best model to: {}, episode score: {}\n'.format(
                            self.model_path, ep_reward))
                        Worker.best_score = ep_reward

                entropy_term = 0
                ep_reward = 0.
                ep_steps = 0
                ep_loss = 0.

        self.result_queue.put(None)
        self.env.close()
TEST_STEPS = 2000
TRAINING_INTERVAL_STEPS = 10000
TOTAL_TRAINING_STEPS = 1e12
RESULTS_PATH = ("/home/home/Data/Carmen/py_workspace/ObstacleTower/results/" +
                datetime.now().strftime("%B-%d-%Y_%H_%M%p"))
TRAINING_NAME = "dqn_train_my_cnn"
AGENT_ALGORITHM = "DQN"  # DDPG, PPO2, TRPO, DQN
PRETRAINED_MODEL = "/home/home/Data/Carmen/py_workspace/ObstacleTower/results/July-30-2019_21_36PM_dqn_train_my_cnn/dqn_train_my_cnn_0000290000.pkl"
# PRETRAINED_MODEL = None
TEST_ONLY = True
NEURONAL_NETWORK = "CNN"  # CNN, MLP

config = {"visual-theme": 2, "dense-reward": 1}
env = ObstacleTowerEnv('./ObstacleTower/obstacletower',
                       retro=True,
                       realtime_mode=True,
                       config=config)
# env.seed(5)
input_shape = env.observation_space.shape
print("OBS", env.observation_space)
# print("OBS", env.observation_space[0])
print("INPUT SHAPE", input_shape)
env = DummyVecEnv([lambda: env])

# Create global experiments path
if not TEST_ONLY:
    global_path = RESULTS_PATH + "_" + TRAINING_NAME + "/"
else:
    global_path = RESULTS_PATH + "_" + TRAINING_NAME + "_test" + "/"
os.makedirs(global_path, exist_ok=True)
    # ... (beginning of run_episode is truncated in the source)
    steps = []
    return episode_reward


def run_evaluation(env):
    while not env.done_grading():
        run_episode(env)
        env.reset()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('environment_filename',
                        default='./ObstacleTower/obstacletower',
                        nargs='?')
    parser.add_argument('--docker_training', action='store_true')
    parser.set_defaults(docker_training=False)
    args = parser.parse_args()

    env = ObstacleTowerEnv(args.environment_filename,
                           docker_training=args.docker_training,
                           realtime_mode=True)
    net = get_model()
    optimizer = tf.train.AdamOptimizer()

    if env.is_grading():
        episode_reward = run_evaluation(env)
    else:
        while True:
            episode_reward = run_episode(env)
            print("Episode reward: " + str(episode_reward))

    env.close()