def _thunk():
    env = ObstacleTowerEnv('../ObstacleTower/obstacletower', retro=True,
                           worker_id=rank, realtime_mode=show,
                           config={'total-floors': 20})
    env.seed(seed + rank % 8)
    env = bench.Monitor(env, None, allow_early_resets=True)
    env = OTWrapper(env)
    env = FrameStack(env, 4)
    return env
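This thunk pattern exists so a vectorized environment can construct one simulator per subprocess. A minimal sketch of how such thunks are typically consumed, assuming OpenAI baselines' SubprocVecEnv and an illustrative make_env factory (the rank and seed handling mirror the example above):

from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
from obstacle_tower_env import ObstacleTowerEnv

def make_env(rank, seed=0):
    # Hypothetical factory: close over rank so every subprocess gets a
    # distinct worker_id (two Obstacle Tower instances cannot share one).
    def _thunk():
        env = ObstacleTowerEnv('../ObstacleTower/obstacletower', retro=True,
                               worker_id=rank)
        env.seed(seed + rank % 8)
        return env
    return _thunk

# Eight environments stepped in parallel, one per subprocess.
venv = SubprocVecEnv([make_env(rank) for rank in range(8)])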
def main():
    if len(sys.argv) != 2:
        sys.stderr.write('Usage: python record_tail.py <start_floor>\n')
        sys.exit(1)
    start_floor = int(sys.argv[1])
    viewer = EnvInteractor()
    env = ObstacleTowerEnv(os.environ['OBS_TOWER_PATH'],
                           worker_id=random.randrange(11, 20))
    while True:
        seed = select_seed(floor=start_floor)
        env.seed(seed)
        env.floor(start_floor)
        obs = env.reset()
        viewer.reset()
        record_episode(seed, env, viewer, obs, max_steps=MAX_STEPS)
def seed_hashes():
    mapping = {}
    while len(mapping) < 100:
        if os.path.exists('UnitySDK.log'):
            os.remove('UnitySDK.log')
        while True:
            try:
                env = ObstacleTowerEnv(os.environ['OBS_TOWER_PATH'],
                                       worker_id=random.randrange(1000))
                break
            except KeyboardInterrupt:
                sys.exit(1)
            except Exception:
                # Environment startup is flaky; retry with a new worker_id.
                pass
        env.seed(25)  # arbitrary seed value
        obs = env.reset()
        env.close()
        # The environment logs the seed it actually used; parse it back out.
        with open('UnitySDK.log') as f:
            contents = next(l for l in f.readlines() if 'seed:' in l)
        seed = int(contents.split(': ')[-1])
        key = str(obs.flatten().tolist())
        mapping[key] = seed  # record the pair so the loop can terminate
        yield key, seed
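A hypothetical driver for seed_hashes(), draining the generator and persisting the observation-hash to seed pairs (the JSON output and filename are assumptions, not part of the original script):

import json

if __name__ == '__main__':
    # Collect the first 100 (observation-hash, seed) pairs.
    mapping = dict(seed_hashes())
    with open('seed_hashes.json', 'w') as out_file:  # assumed filename
        json.dump(mapping, out_file)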
def main():
    if len(sys.argv) != 2:
        sys.stderr.write('Usage: record_improve.py <recording_path>\n')
        sys.exit(1)
    rec = Recording(sys.argv[1])
    env = ObstacleTowerEnv(os.environ['OBS_TOWER_PATH'],
                           worker_id=random.randrange(11, 20))
    try:
        env.seed(rec.seed)
        if rec.floor:
            env.floor(rec.floor)
        env.reset()
        for i, (action, rew) in enumerate(zip(rec.actions, rec.rewards)):
            _, real_rew, done, _ = env.step(action)
            if not np.allclose(real_rew, rew):
                print('mismatching result at step %d' % i)
                sys.exit(1)
            if done != (i == rec.num_steps - 1):
                print('invalid done result at step %d' % i)
                sys.exit(1)
        print('match succeeded')
    finally:
        env.close()
import os

from obstacle_tower_env import ObstacleTowerEnv

env = ObstacleTowerEnv(os.environ['OBS_TOWER_PATH'], worker_id=0)
env.seed(72)
env.floor(12)
env.reset()

# Replay a recorded action sequence on floor 12 of seed 72.
for action in [
        18, 18, 18, 18, 18, 18, 30, 24, 24, 21, 18, 18, 30, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 24,
        18, 30, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 30, 30, 30, 30,
        24, 24, 6, 6, 6, 6, 6, 6, 6, 6, 30, 30, 30, 30, 30, 18,
        24, 24, 24, 6, 6, 6, 6, 6, 6, 24, 18, 24, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 6, 6, 6, 6, 24, 24, 24, 18, 30, 18,
        18, 30, 18, 30, 30, 18, 18, 18, 18, 18, 18, 18, 18, 30, 24, 24,
        30, 30, 24, 24, 24, 30, 30, 30, 30, 30, 18, 18, 18, 18, 30, 30,
        30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 24, 24, 24, 24,
        24, 24, 24, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 24, 18,
        18, 30, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 24, 18, 30, 18,
        18, 18, 18, 30, 30, 30, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 30, 18, 18, 30, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 30, 24, 24, 24, 24, 24, 24, 24, 24, 18, 30, 18, 18,
        18, 18, 30, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 30, 30, 30, 30, 30, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 30, 24, 21, 18, 24, 24, 24, 24, 18,
        18, 18, 24, 18, 18, 18, 18, 30, 18, 18, 24, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 18, 24, 24, 24, 24, 24, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 30, 30, 30,
        18, 18, 30, 30, 30, 30, 30, 30, 12, 12, 30, 30, 30, 30, 30, 30,
        30, 30, 30, 30, 30, 30, 30, 30, 30, 18, 18, 18, 18, 18, 18, 18,
        18,
        # ... (the remainder of the recorded actions is truncated)
        ]:
    env.step(action)
class Worker(object):
    def __init__(self, envpath, wid, retro, realtime_mode,
                 env_seed=0, env_floor=0):
        self.wid = wid
        self.env = ObstacleTowerEnv(environment_filename=envpath,
                                    worker_id=wid,
                                    retro=retro,
                                    realtime_mode=realtime_mode)
        self.kprun = GLOBAL_KPRUN
        self.tableAction = self.createActionTable()
        # Configure the level.
        self.env_seed = env_seed
        self.env_floor = env_floor
        self.step = 0
        self.summary = tf.Summary(value=[
            tf.Summary.Value(tag="Stage_reward " + str(self.wid),
                             simple_value=0)
        ])
        self.kprun.train_writer.add_summary(self.summary, 0)

    def createActionTable(self):
        tableAction = []
        for a in range(0, 3):
            for b in range(0, 3):
                for c in range(0, 2):
                    tableAction.append([a, b, c, 0])
        # print("Action option: ", tableAction[0:17])
        return tableAction

    def reward_compute(self, done, reward_total, keys, previous_keys,
                       reward, previous_reward, time_remaining,
                       previous_time_remaining,
                       previous_stage_time_remaining):
        # Reward shaping:
        # `reward` is the cumulative floor-clearing reward from the environment,
        # `keys` is the number of keys held, and `time_remaining` is the time left.
        # Clearing a floor is worth at most 10 and a key is worth 5. Time orbs
        # only get 0.5 for now, because remaining time is settled at the end of
        # an episode, which would otherwise compound the reward.
        # On clearing a floor, grant ten times the clearing reward minus
        # (stage start time - time remaining) / 1000.
        # print("time_remaining ", time_remaining,
        #       " previous_time_remaining ", previous_time_remaining,
        #       " reward ", reward)

        # Passing through a green door that opens adds 0.1.
        if (reward - previous_reward) > 0 and (reward - previous_reward) < 0.3:
            reward_total += 3
        elif (reward - previous_reward) > 0.9:
            # *** If the remaining time exceeds the stage time, this becomes a
            # bonus, which may greatly increase the agent's appetite for
            # time orbs.
            # *** An alternative is to add remaining time / 1000 directly,
            # which avoids the compounding effect.
            print("Pass ", reward, " Stage!")
            # reward_total += (reward - previous_reward) * 100 - \
            #     (previous_stage_time_remaining - time_remaining)
            reward_total += 200
            # Carry the time over to the next floor; store this stage's time
            # for the next floor-clearing computation.
            previous_time_remaining = time_remaining
            previous_stage_time_remaining = time_remaining
            # Lesson 1 repeat
            if reward > 6.5:
                # self.total_step += 1
                # if self.total_step >= 5:
                #     done = True
                #     return reward_total, previous_stage_time_remaining, done
                self.env.seed(np.random.randint(5))
                # env.reset()
                done = True
                return reward_total, previous_stage_time_remaining, done

        # A floor clear can coincide with picking up an orb or a key,
        # so by default the bonuses can stack.
        if keys > previous_keys:
            # print("Get Key")
            reward_total += 5
        if previous_time_remaining < time_remaining and previous_time_remaining != 0:
            # print("Get time power up")
            reward_total += 2
        else:
            reward_total -= 0.5
        if done and previous_time_remaining > 100:
            print("Agent died")
            # The more time left when the agent dies, the bigger the penalty.
            # reward_total -= (10 + time_remaining / 100)
            reward_total -= 100
        return reward_total, previous_stage_time_remaining, done

    def work(self):
        global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
        # Configure the level.
        self.env.seed(self.env_seed)
        self.env.floor(self.env_floor)
        # Loop until the target number of episodes is reached.
        while not COORD.should_stop():
            # Track the step count.
            self.step += 1
            # Reset the level.
            obs = self.env.reset()
            # Initialize episode state.
            done = False
            stage_reward = 0.0
            reward = 0
            keys = 0
            # Used to detect time pickups; define time_remaining up front
            # because the first step of an episode has no previous value.
            time_remaining = 3000
            previous_stage_time_remaining = time_remaining
            # Preprocess the image.
            # previous_preprocessed_observation_image = np.reshape(obs[0], [-1])
            previous_preprocessed_observation_image = obs[0]
            buffer_s, buffer_a, buffer_r = [], [], []
            # Run until the episode ends.
            while not done:
                # If the model is being updated, wait for the update to finish.
                if not ROLLING_EVENT.is_set():
                    ROLLING_EVENT.wait()
                    # Discard the buffers and collect data with the new policy.
                    buffer_s, buffer_a, buffer_r = [], [], []
                # Save the previous state for the reward computation.
                previous_keys = keys
                previous_reward = reward
                previous_time_remaining = time_remaining
                # Choose an action based on the previous state.
                action = self.kprun.choose_action(
                    previous_preprocessed_observation_image)
                action = np.clip(np.random.normal(action, 1.), 6, 12)
                # Step the environment: returns the observation, the floors
                # cleared so far, and agent info.
                observation, reward, done, info = self.env.step(
                    np.array(self.tableAction[int(action)]))
                # Preprocess the data the model needs.
                observation_image, keys, time_remaining = observation
                # preprocessed_observation_image = np.reshape(
                #     observation_image, [-1])
                preprocessed_observation_image = observation_image
                stage_reward, previous_stage_time_remaining, done = self.reward_compute(
                    done=done,
                    reward_total=stage_reward,
                    keys=keys,
                    previous_keys=previous_keys,
                    reward=reward,
                    previous_reward=previous_reward,
                    time_remaining=time_remaining,
                    previous_time_remaining=previous_time_remaining,
                    previous_stage_time_remaining=previous_stage_time_remaining)
                # Normalize the reward.
                stage_reward = (stage_reward + 8) / 8
                # Store this transition in the buffers.
                buffer_s.append(np.array([preprocessed_observation_image]))
                buffer_a.append(action)
                buffer_r.append(stage_reward)
                # Keep the image for the next step.
                previous_preprocessed_observation_image = preprocessed_observation_image
                # Count toward the next update.
                GLOBAL_UPDATE_COUNTER += 1
                # If the local buffer is full, process the batch locally first.
                if len(buffer_s) == EP_LEN - 1 or \
                        GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                    v_s_ = self.kprun.get_v(preprocessed_observation_image)
                    # Compute discounted rewards.
                    discounted_r = []
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
                        discounted_r.append(v_s_)
                    discounted_r.reverse()
                    # Arrange the dimensions.
                    bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), \
                        np.array(discounted_r)[:, np.newaxis]
                    # Push the batch into the shared queue.
                    QUEUE.put(bs)
                    QUEUE.put(ba)
                    QUEUE.put(br)
                    # print("len(buffer_s)", len(buffer_s))
                    # print("bs.shape", bs.shape)
                    # Clear the buffers.
                    buffer_s, buffer_a, buffer_r = [], [], []
                    # Once the global step count reaches the minimum batch
                    # size, trigger a full update.
                    if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                        # Stop collecting data.
                        ROLLING_EVENT.clear()
                        # Update PPO.
                        UPDATE_EVENT.set()
                    # Stop training after the maximum number of episodes.
                    if GLOBAL_EP >= EP_MAX:
                        COORD.request_stop()
                        break
            # Log the reward.
            self.summary = tf.Summary(value=[
                tf.Summary.Value(tag="Stage_reward " + str(self.wid),
                                 simple_value=stage_reward)
            ])
            self.kprun.train_writer.add_summary(self.summary, self.step)
            GLOBAL_EP += 1
            print(
                '{0:.1f}%'.format(GLOBAL_EP / EP_MAX * 100),
                '|W%i' % self.wid,
                '|Ep_r: %.2f' % stage_reward,
            )
        self.env.close()
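The bootstrapped discounted-return loop inside work() is easy to check in isolation. A standalone sketch of the same backward pass (the function name and gamma value here are illustrative):

import numpy as np

def discounted_returns(rewards, bootstrap_value, gamma=0.99):
    # Walk the rewards backwards, bootstrapping from the critic's value
    # estimate of the state reached after the final step.
    v = bootstrap_value
    out = []
    for r in reversed(rewards):
        v = r + gamma * v
        out.append(v)
    # Reverse back to time order and add a trailing axis, as in work().
    return np.array(out[::-1])[:, np.newaxis]

# discounted_returns([1.0, 0.0, 2.0], 0.5)
# -> approximately [[3.445], [2.470], [2.495]]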
import json
import os

from anyrl.utils.ffmpeg import export_video
import numpy as np
from obstacle_tower_env import ObstacleTowerEnv

from obs_tower2.util import big_obs

with open('stuck_box.json', 'r') as in_file:
    data = json.load(in_file)

env = ObstacleTowerEnv(os.environ['OBS_TOWER_PATH'], worker_id=1)
env.seed(56)
env.reset()


def f():
    # Replay the recorded actions, yielding frames from the tail of the run.
    for i, act in enumerate(data):
        obs, _, _, info = env.step(act)
        if i > 5275:
            yield big_obs(obs, info)


export_video('stuck_box.mp4', 168, 168, 10, f())
os.remove(os.path.join(dir, file))

parser = argparse.ArgumentParser()
parser.add_argument('environment_filename',
                    default='./ObstacleTower/obstacletower', nargs='?')
parser.add_argument('--docker_training', action='store_true')
parser.set_defaults(docker_training=False)
args = parser.parse_args()

env = ObstacleTowerEnv(args.environment_filename,
                       docker_training=args.docker_training,
                       retro=False, realtime_mode=False)
logger.setLevel(logging.WARNING)
env.seed(4)

if env.is_grading():
    episode_reward = run_evaluation(env)
else:
    total_frames = 0
    episode_number = 0
    while True:
        episode_number += 1
        total_frames += run_episode(env, episode_number)
        if episode_number % 200 == 0:
            print(f'Total Frames: {total_frames}')
            episode_reward = run_episode(env, episode_number, test=True)
        if episode_number >= MAX_EPISODES:
            break
import os
import random

from obstacle_tower_env import ObstacleTowerEnv

counter = {}
env = ObstacleTowerEnv(os.environ['OBS_TOWER_PATH'], worker_id=2)

while True:
    env.seed(random.randrange(100))
    env.reset()
    for _ in range(50):
        obs, _, _, _ = env.step(0)
    key = str(obs.flatten().tolist())
    counter[key] = True
    print('got %d start states' % len(counter))
parser.add_argument('environment_filename',
                    default='./ObstacleTower/obstacletower', nargs='?')
parser.add_argument('--docker_training', action='store_true')
parser.set_defaults(docker_training=False)
args = parser.parse_args()

env = ObstacleTowerEnv(args.environment_filename,
                       docker_training=args.docker_training,
                       realtime_mode=True)
model = get_model()
optimizer = tf.train.AdamOptimizer()
checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
checkpoint.restore(tf.train.latest_checkpoint('./tf_saves/'))

total_count = 0
for i in range(0, 101):
    # Set up the environment for this seed.
    env.seed(i)
    reward = 0
    actions = []
    rerun_actions = False
    obs = env.reset()
    while True:
        observation = process_image(obs)
        prediction = model(tf.cast([observation], dtype=tf.float32))[0]
        print('prediction', prediction)
        selection = np.argmax(prediction)
        print('selection', selection)
        action = action_options[selection]
        print('action', action)
        obs, step_reward, done, info = env.step(action)
#!/usr/bin/env python3
from obstacle_tower_env import ObstacleTowerEnv
from matplotlib import pyplot as plt

ENV_PATH = './obstacle-tower-challenge/ObstacleTower/obstacletower'

env = ObstacleTowerEnv(ENV_PATH, retro=False, realtime_mode=True)

# Seeds can be chosen from the range 0-100.
env.seed(5)
# Floors can be chosen from the range 0-24.
env.floor(15)

# The environment provided has a MultiDiscrete action space, where the
# 4 dimensions are:
# 0. Movement (No-Op/Forward/Back)
# 1. Camera Rotation (No-Op/Counter-Clockwise/Clockwise)
# 2. Jump (No-Op/Jump)
# 3. Movement (No-Op/Right/Left)
print('action space', env.action_space)

# The observation space provided includes a 168x168 image (the camera
# from the simulation) as well as the number of keys held by the agent
# (0-5) and the amount of time remaining.
print('observation space', env.observation_space)

# Interacting with the environment
obs = env.reset()
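Building on the action-space comments above, a minimal interaction sketch; the specific action vector is illustrative, and the observation unpacking assumes the non-retro tuple of camera image, key count, and time remaining:

# Walk forward while rotating the camera clockwise, no jump, no strafe.
obs, reward, done, info = env.step([1, 2, 0, 0])
image, keys, time_remaining = obs
print('reward', reward, 'keys', keys, 'time remaining', time_remaining)

# Display the 168x168 camera image.
plt.imshow(image)
plt.show()

env.close()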