def testing(self):
    from keepitpossible.common import action_table
    self.table_action = action_table.create_action_table()
    self.MODEL.load()
    done = False
    reward = 0.0
    env = ObstacleTowerEnv(environment_filename=self.SCHEDULE.ENV_PATH,
                           worker_id=self.SCHEDULE.N_WORKER + 1,
                           retro=False,
                           realtime_mode=True)
    obs = env.reset()
    previous_preprocessed_observation_image = obs[0]
    while not done:
        action = self.MODEL.choose_action(
            previous_preprocessed_observation_image)
        # Take the action and get the observation, floors cleared and agent info
        for _ in self.table_action[int(action)]:
            observation, reward, done, info = env.step(_)
            print("Action_Chose: ", action,
                  "Action: ", _,
                  " Reward: ", reward)
            if done:
                break
        # Preprocess the data the model needs
        observation_image, keys, time_remaining = observation
        preprocessed_observation_image = observation_image
        previous_preprocessed_observation_image = preprocessed_observation_image
    env.close()
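# The loop above treats each model output as a macro-action: a table entry is a
# short sequence of raw Obstacle Tower actions that are stepped one after the
# other. `keepitpossible.common.action_table` is not shown in this file, so the
# table below is only an illustrative sketch of that idea, not the project's
# actual mapping. Raw actions are MultiDiscrete [movement, camera, jump, strafe].
def example_action_table():
    # Hypothetical macro-actions; each inner list of 4-vectors is stepped in order.
    return [
        [[1, 0, 0, 0]] * 3,              # walk forward for three steps
        [[0, 1, 0, 0]] * 2,              # rotate the camera left twice
        [[0, 2, 0, 0]] * 2,              # rotate the camera right twice
        [[1, 0, 1, 0], [1, 0, 0, 0]],    # jump forward, then keep moving
    ]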
def main():
    if len(sys.argv) != 2:
        sys.stderr.write('Usage: record_improve.py <recording_path>\n')
        sys.exit(1)
    rec = Recording(sys.argv[1])
    env = ObstacleTowerEnv(os.environ['OBS_TOWER_PATH'],
                           worker_id=random.randrange(11, 20))
    try:
        env.seed(rec.seed)
        if rec.floor:
            env.floor(rec.floor)
        env.reset()
        for i, (action, rew) in enumerate(zip(rec.actions, rec.rewards)):
            _, real_rew, done, _ = env.step(action)
            if not np.allclose(real_rew, rew):
                print('mismatching result at step %d' % i)
                sys.exit(1)
            if done != (i == rec.num_steps - 1):
                print('invalid done result at step %d' % i)
                sys.exit(1)
        print('match succeeded')
    finally:
        env.close()
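# `Recording` is defined elsewhere in the project; the verification loop above
# only relies on the fields sketched here. This stub is an assumption about the
# interface, not the real class.
from dataclasses import dataclass, field
from typing import List


@dataclass
class RecordingStub:
    seed: int
    floor: int
    actions: List[int] = field(default_factory=list)
    rewards: List[float] = field(default_factory=list)

    @property
    def num_steps(self) -> int:
        return len(self.actions)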
# Test run
env = ObstacleTowerEnv('./ObstacleTower/obstacletower.exe',
                       worker_id=10,
                       retro=False,
                       realtime_mode=True)
obs = env.reset()
print("Running the test environment; press Q to quit")
previous_preprocessed_observation_image = np.reshape(obs[0], [-1])
while True:
    action = GLOBAL_KPRUN.choose_action(
        previous_preprocessed_observation_image)
    # Multithreading sometimes breaks here; fall back to a random action on NaN
    if np.isnan(action):
        action = np.random.randint(6, high=12)
    # Take the action and get the observation, floors cleared and agent info
    observation, reward, done, info = env.step(
        np.array(GLOBAL_KPRUN.tableAction[int(action)]))
    # Preprocess the data the model needs
    observation_image, keys, time_remaining = observation
    preprocessed_observation_image = np.reshape(observation_image, [-1])
    # Quit on 'q' (assumes an OpenCV window is available via cv2)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break
    previous_preprocessed_observation_image = preprocessed_observation_image
env.close()

if __name__ == '__main__':
    # Create the model object
    GLOBAL_KPRUN = MODEL()
    # GLOBAL_KPRUN.load()
    # Create the coordination events for the worker threads
    UPDATE_EVENT, ROLLING_EVENT = threading.Event(), threading.Event()
    # Not updating for now
class Worker(object):
    def __init__(self, envpath, wid, retro, realtime_mode, env_seed=0, env_floor=0):
        self.wid = wid
        self.env = ObstacleTowerEnv(environment_filename=envpath,
                                    worker_id=wid,
                                    retro=retro,
                                    realtime_mode=realtime_mode)
        self.kprun = GLOBAL_KPRUN
        self.tableAction = self.createActionTable()
        # Level settings
        self.env_seed = env_seed
        self.env_floor = env_floor
        self.step = 0
        self.summary = tf.Summary(value=[
            tf.Summary.Value(tag="Stage_reward " + str(self.wid),
                             simple_value=0)
        ])
        self.kprun.train_writer.add_summary(self.summary, 0)

    def createActionTable(self):
        tableAction = []
        for a in range(0, 3):
            for b in range(0, 3):
                for c in range(0, 2):
                    tableAction.append([a, b, c, 0])
        # print("Action option: ", tableAction[0:17])
        return tableAction

    def reward_compute(self, done, reward_total, keys, previous_keys, reward,
                       previous_reward, time_remaining,
                       previous_time_remaining, previous_stage_time_remaining):
        # Reward formula:
        # - reward is the floor-clear signal passed from the environment
        # - keys is the number of keys picked up
        # - time_remaining is the time left
        # - clearing a floor is worth at most 10
        # - a key is worth 5
        # - a time orb only gives 0.5 for now, because the remaining time is
        #   settled at the end of the floor and the rewards would otherwise stack
        # - on clearing a floor, give ten times the clear reward
        #   minus (floor start time - remaining time) / 1000
        # print("time_remaining ", time_remaining,
        #       " previous_time_remaining ", previous_time_remaining,
        #       " reward ", reward)

        # Passing through a green door that opens gives roughly +0.1
        if (reward - previous_reward) > 0 and (reward - previous_reward) < 0.3:
            reward_total += 3
        elif (reward - previous_reward) > 0.9:
            # *** If the remaining time exceeds the floor's start time this turns
            #     into a bonus, which could strongly push the agent to chase time orbs.
            # *** An alternative is to add remaining time / 1000 directly,
            #     which avoids the stacking effect.
            print("Pass ", reward, " Stage!")
            # reward_total += (reward - previous_reward) * 100 - \
            #     (previous_stage_time_remaining - time_remaining)
            reward_total += 200
            # Carry the time over to the next floor; store this floor's time
            # for the next clear computation
            previous_time_remaining = time_remaining
            previous_stage_time_remaining = time_remaining
            # Lesson 1 repeat
            if reward > 6.5:
                # self.total_step += 1
                # if self.total_step >= 5:
                #     done = True
                #     return reward_total, previous_stage_time_remaining, done
                self.env.seed(np.random.randint(5))
                # env.reset()
                done = True
                return reward_total, previous_stage_time_remaining, done

        # A floor clear may coincide with picking up an orb or a key,
        # so these bonuses are allowed to stack
        if previous_keys > keys:
            # print("Get Key")
            reward_total += 5

        if previous_time_remaining < time_remaining and previous_time_remaining != 0:
            # print("Get time power up")
            reward_total += 2
        else:
            reward_total -= 0.5

        if done and previous_time_remaining > 100:
            print("Agent died")
            # The more time left when the agent dies, the larger the penalty
            # reward_total -= (10 + time_remaining / 100)
            reward_total -= 100
        return reward_total, previous_stage_time_remaining, done

    def work(self):
        global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
        # Level settings
        self.env.seed(self.env_seed)
        self.env.floor(self.env_floor)
        # Loop until the target number of episodes is reached
        while not COORD.should_stop():
            # Record the step count
            self.step += 1
            # Reset the level
            obs = self.env.reset()
            # Initialize
            done = False
            stage_reward = 0.0
            reward = 0
            keys = 0
            # Used to detect time pickups; time_remaining is undefined before
            # the first step, so give it a starting value
            time_remaining = 3000
            previous_stage_time_remaining = time_remaining
            # Preprocess the image
            # previous_preprocessed_observation_image = np.reshape(obs[0], [-1])
            previous_preprocessed_observation_image = obs[0]
            buffer_s, buffer_a, buffer_r = [], [], []
            # While the agent is still alive
            while not done:
                # If the model is being updated, wait for the update to finish
                if not ROLLING_EVENT.is_set():
                    # Wait for the update to finish
                    ROLLING_EVENT.wait()
                    # Clear the buffers and collect data with the new policy
                    buffer_s, buffer_a, buffer_r = [], [], []
                # Save the previous state for the reward computation
                previous_keys = keys
                previous_reward = reward
                previous_time_remaining = time_remaining
                # Choose an action from the previous state
                action = self.kprun.choose_action(
                    previous_preprocessed_observation_image)
                action = np.clip(np.random.normal(action, 1.), *[6, 12])
                # Take the action and get the observation, floors cleared and agent info
                observation, reward, done, info = self.env.step(
                    np.array(self.tableAction[int(action)]))
                # Preprocess the data the model needs
                observation_image, keys, time_remaining = observation
                # preprocessed_observation_image = np.reshape(
                #     observation_image, [-1])
                preprocessed_observation_image = observation_image
                stage_reward, previous_stage_time_remaining, done = self.reward_compute(
                    done=done,
                    reward_total=stage_reward,
                    keys=keys,
                    previous_keys=previous_keys,
                    reward=reward,
                    previous_reward=previous_reward,
                    time_remaining=time_remaining,
                    previous_time_remaining=previous_time_remaining,
                    previous_stage_time_remaining=previous_stage_time_remaining)

                # Normalize the reward
                stage_reward = (stage_reward + 8) / 8

                # Store this transition in the buffers
                buffer_s.append(np.array([preprocessed_observation_image]))
                buffer_a.append(action)
                buffer_r.append(stage_reward)

                # Save the image to use as the next step's input
                previous_preprocessed_observation_image = preprocessed_observation_image

                # When the update threshold is reached, do the local processing first
                GLOBAL_UPDATE_COUNTER += 1
                # If the local buffer is large enough, prepare the update locally
                if len(buffer_s) == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                    v_s_ = self.kprun.get_v(preprocessed_observation_image)
                    # Compute the discounted rewards
                    discounted_r = []
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
                        discounted_r.append(v_s_)
                    discounted_r.reverse()
                    # Tidy up the dimensions
                    bs, ba, br = np.vstack(buffer_s), np.vstack(
                        buffer_a), np.array(discounted_r)[:, np.newaxis]
                    # Put the data into the shared queue
                    QUEUE.put(bs)
                    QUEUE.put(ba)
                    QUEUE.put(br)
                    # print("len(buffer_s)", len(buffer_s))
                    # print("bs.shape", bs.shape)
                    # Clear the temporary buffers
                    buffer_s, buffer_a, buffer_r = [], [], []
                    # If the global step count reached the minimum batch size,
                    # trigger a full update
                    if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                        # Stop collecting data
                        ROLLING_EVENT.clear()
                        # Update PPO
                        UPDATE_EVENT.set()
                    # Stop training once the maximum number of episodes is reached
                    if GLOBAL_EP >= EP_MAX:
                        COORD.request_stop()
                        break
            # Record the reward
            self.summary = tf.Summary(value=[
                tf.Summary.Value(tag="Stage_reward " + str(self.wid),
                                 simple_value=stage_reward)
            ])
            self.kprun.train_writer.add_summary(self.summary, self.step)
            GLOBAL_EP += 1
            print(
                '{0:.1f}%'.format(GLOBAL_EP / EP_MAX * 100),
                '|W%i' % self.wid,
                '|Ep_r: %.2f' % stage_reward,
            )
        self.env.close()
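# A standalone sketch of the discounted-return computation used above: the
# buffered rewards are traversed in reverse, bootstrapping from the critic's
# value estimate v_s_ of the final state, i.e. G_t = r_t + GAMMA * G_{t+1}.
# The numbers below are made up purely for illustration.
def discount_example():
    GAMMA = 0.9
    buffer_r = [1.0, 0.0, 0.5]   # example rewards, oldest first
    v_s_ = 2.0                   # example bootstrap value of the final state
    discounted_r = []
    for r in buffer_r[::-1]:
        v_s_ = r + GAMMA * v_s_
        discounted_r.append(v_s_)
    discounted_r.reverse()
    # discounted_r == [G_0, G_1, G_2] == [2.863, 2.07, 2.3]
    return discounted_r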
}
obs = env.reset(config=config)
# trim_zeros returns a new array; keep the trimmed result
steps = np.trim_zeros(steps)


def handcrafted_step(act, go):
    try:
        for i in range(go):
            env.step(act)
    except IndexError:
        pass


handcrafted_step(18, steps[0])
handcrafted_step(6, steps[1])
handcrafted_step(18, steps[2])

s, reward, done, info = env.step(18)
s = rgb2gray(s)
s = np.expand_dims(s, axis=2)

t = 0
track_score = []
track_r = []
track_a = []
track_s = []
bad_seq = 0
score = 0
steps_after_key = 0

while True:
    if RENDER:
        env.render()
    a = actor.choose_action(
def main(): basicConfig(level=INFO) env = ObstacleTowerEnv(str(PRJ_ROOT / 'obstacletower'), retro=False, worker_id=9) done = False env.floor(1) env.reset() screen = Screen() random_actor = RandomRepeatActor(continue_rate=0.9) random_actor.reset(schedules=[ (Action.CAMERA_RIGHT, 3), (Action.CAMERA_LEFT, 6), (Action.CAMERA_RIGHT, 3), (Action.NOP, 5), (Action.FORWARD, 8), (Action.RIGHT, 2), (Action.LEFT, 4), (Action.RIGHT, 2), ]) frame_history = FrameHistory(env) moving_checker = MovingChecker(frame_history) position_estimator = PositionEstimator(moving_checker) map_observation = MapObservation(position_estimator, moving_checker) event_handlers: List[EventHandler] = [ frame_history, moving_checker, position_estimator, map_observation, ] while not done: for h in event_handlers: h.begin_loop() screen.show("original", frame_history.last_frame) cv2.waitKey(0) for h in event_handlers: h.before_step() action = random_actor.decide_action(moving_checker.did_move) obs, reward, done, info = env.step(action) if reward != 0: logger.info(f"Get Reward={reward} Keys={obs[1]}") # logger.info(f"Keys={obs[1]} Time Remain={obs[2]}") params = EventParamsAfterStep(action, obs, reward, done, info) for h in event_handlers: h.after_step(params) screen.show("map", map_observation.concat_images()) if len(frame_history.small_frame_pixel_diffs) > 0: f1 = frame_history.small_frame_pixel_diffs[-1] if len(frame_history.small_frame_pixel_diffs) > 1: f2 = frame_history.small_frame_pixel_diffs[-2] f1 = np.concatenate((f2, f1), axis=1) screen.show("diff", f1) for h in event_handlers: h.end_loop()
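# The main loop above drives each handler through a fixed per-frame protocol:
# begin_loop -> before_step -> after_step(params) -> end_loop. The EventHandler
# base class itself is not shown in this file; the class below is only a sketch
# of that assumed interface, using a trivial step counter as the example.
class StepCounterHandler:
    def __init__(self):
        self.steps = 0

    def begin_loop(self):
        pass

    def before_step(self):
        pass

    def after_step(self, params):
        # params is the EventParamsAfterStep(action, obs, reward, done, info) built above
        self.steps += 1

    def end_loop(self):
        pass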
state = env.reset()
# print(state.shape)
state = state[0]
state = TF.to_tensor(state)
print(state.size())

scores = []
mean_scores_100 = deque(maxlen=100)
version = 'v3'

for episode in range(400):
    timesteps = 0
    rewards = 0
    for steps in range(10000):
        timesteps += 1
        actions, actions_env_format = agent.select_actions(state)
        next_state, reward, done, info = env.step(actions_env_format)
        next_state = next_state[0]
        next_state = TF.to_tensor(next_state)
        agent.replay_buffer.add((state, actions, reward, next_state, done))
        agent.train()
        state = next_state
        rewards += reward
        if done:
            break
    scores.append(rewards)
    mean_scores_100.append(rewards)
    print('episode {} frames {} rewards {:.2f} mean score {:.2f}'.format(
        episode, timesteps, rewards, np.mean(mean_scores_100)))
    if episode % 100 == 0:
        torch.save(agent.conv_net.state_dict(),
                   'checkpoints/conv_net_checkpoint_{}.pth'.format(version))
        torch.save(agent.critic_v.state_dict(),
                   'checkpoints/critic_v_checkpoint_{}.pth'.format(version))
        torch.save(agent.critic_q_1.state_dict(),
                   'checkpoints/critic_q_1_checkpoint_{}.pth'.format(version))
# Interacting with the environment
obs = env.reset()
plt.imshow(obs[0])

# Get action meanings
print('Table of actions')
for action_id, action_meaning in enumerate(env.get_action_meanings()):
    print(action_id, action_meaning)

import signal


def env_closer(signo, handler):
    import sys
    print('Closing the environment...')
    # Close the environment before exiting
    env.close()
    sys.exit(1)


signal.signal(signal.SIGINT, env_closer)

while True:
    sampled_action = env.action_space.sample()
    print('Sampled action:', sampled_action)
    obs, reward, done, info = env.step(sampled_action)
    plt.imshow(obs[0])
    print('Reward after action', reward)
class RandomAgent: """Random Agent that will play the specified game Args: env_name: Name of the environment to be played max_eps: Maximum number of episodes to run agent for. """ def __init__(self, env_path, train=False, evaluate=False, eval_seeds=[], max_eps=100, save_dir=None, plot=False): if train: self.env = ObstacleTowerEnv(env_path, worker_id=0, retro=False, realtime_mode=False, config=train_env_reset_config) else: if evaluate: self.env = ObstacleTowerEnv(env_path, worker_id=0, retro=False, realtime_mode=False, config=eval_env_reset_config) self.env = ObstacleTowerEvaluation(self.env, eval_seeds) else: self.env = ObstacleTowerEnv(env_path, worker_id=0, retro=False, realtime_mode=True, config=eval_env_reset_config) self.max_episodes = max_eps self.global_moving_average_reward = 0 self.save_dir = save_dir if not os.path.exists(save_dir): os.makedirs(save_dir) self.plot = plot self.res_queue = Queue() def train(self): start_time = time.time() reward_avg = 0 global_steps = 0 moving_average_rewards = [] for episode in range(self.max_episodes): done = False self.env.reset() reward_sum = 0.0 steps = 0 while not done: # Sample randomly from the action space and step _, reward, done, _ = self.env.step( self.env.action_space.sample()) steps += 1 global_steps += 1 reward_sum += reward if self.plot: # Record statistics moving_average_rewards.append(reward_sum) reward_avg += reward_sum self.global_moving_average_reward = record( episode, reward_sum, 0, self.global_moving_average_reward, self.res_queue, 0, steps, global_steps) end_time = time.time() print("\nTraining complete. Time taken = {} secs".format(end_time - start_time)) final_avg = reward_avg / float(self.max_episodes) print("Average score across {} episodes: {}".format( self.max_episodes, final_avg)) if self.plot: plt.plot(moving_average_rewards) plt.ylabel('Moving average episode reward') plt.xlabel('Step') plt.savefig( os.path.join(self.save_dir, 'model_random_moving_average.png')) self.env.close() return final_avg def play_single_episode(self): action_space = ActionSpace() print("Playing single episode...") done = False step_counter = 0 reward_sum = 0 obs = self.env.reset() state, _, _, _ = obs try: while not done: action = self.env.action_space.sample() obs, reward, done, info = self.env.step(action) reward_sum += reward print("{}. Reward: {}, action: {}".format( step_counter, reward_sum, action_space.get_action_meaning(action))) step_counter += 1 except KeyboardInterrupt: print("Received Keyboard Interrupt. Shutting down.") finally: if not self.evaluate: self.env.close() return reward_sum def evaluate(self): # run episodes until evaluation is complete while not self.env.evaluation_complete: episode_reward = self.play_single_episode() pprint(self.env.results) self.env.close()
class Worker(threading.Thread): episode_count = 0 mean_reward = 0 best_score = 0 global_steps = 0 save_lock = threading.Lock() def __init__(self, result_queue, idx, save_dir, params): super(Worker, self).__init__() self.result_queue = result_queue self.worker_idx = idx self.save_dir = save_dir self.model_path = os.path.join(self.save_dir, 'model_a3c') self.env = ObstacleTowerEnv(params['env_path'], worker_id=self.worker_idx, retro=False, realtime_mode=False, greyscale=False, config=train_env_reset_config) self.action_size = params['action_size'] self._action_lookup = params['action_lookup'] self.input_shape = self.env.observation_space[0].shape # (84, 84, 3) self._last_health = 99999. self._last_keys = 0 self.global_model = params['global_model'] # self.local_model = CNN(self.action_size, self.input_shape) self.local_model = CnnGru(self.action_size, self.input_shape) self.ac_ckpt = params['ckpt'] self.ac_manager = params['ckpt_mgr'] self.current_time = params['log_timestamp'] train_log_dir = './logs/' + self.current_time + '/worker_' + str( self.worker_idx) self.worker_summary_writer = tf.summary.create_file_writer( train_log_dir) self.timesteps = params['timesteps'] self.batch_size = params['batch_size'] self.gamma = params['gamma'] self.lr = params['lr'] self.opt = params['optimizer'] self.eps = np.finfo(np.float32).eps.item() def get_updated_reward(self, reward, new_health, new_keys, done): new_health = float(new_health) new_reward = 0.0 if done: # reset params when game is terminated self._last_health = 99999. self._last_keys = 0 else: # opened a door, solved a puzzle, picked up a key if 0.1 <= reward < 1: new_reward += 0.5 # crossing a floor - between [1, 4] if reward >= 1: new_reward += (new_health / 10000) # found time orb / crossed a floor if new_health > self._last_health: new_reward += 0.5 return new_reward def log_worker_metrics(self, episode_reward, loss, step): with self.worker_summary_writer.as_default(): with tf.name_scope('worker'): tf.summary.scalar('reward', episode_reward, step=step) tf.summary.scalar('loss', loss, step=step) self.worker_summary_writer.flush() def run(self): mem = Memory() ep_count = 0 timestep = 0 entropy_term = 0 ep_reward = 0. ep_steps = 0 ep_loss = 0. 
done = False obs = self.env.reset() state, self._last_keys, self._last_health, _ = obs while timestep <= self.timesteps: i = 0 with tf.GradientTape() as tape: while i < self.batch_size: # collect experience # get action as per policy state = tf.convert_to_tensor(state) state = tf.expand_dims(state, axis=0) action_probs, critic_value = self.local_model( [state, float(self._last_health)], training=True) entropy = -np.sum(action_probs * np.log(action_probs)) entropy_term += entropy # choose most probable action dist = tfp.distributions.Categorical(probs=action_probs, dtype=tf.float32) action_index = int(dist.sample().numpy()) action = self._action_lookup[action_index] # perform action in game env for i in range(4): # frame skipping obs, reward, done, _ = self.env.step(action) state, new_keys, new_health, cur_floor = obs reward = self.get_updated_reward( reward, new_health, new_keys, done) self._last_health = new_health self._last_keys = new_keys ep_reward += reward ep_steps += 1 i += 1 timestep += 1 # store experience mem.store(action_prob=tf.math.log( action_probs[0, action_index]), value=critic_value[0, 0], reward=reward) if done: break # backpropagation total_loss = self.local_model.compute_loss( mem, state, done, self.gamma, self.eps, entropy_term) ep_loss += total_loss Worker.global_steps += ep_steps grads = tape.gradient(total_loss, self.local_model.trainable_variables ) # calculate local gradients self.opt.apply_gradients( zip(grads, self.global_model.trainable_variables) ) # send local gradients to global model self.local_model.set_weights(self.global_model.get_weights( )) # update local model with new weights mem.clear() if done: Worker.mean_reward = (Worker.mean_reward * Worker.episode_count + ep_reward) / (Worker.episode_count + 1) self.log_worker_metrics(ep_reward, ep_loss, ep_count) print( "Episode: {} | Mean Reward: {:.3f} | Episode Reward: {:.3f} | Loss: {:.3f} | Steps: {} | Total Steps: {} | Worker: {}" .format(Worker.episode_count, Worker.mean_reward, ep_reward, ep_loss, ep_steps, Worker.global_steps, self.worker_idx)) self.result_queue.put((Worker.mean_reward, total_loss)) Worker.episode_count += 1 ep_count += 1 obs = self.env.reset() state, _, _, _ = obs # use a lock to save local model and to print to prevent data races. if ep_reward > Worker.best_score: with Worker.save_lock: self.ac_manager.save() print("Saved checkpoint for step {}".format( int(self.ac_ckpt.step))) self.ac_ckpt.step.assign_add(1) keras.models.save_model(self.global_model, self.model_path) print('\nSaved best model to: {}, episode score: {}\n'. format(self.model_path, ep_reward)) Worker.best_score = ep_reward entropy_term = 0 ep_reward = 0. ep_steps = 0 ep_loss = 0. self.result_queue.put(None) self.env.close()
config = {'tower-seed': 0,
          'starting-floor': 10,
          'agent-perspective': 0,
          'allowed-rooms': 1,
          'allowed-modules': 0,
          'allowed-floors': 0}
obs = env.reset(config=config)

action = env.action_space.sample()
allowed_action = False
allowed_actions = np.array([np.array([1, 0, 0, 0]),
                            np.array([0, 1, 0, 0]),
                            np.array([0, 2, 0, 0]),
                            np.array([1, 0, 1, 0])])
# Keep sampling until an allowed action is drawn
while not allowed_action:
    if (allowed_actions == action).all(1).any():
        allowed_action = True
    else:
        action = env.action_space.sample()

# Override with a fixed forward action
action = np.array([1, 0, 0, 0])
# Step the same action 16 times
for _ in range(16):
    obs, reward, done, info = env.step(action)
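# An equivalent, loop-free way to pick an allowed action (a sketch, not part of
# the original script): index into the allowed_actions table directly instead of
# rejection-sampling the full MultiDiscrete space.
import random


def sample_allowed_action(allowed_actions):
    # allowed_actions is the (N, 4) array of permitted action vectors built above
    return allowed_actions[random.randrange(len(allowed_actions))]

# e.g. action = sample_allowed_action(allowed_actions)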
args = parser.parse_args() env = ObstacleTowerEnv(args.environment_filename, docker_training=args.docker_training, realtime_mode=True) model = get_model() optimizer = tf.train.AdamOptimizer() checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model) checkpoint.restore(tf.train.latest_checkpoint('./tf_saves/')) total_count = 0 for i in range(0, 101): #setup environment env.seed(i) obs = env.reset() reward = 0 actions = [] rerun_actions = False obs = env.reset() while True: observation = process_image(obs) prediction = model(tf.cast([observation], dtype=tf.float32))[0] print('prediction', prediction) selection = np.argmax(prediction) print('selection', selection) action = action_options[selection] print('action', action) obs, step_reward, done, info = env.step(action) env.close()
class StableA2C(): def __init__(self, env_path, train, evaluate, policy_name='CnnPolicy', save_dir='./model_files/', eval_seeds=[]): self.save_dir = save_dir if not os.path.exists(save_dir): os.makedirs(save_dir) self.model_path = os.path.join(self.save_dir, 'model_stable_a2c') self.log_dir = './logs/stable_a2c' self.policy_name = policy_name self.evaluate = evaluate if train: self.env = ObstacleTowerEnv(env_path, worker_id=0, retro=True, realtime_mode=False, config=train_env_reset_config) else: if evaluate: self.env = ObstacleTowerEnv(env_path, worker_id=0, retro=True, realtime_mode=False, config=eval_env_reset_config) self.env = ObstacleTowerEvaluation(self.env, eval_seeds) else: self.env = ObstacleTowerEnv(env_path, worker_id=0, retro=True, realtime_mode=True, config=eval_env_reset_config) def load_model(self): print('Loading model from: {}'.format(self.model_path)) model = A2C.load(self.model_path) model.set_env(self.env) model.tensorboard_log = self.log_dir return model def train(self, timesteps=10000, continue_training=False): start_time = time.time() if not continue_training: print("Initializing from scratch") model = A2C(self.policy_name, self.env, verbose=1, tensorboard_log=self.log_dir) else: model = self.load_model() print("Restored from {}".format(self.model_path)) model.learn(total_timesteps=timesteps) print('\nTraining complete. Time taken = {} secs'.format(time.time() - start_time)) model.save(self.model_path) def play_single_episode(self): """ have the trained agent play a single game """ action_space = ActionSpace() done = False reward_sum = 0 step_counter = 0 model = self.load_model() obs = self.env.reset() try: print("Playing single episode...") while not done: action, _states = model.predict(obs) obs, reward, done, info = self.env.step(action) print("{}. Reward: {}, action: {}".format( step_counter, reward_sum, action_space.get_full_action_meaning(action))) self.env.render() step_counter += 1 reward_sum += reward except KeyboardInterrupt: print("Received Keyboard Interrupt. Shutting down.") finally: if not self.evaluate: self.env.close() print("Environment closed.") print("Game play completed.") return reward_sum def evaluate(self): """ run episodes until evaluation is complete """ while not self.env.evaluation_complete: episode_reward = self.play_single_episode() pprint(self.env.results) self.env.close()
env.reset() reward = 0 actions = [] rerun_actions = False reward_total = 0 while reward_total < 2: reward = 0 if (i_path < len(seed_paths)): reward_before = reward_total while i_path < len( seed_paths) and reward_total < reward_before + 1: current_path = paths[int(seed_paths[int(i_path)])] while path_i < len( current_path) and reward_total < reward_before + 1: current_action = int(current_path[path_i]) obs, reward, done, info = env.step(current_action) reward_total += reward print('loop reward', reward) path_i += 1 i_path += 1 path_i = 0 print('before if reward', reward) if (reward == 0): if rerun_actions: for action in actions: env.step(action) rerun_actions = False else: print("left: 1, right: 2") action = input("action: ") if action == "restart":
class WrappedObstacleTowerEnv(): def __init__(self, environment_filename=None, docker_training=False, worker_id=0, retro=False, timeout_wait=30, realtime_mode=False, num_actions=3, mobilenet=False, gray_scale=False, autoencoder=None, floor=0): ''' Arguments: environment_filename: The file path to the Unity executable. Does not require the extension. docker_training: Whether this is running within a docker environment and should use a virtual frame buffer (xvfb). worker_id: The index of the worker in the case where multiple environments are running. Each environment reserves port (5005 + worker_id) for communication with the Unity executable. retro: Resize visual observation to 84x84 (int8) and flattens action space. timeout_wait: Time for python interface to wait for environment to connect. realtime_mode: Whether to render the environment window image and run environment at realtime. ''' self._obstacle_tower_env = ObstacleTowerEnv(environment_filename, docker_training, worker_id, retro, timeout_wait, realtime_mode) if floor != 0: self._obstacle_tower_env.floor(floor) self._flattener = ActionFlattener([3, 3, 2, 3]) self._action_space = self._flattener.action_space self.mobilenet = mobilenet self.gray_scale = gray_scale if mobilenet: self.image_module = WrappedKerasLayer(retro, self.mobilenet) self._done = False if autoencoder: print("Loading autoencoder from {}".format(autoencoder)) self.autoencoder = build_autoencoder(autoencoder) print("Done.") else: self.autoencoder = None def action_spec(self): return self._action_spec def observation_spec(self): return self._observation_spec def gray_process_observation(self, observation): observation = (observation * 255).astype(np.uint8) obs_image = Image.fromarray(observation) obs_image = obs_image.resize((84, 84), Image.NEAREST) gray_observation = np.mean(np.array(obs_image), axis=-1, keepdims=True) gray_observation = (gray_observation / 255) # gray_observation = self.autoencoder.predict(gray_observation) return gray_observation def _preprocess_observation(self, observation): """ Re-sizes visual observation to 84x84 """ observation = (observation * 255).astype(np.uint8) obs_image = Image.fromarray(observation) obs_image = obs_image.resize((224, 224), Image.NEAREST) return np.array(obs_image) def reset(self): observation = self._obstacle_tower_env.reset() observation, key, time = observation self._done = False if self.mobilenet: if self.autoencoder: observation = self.autoencoder.predict(observation[None, :])[0] return self.image_module(self._preprocess_observation( observation)), observation, key, time elif self.gray_scale: gray_observation = self.gray_process_observation(observation) if self.autoencoder: gray_observation = self.autoencoder.predict( gray_observation[None, :])[0] return gray_observation, observation else: return self._preprocess_observation(observation), observation def step(self, action): #if self._done: # return self.reset() if action == 0: # forward action = [1, 0, 0, 0] elif action == 1: # rotate camera left action = [0, 1, 0, 0] elif action == 2: # rotate camera right action = [0, 2, 0, 0] elif action == 3: # jump forward action = [1, 0, 1, 0] # elif action == 5: # action = [2, 0, 0, 0] # elif action == 6: # action = [0, 0, 0, 1] # elif action == 7: # action = [0, 0, 0, 2] observation, reward, done, info = self._obstacle_tower_env.step(action) observation, key, time = observation self._done = done if self.mobilenet: if self.autoencoder: observation = self.autoencoder.predict(observation[None, :])[0] return (self.image_module( 
self._preprocess_observation(observation)), reward, done, info), observation, key, time elif self.gray_scale: gray_observation = self.gray_process_observation(observation) if self.autoencoder: gray_observation = self.autoencoder.predict( gray_observation[None, :])[0] return (gray_observation, reward, done, info), observation else: return (self._preprocess_observation(observation), reward, done, info), observation def close(self): self._obstacle_tower_env.close() def floor(self, floor): self._obstacle_tower_env.floor(floor)
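# A minimal usage sketch for the wrapper above (the executable path, worker id
# and flags are illustrative assumptions, not values from the original project):
def example_wrapped_env_rollout():
    wrapped_env = WrappedObstacleTowerEnv('./ObstacleTower/obstacletower',
                                          worker_id=5,
                                          gray_scale=True)
    # With gray_scale=True, reset returns (processed observation, raw observation)
    state, frame = wrapped_env.reset()
    done = False
    while not done:
        # Actions 0-3 map to forward / camera-left / camera-right / jump-forward
        (processed, reward, done, info), frame = wrapped_env.step(0)
    wrapped_env.close()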
env.seed(i) obs = env.reset() reward = 0 actions = [] rerun_actions = False while reward < 1: if (i_path < len(seed_paths)): while i_path < len(seed_paths): current_path = paths[int(seed_paths[int(i_path)])] while path_i < len(current_path): current_action = int(current_path[path_i]) observation = process_image(obs) total_count += 1 x.append([str(ob) for ob in observation]) y.append(action_map[str(current_action)]) obs, step_reward, done, info = env.step(current_action) reward += step_reward path_i += 1 i_path += 1 path_i = 0 print("x", np.array(x).shape) print("y", np.array(y).shape) x_data = open('./data/x_data_' + str(i), 'w') x_data = open('./data/x_data_' + str(i), 'a') for xi in x: x_data.write(' '.join(xi) + '\n') y_data = open('./data/y_data_' + str(i), 'w') y_data = open('./data/y_data_' + str(i), 'a') for yi in y: y_data.write(str(yi) + ' ')
class Worker(threading.Thread): episode_count = 0 running_reward = 0 best_score = 0 global_steps = 0 save_lock = threading.Lock() def __init__(self, result_queue, params, save_dir): super(Worker, self).__init__() self.result_queue = result_queue self.save_dir = save_dir self.model_path = os.path.join(self.save_dir, 'model_a3c_distributed') self.env = ObstacleTowerEnv(params['env_path'], worker_id=1, retro=False, realtime_mode=False, greyscale=False, config=train_env_reset_config) self.action_size = params['action_size'] self._action_lookup = params['action_lookup'] self.input_shape = self.env.observation_space[0].shape # (84, 84, 3) self._last_health = 99999. self._last_keys = 0 self.global_model = params['global_model'] self.mirrored_strategy = tf.distribute.MirroredStrategy() with self.mirrored_strategy.scope(): # self.local_model = CNN(self.action_size, self.input_shape) self.local_model = CnnGru(self.action_size, self.input_shape) self.current_time = params['log_timestamp'] train_log_dir = './logs/' + self.current_time + '/worker_1' self.worker_summary_writer = tf.summary.create_file_writer( train_log_dir) self.timesteps = params['timesteps'] self.batch_size = params['batch_size'] self.gamma = params['gamma'] self.lr = params['lr'] self.opt = params['optimizer'] self.eps = np.finfo(np.float32).eps.item() self.ep_loss = 0.0 def get_updated_reward(self, reward, new_health, new_keys, done): new_health = float(new_health) if done: # penalize when game is terminated self._last_health = 99999. self._last_keys = 0 reward = -1 else: # crossing a floor- between [1, 4] if reward >= 1: reward += (new_health / 10000) # found time orb / crossed a floor if new_health > self._last_health: reward += 0.1 # found a key if new_keys > self._last_keys: reward += 0.1 return reward def log_worker_metrics(self, episode_reward, avg_reward, loss, step): with self.worker_summary_writer.as_default(): tf.summary.scalar('reward', episode_reward, step=step) tf.summary.scalar('moving_reward', avg_reward, step=step) tf.summary.scalar('loss', loss, step=step) self.worker_summary_writer.flush() def run(self): mem = Memory() rewards = [] ep_count = 1 timestep = 0 entropy_term = 0 ep_reward = 0. ep_steps = 0 ep_loss = 0. 
done = False obs = self.env.reset() state, _, _, _ = obs while timestep <= self.timesteps: with tf.GradientTape() as tape: for i in range(self.batch_size): # collect experience # get action as per policy state = tf.convert_to_tensor(state) state = tf.expand_dims(state, axis=0) action_probs, critic_value = self.local_model( state, training=True) entropy = -np.sum(action_probs * np.log(action_probs)) entropy_term += entropy # choose most probable action action_index = np.random.choice(self.action_size, p=np.squeeze(action_probs)) action = self._action_lookup[action_index] # perform action in game env for i in range(4): # frame skipping obs, reward, done, _ = self.env.step(action) state, new_keys, new_health, cur_floor = obs reward = self.get_updated_reward( reward, new_health, new_keys, done) self._last_health = new_health self._last_keys = new_keys ep_reward += reward ep_steps += 1 timestep += 1 # store experience mem.store(action_prob=action_probs[0, action_index], value=critic_value[0, 0], reward=reward) if done: break # backpropagation total_loss = self.local_model.compute_loss( mem, state, done, self.gamma, self.eps, entropy_term) ep_loss += total_loss Worker.global_steps += ep_steps grads = tape.gradient(total_loss, self.local_model.trainable_variables ) # calculate local gradients # self.opt.apply_gradients(zip(grads, self.global_model.trainable_variables)) # send local gradients to global model # self.local_model.set_weights(self.global_model.get_weights()) # update local model with new weights mem.clear() if done: rewards.append(ep_reward) Worker.running_reward = sum(rewards[-10:]) / 10 self.log_worker_metrics(ep_reward, Worker.running_reward, ep_loss, ep_count) print( "Episode: {} | Average Reward: {:.3f} | Episode Reward: {:.3f} | Loss: {:.3f} | Steps: {} | Total Steps: {} | Worker: {}" .format(Worker.episode_count, Worker.running_reward, ep_reward, ep_loss, ep_steps, Worker.global_steps, 1)) self.result_queue.put((Worker.running_reward, total_loss)) Worker.episode_count += 1 ep_count += 1 obs = self.env.reset() state, _, _, _ = obs # use a lock to save local model and to print to prevent data races. if ep_reward > Worker.best_score: with Worker.save_lock: print( '\nSaving best model to: {}, episode score: {}\n'. format(self.model_path, ep_reward)) keras.models.save_model(self.local_model, self.model_path) Worker.best_score = ep_reward entropy_term = 0 ep_reward = 0. ep_steps = 0 ep_loss = 0. keras.models.save_model(self.local_model, self.model_path) self.result_queue.put(None) self.env.close()
# tower 0, floor 10 = second room holds key config = { 'tower-seed': 0, 'starting-floor': 10, 'dense-reward': 1, 'agent-perspective': 1, 'allowed-rooms': 1, 'allowed-modules': 0, 'allowed-floors': 0 } obs = env.reset(config=config) next_observe = obs for _ in range(random.randint(1, 20)): observe = next_observe next_observe, _, _, _ = env.step(1) state = pre_processing(next_observe, observe) history = np.stack((state, state, state, state), axis=2) history = np.reshape([history], (1, 84, 84, 4)) print(history) while not done: env.render() step += 1 observe = next_observe action = agent.get_action(history) print("Action:" + str(action)) fake_action = action
def main(): #Load parse parameters #parser = otc_arg_parser() #args = parser.parse_args() #Challenge environment # if args.env == 'ObtRetro-v6': # env = ObstacleTowerEnv( # '/home/home/Data/Carmen/py_workspace/ObstacleTower_v3/ObstacleTower-v3.1/obstacletower.x86_64', # timeout_wait=6000, # retro=args.retro, # realtime_mode=args.test) # env = RetroWrapper(env, args.sample_normal) # env = OTCPreprocessing(env, args.action_reduction) # # if show_obs: # # env = RenderObservations(env) # # env = KeyboardControlWrapper(env) # else: env = ObstacleTowerEnv( '/home/home/Data/Carmen/py_workspace/ObstacleTower_v3/ObstacleTower-v3.1/obstacletower.x86_64', retro=args.retro, realtime_mode=args.test, timeout_wait=6000) #env = ObstacleTowerEnv('OBSTACLE_TOWER_PATH', retro=args.retro, realtime_mode=args.test, timeout_wait=6000) #Dict of actions created by the ObstacleTowerEnv Class of obstacle_tower_env library #print("ACTIONS:", env._flattener.action_lookup) print('FEATURES :', args.features) #Preprocess the environment (Grey Scales and action space reduction) env = OTCPreprocessing(env, args.action_reduction, args.features) env = DummyVecEnv([lambda: env]) #env = VecEnv(1, env.observation_space, env.action_space) print("ACTION SPACE ///////////:", env.action_space) print("OBSERVATION SPACE ///////////////:", env.observation_space) #env = make_vec_env(env, n_envs=4) ########Training######## #Study of the impact of different values of the PPO params if args.study: params_test(MlpPolicy, env) #If no Study Mode else: #If no Test Mode if not args.test: seed = random.seed(0) if args.pretrained_model: t = 300000 model = PPO2.load(args.pretrained_model, env=env, tensorboard_log=args.tensorboard_logdir) else: t = 0 #If Generalized Advantage Estimator is used if args.use_gae: model = PPO2(MlpPolicy, env, n_steps=args.num_steps, verbose=1, tensorboard_log=args.tensorboard_logdir, cliprange=args.clip_param, learning_rate=args.lr, ent_coef=args.entropy_coef, vf_coef=args.value_loss_coef, max_grad_norm=args.max_grad_norm, gamma=args.gamma, lam=args.gae_lambda, noptepochs=args.ppo_epoch, seed=seed) #If Generalized Advantage Estimator is not used else: model = PPO2(MlpPolicy, env, n_steps=args.num_steps, verbose=1, tensorboard_log=args.tensorboard_logdir, cliprange=args.clip_param, learning_rate=args.lr, ent_coef=args.entropy_coef, vf_coef=args.value_loss_coef, max_grad_norm=args.max_grad_norm, gamma=args.gamma, noptepochs=args.ppo_epoch, seed=seed) else: model = PPO2.load(args.pretrained_model, env=env) #model.learn(total_timesteps=50000) #model.save("ObstacleTower_prueba") filename = 'argsparams.txt' os.makedirs(args.results_dir, exist_ok=True) myfile = open(args.results_dir + filename, 'a') myfile.write( 'clip range: %f \n learning rate: %f \n coeficiente de entropía: %f \n coeficiente de pérdida: %f \n ' 'máximo gradiente: %f \n gamma: %f \n ppo epoch: %f \n' % (args.clip_param, args.lr, args.entropy_coef, args.value_loss_coef, args.max_grad_norm, args.gamma, args.ppo_epoch)) myfile.close() if not args.test: while t < args.num_env_steps: #TRAIN MODEL if t == 0: model.learn(total_timesteps=args.eval_interval) else: model.learn(total_timesteps=args.eval_interval, reset_num_timesteps=False) os.makedirs(GLOBAL_PATH, exist_ok=True) print("Saving in '" + GLOBAL_PATH + "'") model.save(GLOBAL_PATH + args.training_name + "_" + str(int(t)).zfill(10)) avg_reward, avg_floor = test( t, model, env=env, global_path=args.results_dir) # Test log('T = ' + str(t) + ' / ' + str(args.num_env_steps) + ' | Avg. 
reward: ' + str(avg_reward) + ' | Avg. floor: ' + str(avg_floor)) t += args.eval_interval else: obs = env.reset() t = 0 while t < args.num_env_steps: action, _states = model.predict(obs) obs, rewards, done, info = env.step(action) #print('action :', info) env.render('rgb_array')
import os import random from obstacle_tower_env import ObstacleTowerEnv counter = {} env = ObstacleTowerEnv(os.environ['OBS_TOWER_PATH'], worker_id=2) while True: env.seed(random.randrange(100)) env.reset() for _ in range(50): obs, _, _, _ = env.step(0) key = str(obs.flatten().tolist()) counter[key] = True print('got %d start states' % len(counter))
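# A lighter-weight variant of the start-state counter above (a sketch, not part
# of the original script): hashing the raw observation bytes avoids building
# very long list-repr strings as dictionary keys, and a set suffices for counting.
import hashlib


def obs_fingerprint(obs):
    # obs is the visual observation returned by env.step; hash its raw bytes
    return hashlib.sha1(obs.tobytes()).hexdigest()

# start_states = set()
# ... inside the sampling loop above:
# start_states.add(obs_fingerprint(obs))
# print('got %d start states' % len(start_states))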
21, 24, 24, 24, 24, 24, 24, 18, 18, 30, 30, 30, 30, 30, 30, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 18, 18, 30, 30, 30, 24, 24, 24, 30, 30, 30, 30, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 30, 30, 30, 18, 24, 24, 24, 24, 18, 18, 18, 18, 30, 30, 30, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 18, 30, 30, 30, 24, 24, 18, 30, 30, 30, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 18, 30, 30, 18, 18, 24, 24, 24, 18, 18, 18, 18, 30, 30, 24, 24, 24, 24, 18, 18, 30, 30, 30, 30, 30, 30, 30, 18, 18, 18, 30, 30, 30, 30, 30, 30, 30, 30, 18, 24, 18, 24, 30, 30, 18, 18, 18, 24, 24, 18, 30, 30, 30, 24, 24, 24, 24, 24, 30, 30, 30, 30, 24, 24, 30, 30, 24, 18, 21, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 18, 18, 18, 18, 24, 30, 18, 24, 30, 24, 24, 24, 30, 30, 30, 30, 30, 24, 24, 24, 24, 24, 24, 24, 18, 18, 21, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 30, 30, 30, 30, 30, 30, 30, 18, 18 ]: env.step(action) env.seed(58) env.floor(10) env.reset() for action in [ 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 30, 30, 30, 30, 18, 18, 18, 18, 30, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 24, 24, 24, 18, 18, 18, 18, 18, 18, 18, 18, 30, 30, 30, 24, 30, 30, 30, 30, 30, 30, 30, 24, 24, 30, 30, 30, 30, 24, 24, 24, 24, 30, 30, 30, 30, 30, 30, 30, 24, 18, 18, 18, 18, 18, 24, 21, 18, 30, 30, 24, 18, 18, 18, 30, 30, 30, 30, 30, 30, 30, 30, 24, 24, 24, 30, 30, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 30, 24, 24, 24, 24, 18, 18, 18, 18, 18, 24, 18, 18, 24, 18, 18, 18, 18, 18, 18, 24, 18, 18, 18, 18, 24, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 24, 24, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 6, 6, 6, 6, 6, 6,
class WrappedObstacleTowerEnv(): def __init__( self, environment_filename=None, docker_training=False, worker_id=0, retro=False, timeout_wait=3000, realtime_mode=False, num_actions=3, stack_size=4, mobilenet=False, gray_scale=False, floor=0, visual_theme=0 ): ''' Arguments: environment_filename: The file path to the Unity executable. Does not require the extension. docker_training: Whether this is running within a docker environment and should use a virtual frame buffer (xvfb). worker_id: The index of the worker in the case where multiple environments are running. Each environment reserves port (5005 + worker_id) for communication with the Unity executable. retro: Resize visual observation to 84x84 (int8) and flattens action space. timeout_wait: Time for python interface to wait for environment to connect. realtime_mode: Whether to render the environment window image and run environment at realtime. ''' self._obstacle_tower_env = ObstacleTowerEnv(environment_filename, docker_training, worker_id, retro, timeout_wait, realtime_mode) if floor is not 0: self._obstacle_tower_env.floor(floor) self.start_floor = floor self.current_floor = floor self.mobilenet = mobilenet self.gray_scale = gray_scale self.retro = retro if mobilenet: self.state_size = [1280] elif gray_scale: self.state_size = [84, 84, 1] elif retro: self.state_size = [84, 84, 3] else: self.state_size = [168, 168, 3] self.stack_size = stack_size self.stack = [np.random.random(self.state_size).astype(np.float32) for _ in range(self.stack_size)] self.total_reward = 0 self.current_reward = 0 self.max_floor = 25 self.visual_theme = visual_theme self.id = worker_id def gray_preprocess_observation(self, observation): ''' Re-sizes obs to 84x84 and compresses to grayscale ''' observation = (observation * 255).astype(np.uint8) obs_image = Image.fromarray(observation) obs_image = obs_image.resize((84, 84), Image.NEAREST) gray_observation = np.mean(np.array(obs_image),axis=-1,keepdims=True) return gray_observation / 255 def mobile_preprocess_observation(self, observation): """ Re-sizes obs to 224x224 for mobilenet """ observation = (observation * 255).astype(np.uint8) obs_image = Image.fromarray(observation) obs_image = obs_image.resize((224, 224), Image.NEAREST) return self.mobilenet(np.array(obs_image)) def reset(self): # Reset env, stack and floor # (We save state as an attribute so child objects can access it) config = {"total-floors": 15} self.state = self._obstacle_tower_env.reset(config) self.state, reward, done, info = self._obstacle_tower_env.step(18) self.current_floor = self.start_floor self.stack = [np.random.random(self.state_size).astype(np.float32) for _ in range(self.stack_size)] self.total_reward = 0 self.current_reward = 0 # Preprocess current obs and add to stack if self.retro: observation = (self.state / 255).astype(np.float32) else: observation, key, time = self.state if self.mobilenet: observation = self.mobile_preprocess_observation(observation) elif self.gray_scale: observation = self.gray_preprocess_observation(observation) self.stack = self.stack[1:] + [observation] # Build our state (MUST BE A TUPLE) #one_hot_floor = tf.one_hot(self.current_floor, self.max_floor).numpy() one_hot_floor = np.zeros(self.max_floor) one_hot_floor[self.current_floor] += 1 floor_data = np.append(one_hot_floor, self.current_reward).astype(np.float32) stacked_state = np.concatenate(self.stack, axis=-1).astype(np.float32) if self.retro is True: ret_state = (stacked_state, floor_data) else: # Clip time to 2000, then normalize time = (2000. 
if time > 2000 else time) / 2000. key_time_data = np.array([key, time]).astype(np.float32) #key_time_data = np.array([key]).astype(np.float32) ret_state = (stacked_state, floor_data, key_time_data) return ret_state, info def step(self, action): # Convert int action to vector required by the env if self.retro: if action == 0: # forward action = 18 elif action == 1: # rotate camera left action = 24 elif action == 2: # rotate camera right action = 30 elif action == 3: # jump forward action = 21 elif action == 4: action = 6 elif action == 5: action = 12 else: if action == 0: # forward action = [1, 0, 0, 0] elif action == 1: # rotate camera left action = [1, 1, 0, 0] elif action == 2: # rotate camera right action = [1, 2, 0, 0] elif action == 3: # jump forward action = [1, 0, 1, 0] # Take the step and record data # (We save state as an attribute so child objects can access it) self.state, reward, done, info = self._obstacle_tower_env.step(action) # Keep track of current floor reward and total reward if reward >= 0.95: self.current_floor += 1 self.current_reward = 0 done = True else: self.current_reward += reward self.total_reward += reward if (done and reward < 0.95) or self.current_floor == 15: # Save info and reset when an episode ends info["episode_info"] = {"floor": self.current_floor, "total_reward": self.total_reward} ret_state, _ = self.reset() else: # Preprocess current obs and add to stack if self.retro: observation = (self.state / 255).astype(np.float32) else: observation, key, time = self.state if self.mobilenet: observation = self.mobile_preprocess_observation(observation) elif self.gray_scale: observation = self.gray_preprocess_observation(observation) self.stack = self.stack[1:] + [observation] # Build our state (MUST BE A TUPLE) #one_hot_floor = tf.one_hot(self.current_floor, self.max_floor).numpy() one_hot_floor = np.zeros(self.max_floor) one_hot_floor[self.current_floor] += 1 floor_data = np.append(one_hot_floor, self.current_reward).astype(np.float32) stacked_state = np.concatenate(self.stack, axis=-1).astype(np.float32) if self.retro is True: ret_state = (stacked_state, floor_data) else: # Clip time to 2000, then normalize time = (2000. if time > 2000 else time) / 2000. key_time_data = np.array([key, time]).astype(np.float32) #key_time_data = np.array([key]).astype(np.float32) ret_state = (stacked_state, floor_data, key_time_data) return ret_state, reward, done, info def close(self): self._obstacle_tower_env.close() def floor(self, floor): self._obstacle_tower_env.floor(floor) self.start_floor = floor