Code Example #1
 def testing(self):
     from keepitpossible.common import action_table
     self.table_action = action_table.create_action_table()
     self.MODEL.load()
     done = False
     reward = 0.0
     env = ObstacleTowerEnv(environment_filename=self.SCHEDULE.ENV_PATH,
                            worker_id=self.SCHEDULE.N_WORKER + 1,
                            retro=False,
                            realtime_mode=True)
     obs = env.reset()
     previous_preprocessed_observation_image = obs[0]
     while not done:
         action = self.MODEL.choose_action(
             previous_preprocessed_observation_image)
         # Take the action and get the observation, floors passed, done flag, and agent info
         for _ in self.table_action[int(action)]:
             observation, reward, done, info = env.step(_)
             print(
                 "Action_Chose: ",
                 action,
                 "Action: ",
                 _,
                 " Reward: ",
                 reward)
             if done:
                 break
         # Preprocess the data the model needs
         observation_image, keys, time_remaining = observation
         preprocessed_observation_image = observation_image
         previous_preprocessed_observation_image = preprocessed_observation_image
     env.close()
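The create_action_table() helper imported from keepitpossible.common is not shown in this listing. Judging from the identical createActionTable() method in Code Example #4 and the action mapping in Code Example #15, a minimal sketch (the branch meanings are assumptions) could look like this:

# Hypothetical sketch of create_action_table(), mirroring createActionTable()
# in Code Example #4. Each entry is a [movement, camera, jump, side] vector
# accepted by ObstacleTowerEnv.step() when retro=False.
def create_action_table():
    table = []
    for movement in range(3):      # 0: none, 1: forward, 2: backward
        for camera in range(3):    # 0: none, 1: rotate left, 2: rotate right
            for jump in range(2):  # 0: no jump, 1: jump
                table.append([movement, camera, jump, 0])
    return table                   # 18 action vectors in total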
Code Example #2
def main():
    if len(sys.argv) != 2:
        sys.stderr.write('Usage: record_improve.py <recording_path>\n')
        sys.exit(1)
    rec = Recording(sys.argv[1])
    env = ObstacleTowerEnv(os.environ['OBS_TOWER_PATH'],
                           worker_id=random.randrange(11, 20))
    try:
        env.seed(rec.seed)
        if rec.floor:
            env.floor(rec.floor)
        env.reset()
        i = 0
        for i, (action, rew) in enumerate(zip(rec.actions, rec.rewards)):
            _, real_rew, done, _ = env.step(action)
            if not np.allclose(real_rew, rew):
                print('mismatching result at step %d' % i)
                sys.exit(1)
            if done != (i == rec.num_steps - 1):
                print('invalid done result at step %d' % i)
                sys.exit(1)
        print('match succeeded')
    finally:
        env.close()
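The Recording class comes from the surrounding project and is not shown; the checker above relies only on its seed, floor, actions, rewards, and num_steps attributes. A minimal stand-in with that interface (loading from the recording path is omitted) might be:

# Hypothetical stand-in for the Recording interface used above; the real class
# also loads these fields from the recording path given on the command line.
from dataclasses import dataclass
from typing import Sequence

@dataclass
class RecordingStub:
    seed: int
    floor: int
    actions: Sequence[int]
    rewards: Sequence[float]

    @property
    def num_steps(self) -> int:
        return len(self.actions)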
Code Example #3
    # Trial run
    env = ObstacleTowerEnv('./ObstacleTower/obstacletower.exe',
                           worker_id=10,
                           retro=False,
                           realtime_mode=True)
    obs = env.reset()
    print("執行測試環境,如果要離開請按Q")
    previous_preprocessed_observation_image = np.reshape(obs[0], [-1])
    while True:
        action = GLOBAL_KPRUN.choose_action(
            previous_preprocessed_observation_image)
        # Multithreading can occasionally stall and produce NaN actions
        if np.isnan(action):
            action = np.random.randint(6, high=12)
        # Take the action and get the observation, floors passed, done flag, and agent info
        observation, reward, done, info = env.step(
            np.array(GLOBAL_KPRUN.tableAction[int(action)]))
        # Preprocess the data the model needs
        observation_image, keys, time_remaining = observation
        preprocessed_observation_image = np.reshape(observation_image, [-1])
        # exit when 'q' is pressed (assumes an OpenCV window elsewhere is capturing key presses)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
        previous_preprocessed_observation_image = preprocessed_observation_image
    env.close()

if __name__ == '__main__':
    # Create the model object
    GLOBAL_KPRUN = MODEL()
    # GLOBAL_KPRUN.load()
    # Create the threading events
    UPDATE_EVENT, ROLLING_EVENT = threading.Event(), threading.Event()
    # Not updating yet
Code Example #4
class Worker(object):
    def __init__(self,
                 envpath,
                 wid,
                 retro,
                 realtime_mode,
                 env_seed=0,
                 env_floor=0):
        self.wid = wid
        self.env = ObstacleTowerEnv(environment_filename=envpath,
                                    worker_id=wid,
                                    retro=retro,
                                    realtime_mode=realtime_mode)
        self.kprun = GLOBAL_KPRUN
        self.tableAction = self.createActionTable()
        # Configure the level (seed/floor)
        self.env_seed = env_seed
        self.env_floor = env_floor
        self.step = 0
        self.summary = tf.Summary(value=[
            tf.Summary.Value(tag="Stage_reward " + str(self.wid),
                             simple_value=0)
        ])
        self.kprun.train_writer.add_summary(self.summary, 0)

    def createActionTable(self):
        tableAction = []
        for a in range(0, 3):
            for b in range(0, 3):
                for c in range(0, 2):
                    tableAction.append([a, b, c, 0])
        # print("Action option: ", tableAction[0:17])
        return tableAction

    def reward_compute(self, done, reward_total, keys, previous_keys, reward,
                       previous_reward, time_remaining,
                       previous_time_remaining, previous_stage_time_remaining):
        # Reward shaping rules
        # reward is the floor-clear signal coming from the environment
        # keys is the number of keys collected
        # time_remaining is the time left
        # The maximum reward for clearing a floor is 10
        # A key is worth 5
        # A time orb only gives 0.5 for now, because remaining time is settled at
        # the end of a floor and the reward would otherwise compound.
        # If the floor is cleared, give ten times the clear reward - (stage start time - time remaining) / 1000
        # print("time_remaining ", time_remaining,
        #       " previous_time_remaining ", previous_time_remaining,
        #         " reward ", reward)
        # Passing through a green door that opens adds 0.1
        if (reward - previous_reward) > 0 and (reward - previous_reward) < 0.3:
            reward_total += 3
        elif (reward - previous_reward) > 0.9:
            # ***If the remaining time exceeds the stage's starting time, the subtraction becomes
            # ***a bonus, which could greatly increase the agent's tendency to eat time orbs.
            # ***Another option is to add remaining time / 1000 directly, which avoids the compounding.
            print("Pass ", reward, " Stage!")
            # reward_total += (reward - previous_reward) * 100 - \
            #                 (previous_stage_time_remaining - time_remaining)

            reward_total += 200
            # After a clear, carry the time into the next floor; store this floor's time for the next clear computation
            previous_time_remaining = time_remaining
            previous_stage_time_remaining = time_remaining
            # Lesson 1 repeat
            if reward > 6.5:
                # self.total_step +=1
                # if self.total_step >=5:
                #     done = True
                #     return reward_total, previous_stage_time_remaining, done
                self.env.seed(np.random.randint(5))
                # env.reset()
                done = True
            return reward_total, previous_stage_time_remaining, done

        # Assume the agent may grab an orb or key while clearing a floor, so the bonuses can stack
        if keys > previous_keys:
            # print("Get Key")
            reward_total += 5

        if previous_time_remaining < time_remaining and previous_time_remaining != 0:
            # print("Get time power up")
            reward_total += 2
        else:
            reward_total -= 0.5
        if done and previous_time_remaining > 100:
            print("Agent died")
            # The more time left when the agent dies, the bigger the penalty
            # reward_total -= (10 + time_remaining / 100)
            reward_total -= 100
        return reward_total, previous_stage_time_remaining, done

    def work(self):
        global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
        # Configure the level (seed/floor)
        self.env.seed(self.env_seed)
        self.env.floor(self.env_floor)
        # Loop until the target number of episodes is reached
        while not COORD.should_stop():
            # Track the step count
            self.step += 1
            # Reset the level
            obs = self.env.reset()
            # Initialization
            done = False
            stage_reward = 0.0
            reward = 0
            keys = 0
            # Used to detect time pickups; time_remaining does not exist before the first step, so define it up front
            time_remaining = 3000
            previous_stage_time_remaining = time_remaining
            # Preprocess the image
            # previous_preprocessed_observation_image = np.reshape(obs[0], [-1])
            previous_preprocessed_observation_image = obs[0]
            buffer_s, buffer_a, buffer_r = [], [], []
            # While the agent is still alive
            while not done:
                # If the model is being updated, wait for the update to finish
                if not ROLLING_EVENT.is_set():
                    # Wait for the update to finish
                    ROLLING_EVENT.wait()
                    # Clear the buffers and collect data with the new policy
                    buffer_s, buffer_a, buffer_r = [], [], []

                # Save the previous state for reward computation
                previous_keys = keys
                previous_reward = reward
                previous_time_remaining = time_remaining

                # Decide the action based on the previous state
                action = self.kprun.choose_action(
                    previous_preprocessed_observation_image)
                action = np.clip(np.random.normal(action, 1.), *[6, 12])

                # Take the action and get the observation, floors passed, done flag, and agent info
                observation, reward, done, info = self.env.step(
                    np.array(self.tableAction[int(action)]))

                # Preprocess the data the model needs
                observation_image, keys, time_remaining = observation
                # preprocessed_observation_image = np.reshape(
                #     observation_image, [-1])
                preprocessed_observation_image = observation_image
                stage_reward, previous_stage_time_remaining, done = self.reward_compute(
                    done=done,
                    reward_total=stage_reward,
                    keys=keys,
                    previous_keys=previous_keys,
                    reward=reward,
                    previous_reward=previous_reward,
                    time_remaining=time_remaining,
                    previous_time_remaining=previous_time_remaining,
                    previous_stage_time_remaining=previous_stage_time_remaining
                )
                # Normalize the reward
                stage_reward = (stage_reward + 8) / 8

                # Store this transition in the buffers
                buffer_s.append(np.array([preprocessed_observation_image]))
                buffer_a.append(action)
                buffer_r.append(stage_reward)

                # Save the image to reference at the next step
                previous_preprocessed_observation_image = preprocessed_observation_image

                # When it is time to update, handle it locally first.
                GLOBAL_UPDATE_COUNTER += 1
                # If the local buffer is full, prepare an update
                if len(buffer_s) == EP_LEN - \
                        1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                    v_s_ = self.kprun.get_v(preprocessed_observation_image)
                    # Compute discounted rewards
                    discounted_r = []
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
                        discounted_r.append(v_s_)
                    discounted_r.reverse()
                    # Reshape to the expected dimensions
                    bs, ba, br = np.vstack(buffer_s), np.vstack(
                        buffer_a), np.array(discounted_r)[:, np.newaxis]
                    # Put the data into the shared queue
                    QUEUE.put(bs)
                    QUEUE.put(ba)
                    QUEUE.put(br)
                    # print("len(buffer_s)", len(buffer_s))
                    # print("bs.shape", bs.shape)
                    # Clear the buffers
                    buffer_s, buffer_a, buffer_r = [], [], []
                    # If the global step count reaches the minimum batch size, do a full update
                    if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                        # Stop collecting data
                        ROLLING_EVENT.clear()
                        # Update PPO
                        UPDATE_EVENT.set()
                    # Stop training once the maximum number of episodes is reached
                    if GLOBAL_EP >= EP_MAX:
                        COORD.request_stop()
                        break
            # Log the reward
            self.summary = tf.Summary(value=[
                tf.Summary.Value(tag="Stage_reward " + str(self.wid),
                                 simple_value=stage_reward)
            ])
            self.kprun.train_writer.add_summary(self.summary, self.step)
            GLOBAL_EP += 1
            print(
                '{0:.1f}%'.format(GLOBAL_EP / EP_MAX * 100),
                '|W%i' % self.wid,
                '|Ep_r: %.2f' % stage_reward,
            )
        self.env.close()
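To make the shaping rules in reward_compute() above concrete, here is the arithmetic for a small hand-made sequence of events (the constants are the ones hard-coded above; the sequence itself is hypothetical):

# Worked example of the shaped reward accumulated over one episode:
# pass a green door, pick up a key, grab a time orb, take one step with
# no time gain, then clear the floor.
stage_reward = 0.0
stage_reward += 3       # 0 < reward delta < 0.3: green door
stage_reward += 5       # keys increased
stage_reward += 2       # time_remaining increased
stage_reward -= 0.5     # a step without a time pickup
stage_reward += 200     # reward delta > 0.9: floor cleared
print(stage_reward)     # 209.5 (dying with more than 100 time left would subtract 100 instead)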
Code Example #5
File: a2c.py Project: marykln/RL4OTC
        }
        obs = env.reset(config=config)
        steps = np.trim_zeros(steps)  # trim_zeros returns a new array rather than modifying in place

        def handcrafted_step(act, go):
            try:
                for i in range(go):
                    env.step(act)
            except IndexError:
                pass

        handcrafted_step(18, steps[0])
        handcrafted_step(6, steps[1])
        handcrafted_step(18, steps[2])

        s, reward, done, info = env.step(18)
        s = rgb2gray(s)
        s = np.expand_dims(s, axis=2)

        t = 0
        track_score = []
        track_r = []
        track_a = []
        track_s = []
        bad_seq = 0
        score = 0
        steps_after_key = 0
        while True:
            if RENDER: env.render()

            a = actor.choose_action(
Code Example #6
def main():
    basicConfig(level=INFO)
    env = ObstacleTowerEnv(str(PRJ_ROOT / 'obstacletower'), retro=False, worker_id=9)
    done = False
    env.floor(1)
    env.reset()

    screen = Screen()
    random_actor = RandomRepeatActor(continue_rate=0.9)
    random_actor.reset(schedules=[
        (Action.CAMERA_RIGHT, 3),
        (Action.CAMERA_LEFT, 6),
        (Action.CAMERA_RIGHT, 3),
        (Action.NOP, 5),
        (Action.FORWARD, 8),
        (Action.RIGHT, 2),
        (Action.LEFT, 4),
        (Action.RIGHT, 2),
    ])

    frame_history = FrameHistory(env)
    moving_checker = MovingChecker(frame_history)
    position_estimator = PositionEstimator(moving_checker)
    map_observation = MapObservation(position_estimator, moving_checker)
    event_handlers: List[EventHandler] = [
        frame_history,
        moving_checker,
        position_estimator,
        map_observation,
    ]

    while not done:
        for h in event_handlers:
            h.begin_loop()

        screen.show("original", frame_history.last_frame)
        cv2.waitKey(0)

        for h in event_handlers:
            h.before_step()

        action = random_actor.decide_action(moving_checker.did_move)
        obs, reward, done, info = env.step(action)
        if reward != 0:
            logger.info(f"Get Reward={reward} Keys={obs[1]}")
        # logger.info(f"Keys={obs[1]} Time Remain={obs[2]}")

        params = EventParamsAfterStep(action, obs, reward, done, info)
        for h in event_handlers:
            h.after_step(params)

        screen.show("map", map_observation.concat_images())

        if len(frame_history.small_frame_pixel_diffs) > 0:
            f1 = frame_history.small_frame_pixel_diffs[-1]
            if len(frame_history.small_frame_pixel_diffs) > 1:
                f2 = frame_history.small_frame_pixel_diffs[-2]
                f1 = np.concatenate((f2, f1), axis=1)
            screen.show("diff", f1)

        for h in event_handlers:
            h.end_loop()
Code Example #7
File: runner.py Project: Maggern3/SAC
state = env.reset()
#print(state.shape)
state = state[0]
state = TF.to_tensor(state)
print(state.size())
scores = []
mean_scores_100 = deque(maxlen=100)
version = 'v3'
for episode in range(400):
    timesteps = 0
    rewards = 0
    for steps in range(10000):
        timesteps += 1
        actions, actions_env_format = agent.select_actions(state)
        next_state, reward, done, info = env.step(actions_env_format)
        next_state = next_state[0]
        next_state = TF.to_tensor(next_state)
        agent.replay_buffer.add((state, actions, reward, next_state, done))    
        agent.train()
        state = next_state
        rewards += reward
        if(done):
            break
    scores.append(rewards)
    mean_scores_100.append(rewards)
    print('episode {} frames {} rewards {:.2f} mean score {:.2f}'.format(episode, timesteps, rewards, np.mean(mean_scores_100)))
    if(episode % 100 == 0):
        torch.save(agent.conv_net.state_dict(), 'checkpoints/conv_net_checkpoint_{}.pth'.format(version))
        torch.save(agent.critic_v.state_dict(), 'checkpoints/critic_v_checkpoint_{}.pth'.format(version))
        torch.save(agent.critic_q_1.state_dict(), 'checkpoints/critic_q_1_checkpoint_{}.pth'.format(version))
Code Example #8
# Interacting with the environment

obs = env.reset()
plt.imshow(obs[0])

# Get action meanings
print('Table of actions')
for action_id, action_meaning in enumerate(env.get_action_meanings()):
    print(action_id, action_meaning)

import signal


def env_closer(signo, handler):
    print('Closing the environment...')
    env.close()
    import sys
    sys.exit(1)


signal.signal(signal.SIGINT, env_closer)

while True:
    sampled_action = env.action_space.sample()
    print('Sampled action:', sampled_action)

    obs, reward, done, info = env.step(sampled_action)
    plt.imshow(obs[0])
    print('Reward after action', reward)
Code Example #9
class RandomAgent:
    """Random Agent that will play the specified game
      Args:
        env_path: Path to the Obstacle Tower executable to be played
        max_eps: Maximum number of episodes to run agent for.
    """
    def __init__(self,
                 env_path,
                 train=False,
                 evaluate=False,
                 eval_seeds=[],
                 max_eps=100,
                 save_dir=None,
                 plot=False):
        if train:
            self.env = ObstacleTowerEnv(env_path,
                                        worker_id=0,
                                        retro=False,
                                        realtime_mode=False,
                                        config=train_env_reset_config)
        else:
            if evaluate:
                self.env = ObstacleTowerEnv(env_path,
                                            worker_id=0,
                                            retro=False,
                                            realtime_mode=False,
                                            config=eval_env_reset_config)
                self.env = ObstacleTowerEvaluation(self.env, eval_seeds)
            else:
                self.env = ObstacleTowerEnv(env_path,
                                            worker_id=0,
                                            retro=False,
                                            realtime_mode=True,
                                            config=eval_env_reset_config)
        self.max_episodes = max_eps
        self.global_moving_average_reward = 0
        self.save_dir = save_dir
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        self.plot = plot
        self.res_queue = Queue()

    def train(self):
        start_time = time.time()
        reward_avg = 0
        global_steps = 0
        moving_average_rewards = []
        for episode in range(self.max_episodes):
            done = False
            self.env.reset()
            reward_sum = 0.0
            steps = 0
            while not done:
                # Sample randomly from the action space and step
                _, reward, done, _ = self.env.step(
                    self.env.action_space.sample())
                steps += 1
                global_steps += 1
                reward_sum += reward

            if self.plot:
                # Record statistics
                moving_average_rewards.append(reward_sum)

            reward_avg += reward_sum
            self.global_moving_average_reward = record(
                episode, reward_sum, 0, self.global_moving_average_reward,
                self.res_queue, 0, steps, global_steps)
        end_time = time.time()
        print("\nTraining complete. Time taken = {} secs".format(end_time -
                                                                 start_time))
        final_avg = reward_avg / float(self.max_episodes)
        print("Average score across {} episodes: {}".format(
            self.max_episodes, final_avg))

        if self.plot:
            plt.plot(moving_average_rewards)
            plt.ylabel('Moving average episode reward')
            plt.xlabel('Step')
            plt.savefig(
                os.path.join(self.save_dir, 'model_random_moving_average.png'))

        self.env.close()
        return final_avg

    def play_single_episode(self):
        action_space = ActionSpace()
        print("Playing single episode...")
        done = False
        step_counter = 0
        reward_sum = 0
        obs = self.env.reset()
        state, _, _, _ = obs

        try:
            while not done:
                action = self.env.action_space.sample()
                obs, reward, done, info = self.env.step(action)
                reward_sum += reward
                print("{}. Reward: {}, action: {}".format(
                    step_counter, reward_sum,
                    action_space.get_action_meaning(action)))
                step_counter += 1
        except KeyboardInterrupt:
            print("Received Keyboard Interrupt. Shutting down.")
        finally:
            if not self.evaluate:
                self.env.close()
            return reward_sum

    def evaluate(self):
        # run episodes until evaluation is complete
        while not self.env.evaluation_complete:
            episode_reward = self.play_single_episode()

        pprint(self.env.results)
        self.env.close()
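A hypothetical way to drive this agent; the executable path is a placeholder, and train_env_reset_config must be defined in the surrounding module for the training branch:

# Hypothetical usage of RandomAgent; path and episode count are placeholders.
agent = RandomAgent(env_path='./ObstacleTower/obstacletower',
                    train=True,
                    max_eps=10,
                    save_dir='./model_files/',
                    plot=True)
avg_score = agent.train()
print('Average reward over the random episodes:', avg_score)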
Code Example #10
class Worker(threading.Thread):
    episode_count = 0
    mean_reward = 0
    best_score = 0
    global_steps = 0
    save_lock = threading.Lock()

    def __init__(self, result_queue, idx, save_dir, params):
        super(Worker, self).__init__()
        self.result_queue = result_queue
        self.worker_idx = idx
        self.save_dir = save_dir
        self.model_path = os.path.join(self.save_dir, 'model_a3c')

        self.env = ObstacleTowerEnv(params['env_path'],
                                    worker_id=self.worker_idx,
                                    retro=False,
                                    realtime_mode=False,
                                    greyscale=False,
                                    config=train_env_reset_config)

        self.action_size = params['action_size']
        self._action_lookup = params['action_lookup']
        self.input_shape = self.env.observation_space[0].shape  # (84, 84, 3)
        self._last_health = 99999.
        self._last_keys = 0

        self.global_model = params['global_model']
        # self.local_model = CNN(self.action_size, self.input_shape)
        self.local_model = CnnGru(self.action_size, self.input_shape)

        self.ac_ckpt = params['ckpt']
        self.ac_manager = params['ckpt_mgr']

        self.current_time = params['log_timestamp']
        train_log_dir = './logs/' + self.current_time + '/worker_' + str(
            self.worker_idx)
        self.worker_summary_writer = tf.summary.create_file_writer(
            train_log_dir)

        self.timesteps = params['timesteps']
        self.batch_size = params['batch_size']
        self.gamma = params['gamma']
        self.lr = params['lr']
        self.opt = params['optimizer']
        self.eps = np.finfo(np.float32).eps.item()

    def get_updated_reward(self, reward, new_health, new_keys, done):
        new_health = float(new_health)
        new_reward = 0.0
        if done:  # reset params when game is terminated
            self._last_health = 99999.
            self._last_keys = 0
        else:
            # opened a door, solved a puzzle, picked up a key
            if 0.1 <= reward < 1:
                new_reward += 0.5

            # crossing a floor - between [1, 4]
            if reward >= 1:
                new_reward += (new_health / 10000)

            # found time orb / crossed a floor
            if new_health > self._last_health:
                new_reward += 0.5

        return new_reward

    def log_worker_metrics(self, episode_reward, loss, step):
        with self.worker_summary_writer.as_default():
            with tf.name_scope('worker'):
                tf.summary.scalar('reward', episode_reward, step=step)
                tf.summary.scalar('loss', loss, step=step)
            self.worker_summary_writer.flush()

    def run(self):
        mem = Memory()
        ep_count = 0
        timestep = 0
        entropy_term = 0
        ep_reward = 0.
        ep_steps = 0
        ep_loss = 0.

        done = False
        obs = self.env.reset()
        state, self._last_keys, self._last_health, _ = obs

        while timestep <= self.timesteps:
            i = 0
            with tf.GradientTape() as tape:
                while i < self.batch_size:
                    # collect experience
                    # get action as per policy
                    state = tf.convert_to_tensor(state)
                    state = tf.expand_dims(state, axis=0)
                    action_probs, critic_value = self.local_model(
                        [state, float(self._last_health)], training=True)

                    entropy = -np.sum(action_probs * np.log(action_probs))
                    entropy_term += entropy

                    # sample an action from the policy distribution
                    dist = tfp.distributions.Categorical(probs=action_probs,
                                                         dtype=tf.float32)
                    action_index = int(dist.sample().numpy())
                    action = self._action_lookup[action_index]

                    # perform action in game env
                    for _ in range(4):  # frame skipping (avoid shadowing the batch counter i)
                        obs, reward, done, _ = self.env.step(action)
                        state, new_keys, new_health, cur_floor = obs
                        reward = self.get_updated_reward(
                            reward, new_health, new_keys, done)
                        self._last_health = new_health
                        self._last_keys = new_keys
                        ep_reward += reward
                        ep_steps += 1
                        i += 1
                        timestep += 1

                    # store experience
                    mem.store(action_prob=tf.math.log(
                        action_probs[0, action_index]),
                              value=critic_value[0, 0],
                              reward=reward)

                    if done:
                        break

                # backpropagation
                total_loss = self.local_model.compute_loss(
                    mem, state, done, self.gamma, self.eps, entropy_term)
                ep_loss += total_loss
                Worker.global_steps += ep_steps

            grads = tape.gradient(total_loss,
                                  self.local_model.trainable_variables
                                  )  # calculate local gradients
            self.opt.apply_gradients(
                zip(grads, self.global_model.trainable_variables)
            )  # send local gradients to global model
            self.local_model.set_weights(self.global_model.get_weights(
            ))  # update local model with new weights
            mem.clear()

            if done:
                Worker.mean_reward = (Worker.mean_reward * Worker.episode_count
                                      + ep_reward) / (Worker.episode_count + 1)

                self.log_worker_metrics(ep_reward, ep_loss, ep_count)
                print(
                    "Episode: {} | Mean Reward: {:.3f} | Episode Reward: {:.3f} | Loss: {:.3f} | Steps: {} | Total Steps: {} | Worker: {}"
                    .format(Worker.episode_count, Worker.mean_reward,
                            ep_reward, ep_loss, ep_steps, Worker.global_steps,
                            self.worker_idx))
                self.result_queue.put((Worker.mean_reward, total_loss))
                Worker.episode_count += 1
                ep_count += 1

                obs = self.env.reset()
                state, _, _, _ = obs

                # use a lock to save local model and to print to prevent data races.
                if ep_reward > Worker.best_score:
                    with Worker.save_lock:
                        self.ac_manager.save()
                        print("Saved checkpoint for step {}".format(
                            int(self.ac_ckpt.step)))
                        self.ac_ckpt.step.assign_add(1)

                        keras.models.save_model(self.global_model,
                                                self.model_path)
                        print('\nSaved best model to: {}, episode score: {}\n'.
                              format(self.model_path, ep_reward))
                        Worker.best_score = ep_reward

                entropy_term = 0
                ep_reward = 0.
                ep_steps = 0
                ep_loss = 0.

        self.result_queue.put(None)
        self.env.close()
Code Example #11
config = {'tower-seed': 0, 'starting-floor': 10, 'agent-perspective': 0, 'allowed-rooms': 1, 'allowed-modules': 0, 'allowed-floors': 0}
obs = env.reset(config=config)

action = env.action_space.sample()
allowed_action = False
allowed_actions = np.array([np.array([1, 0, 0, 0]), np.array([0, 1, 0, 0]), np.array([0, 2, 0, 0]), np.array([1, 0, 1, 0])])

# Keep sampling until the sampled action is one of the allowed actions
while not allowed_action:
    if (allowed_actions == action).all(1).any():
        allowed_action = True
    else:
        action = env.action_space.sample()

action = np.array([1, 0, 0, 0])
for _ in range(16):
    obs, reward, done, info = env.step(action)
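The (allowed_actions == action).all(1).any() test above relies on NumPy broadcasting: comparing the sampled (4,) vector against the (4, 4) array of allowed rows yields a boolean matrix, .all(1) collapses each row to "every component matched", and .any() asks whether any allowed row matched. A quick check with hypothetical vectors:

import numpy as np

allowed = np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 2, 0, 0], [1, 0, 1, 0]])
print((allowed == np.array([0, 2, 0, 0])).all(1).any())  # True: matches the third row
print((allowed == np.array([2, 0, 0, 0])).all(1).any())  # False: no row matches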
Code Example #12
    args = parser.parse_args()
    
    env = ObstacleTowerEnv(args.environment_filename, docker_training=args.docker_training, realtime_mode=True)

    model = get_model()
    optimizer = tf.train.AdamOptimizer()
    checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
    checkpoint.restore(tf.train.latest_checkpoint('./tf_saves/'))

    total_count = 0
    for i in range(0, 101):
        #setup environment
        env.seed(i)
        obs = env.reset()
        reward = 0
        actions = []
        rerun_actions = False
        obs = env.reset()
        while True:
            observation = process_image(obs)
            prediction = model(tf.cast([observation], dtype=tf.float32))[0]
            print('prediction', prediction)
            selection = np.argmax(prediction)
            print('selection', selection)
            action = action_options[selection]
            print('action', action)
            obs, step_reward, done, info = env.step(action)


    env.close()
Code Example #13
class StableA2C():
    def __init__(self,
                 env_path,
                 train,
                 evaluate,
                 policy_name='CnnPolicy',
                 save_dir='./model_files/',
                 eval_seeds=[]):
        self.save_dir = save_dir
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        self.model_path = os.path.join(self.save_dir, 'model_stable_a2c')
        self.log_dir = './logs/stable_a2c'
        self.policy_name = policy_name
        self.evaluate = evaluate

        if train:
            self.env = ObstacleTowerEnv(env_path,
                                        worker_id=0,
                                        retro=True,
                                        realtime_mode=False,
                                        config=train_env_reset_config)
        else:
            if evaluate:
                self.env = ObstacleTowerEnv(env_path,
                                            worker_id=0,
                                            retro=True,
                                            realtime_mode=False,
                                            config=eval_env_reset_config)
                self.env = ObstacleTowerEvaluation(self.env, eval_seeds)
            else:
                self.env = ObstacleTowerEnv(env_path,
                                            worker_id=0,
                                            retro=True,
                                            realtime_mode=True,
                                            config=eval_env_reset_config)

    def load_model(self):
        print('Loading model from: {}'.format(self.model_path))
        model = A2C.load(self.model_path)
        model.set_env(self.env)
        model.tensorboard_log = self.log_dir
        return model

    def train(self, timesteps=10000, continue_training=False):
        start_time = time.time()
        if not continue_training:
            print("Initializing from scratch")
            model = A2C(self.policy_name,
                        self.env,
                        verbose=1,
                        tensorboard_log=self.log_dir)
        else:
            model = self.load_model()
            print("Restored from {}".format(self.model_path))

        model.learn(total_timesteps=timesteps)
        print('\nTraining complete. Time taken = {} secs'.format(time.time() -
                                                                 start_time))
        model.save(self.model_path)

    def play_single_episode(self):
        """ have the trained agent play a single game """
        action_space = ActionSpace()
        done = False
        reward_sum = 0
        step_counter = 0

        model = self.load_model()
        obs = self.env.reset()
        try:
            print("Playing single episode...")
            while not done:
                action, _states = model.predict(obs)
                obs, reward, done, info = self.env.step(action)
                print("{}. Reward: {}, action: {}".format(
                    step_counter, reward_sum,
                    action_space.get_full_action_meaning(action)))
                self.env.render()
                step_counter += 1
                reward_sum += reward
        except KeyboardInterrupt:
            print("Received Keyboard Interrupt. Shutting down.")
        finally:
            if not self.evaluate:
                self.env.close()
                print("Environment closed.")
            print("Game play completed.")
            return reward_sum

    def evaluate(self):
        """ run episodes until evaluation is complete """
        while not self.env.evaluation_complete:
            episode_reward = self.play_single_episode()

        pprint(self.env.results)
        self.env.close()
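A hypothetical driver for this class, training a fresh model and then replaying one episode; the executable path and timestep budget are placeholders, and the reset-config dicts come from the surrounding module:

# Hypothetical usage of StableA2C; path and timesteps are placeholders.
a2c_agent = StableA2C(env_path='./ObstacleTower/obstacletower',
                      train=True,
                      evaluate=False)
a2c_agent.train(timesteps=10000)
score = a2c_agent.play_single_episode()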
Code Example #14
File: run_coach.py Project: pjrodrig/OTC-entry
 env.reset()
 reward = 0
 actions = []
 rerun_actions = False
 reward_total = 0
 while reward_total < 2:
     reward = 0
     if (i_path < len(seed_paths)):
         reward_before = reward_total
         while i_path < len(
                 seed_paths) and reward_total < reward_before + 1:
             current_path = paths[int(seed_paths[int(i_path)])]
             while path_i < len(
                     current_path) and reward_total < reward_before + 1:
                 current_action = int(current_path[path_i])
                 obs, reward, done, info = env.step(current_action)
                 reward_total += reward
                 print('loop reward', reward)
                 path_i += 1
             i_path += 1
             path_i = 0
     print('before if reward', reward)
     if (reward == 0):
         if rerun_actions:
             for action in actions:
                 env.step(action)
             rerun_actions = False
         else:
             print("left: 1, right: 2")
             action = input("action: ")
             if action == "restart":
Code Example #15
class WrappedObstacleTowerEnv():
    def __init__(self,
                 environment_filename=None,
                 docker_training=False,
                 worker_id=0,
                 retro=False,
                 timeout_wait=30,
                 realtime_mode=False,
                 num_actions=3,
                 mobilenet=False,
                 gray_scale=False,
                 autoencoder=None,
                 floor=0):
        '''
        Arguments:
          environment_filename: The file path to the Unity executable.  Does not require the extension.
          docker_training: Whether this is running within a docker environment and should use a virtual
            frame buffer (xvfb).
          worker_id: The index of the worker in the case where multiple environments are running.  Each
            environment reserves port (5005 + worker_id) for communication with the Unity executable.
          retro: Resize visual observation to 84x84 (int8) and flattens action space.
          timeout_wait: Time for python interface to wait for environment to connect.
          realtime_mode: Whether to render the environment window image and run environment at realtime.
        '''

        self._obstacle_tower_env = ObstacleTowerEnv(environment_filename,
                                                    docker_training, worker_id,
                                                    retro, timeout_wait,
                                                    realtime_mode)
        if floor != 0:
            self._obstacle_tower_env.floor(floor)
        self._flattener = ActionFlattener([3, 3, 2, 3])
        self._action_space = self._flattener.action_space
        self.mobilenet = mobilenet
        self.gray_scale = gray_scale
        if mobilenet:
            self.image_module = WrappedKerasLayer(retro, self.mobilenet)
        self._done = False
        if autoencoder:
            print("Loading autoencoder from {}".format(autoencoder))
            self.autoencoder = build_autoencoder(autoencoder)
            print("Done.")
        else:
            self.autoencoder = None

    def action_spec(self):
        return self._action_spec

    def observation_spec(self):
        return self._observation_spec

    def gray_process_observation(self, observation):
        observation = (observation * 255).astype(np.uint8)
        obs_image = Image.fromarray(observation)
        obs_image = obs_image.resize((84, 84), Image.NEAREST)
        gray_observation = np.mean(np.array(obs_image), axis=-1, keepdims=True)
        gray_observation = (gray_observation / 255)

        # gray_observation = self.autoencoder.predict(gray_observation)
        return gray_observation

    def _preprocess_observation(self, observation):
        """
        Re-sizes visual observation to 224x224 (the MobileNet input size)
        """
        observation = (observation * 255).astype(np.uint8)
        obs_image = Image.fromarray(observation)
        obs_image = obs_image.resize((224, 224), Image.NEAREST)
        return np.array(obs_image)

    def reset(self):
        observation = self._obstacle_tower_env.reset()
        observation, key, time = observation
        self._done = False
        if self.mobilenet:
            if self.autoencoder:
                observation = self.autoencoder.predict(observation[None, :])[0]
            return self.image_module(self._preprocess_observation(
                observation)), observation, key, time
        elif self.gray_scale:
            gray_observation = self.gray_process_observation(observation)
            if self.autoencoder:
                gray_observation = self.autoencoder.predict(
                    gray_observation[None, :])[0]
            return gray_observation, observation
        else:
            return self._preprocess_observation(observation), observation

    def step(self, action):
        #if self._done:
        #    return self.reset()

        if action == 0:  # forward
            action = [1, 0, 0, 0]
        elif action == 1:  # rotate camera left
            action = [0, 1, 0, 0]
        elif action == 2:  # rotate camera right
            action = [0, 2, 0, 0]
        elif action == 3:  # jump forward
            action = [1, 0, 1, 0]
        # elif action == 5:
        #     action = [2, 0, 0, 0]
        # elif action == 6:
        #     action = [0, 0, 0, 1]
        # elif action == 7:
        #     action = [0, 0, 0, 2]

        observation, reward, done, info = self._obstacle_tower_env.step(action)
        observation, key, time = observation
        self._done = done

        if self.mobilenet:
            if self.autoencoder:
                observation = self.autoencoder.predict(observation[None, :])[0]
            return (self.image_module(
                self._preprocess_observation(observation)), reward, done,
                    info), observation, key, time
        elif self.gray_scale:
            gray_observation = self.gray_process_observation(observation)
            if self.autoencoder:
                gray_observation = self.autoencoder.predict(
                    gray_observation[None, :])[0]
            return (gray_observation, reward, done, info), observation
        else:
            return (self._preprocess_observation(observation), reward, done,
                    info), observation

    def close(self):
        self._obstacle_tower_env.close()

    def floor(self, floor):
        self._obstacle_tower_env.floor(floor)
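A minimal loop over this wrapper in its default (non-MobileNet, non-grayscale) mode; the executable path and worker id are placeholders, and the discrete actions 0-3 map to forward, camera left, camera right, and jump forward as in step() above:

# Hypothetical usage of WrappedObstacleTowerEnv; path and worker_id are placeholders.
wrapped = WrappedObstacleTowerEnv('./ObstacleTower/obstacletower', worker_id=5)
state, raw_obs = wrapped.reset()
for _ in range(10):
    (state, reward, done, info), raw_obs = wrapped.step(0)  # 0 = move forward
    if done:
        break
wrapped.close()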
Code Example #16
File: run_data.py Project: pjrodrig/OTC-entry
        env.seed(i)
        obs = env.reset()
        reward = 0
        actions = []
        rerun_actions = False
        while reward < 1:
            if (i_path < len(seed_paths)):
                while i_path < len(seed_paths):
                    current_path = paths[int(seed_paths[int(i_path)])]
                    while path_i < len(current_path):
                        current_action = int(current_path[path_i])
                        observation = process_image(obs)
                        total_count += 1
                        x.append([str(ob) for ob in observation])
                        y.append(action_map[str(current_action)])
                        obs, step_reward, done, info = env.step(current_action)
                        reward += step_reward
                        path_i += 1
                    i_path += 1
                    path_i = 0

        print("x", np.array(x).shape)
        print("y", np.array(y).shape)
        with open('./data/x_data_' + str(i), 'w') as x_data:
            for xi in x:
                x_data.write(' '.join(xi) + '\n')
        with open('./data/y_data_' + str(i), 'w') as y_data:
            for yi in y:
                y_data.write(str(yi) + ' ')
Code Example #17
class Worker(threading.Thread):
    episode_count = 0
    running_reward = 0
    best_score = 0
    global_steps = 0
    save_lock = threading.Lock()

    def __init__(self, result_queue, params, save_dir):
        super(Worker, self).__init__()
        self.result_queue = result_queue
        self.save_dir = save_dir
        self.model_path = os.path.join(self.save_dir, 'model_a3c_distributed')

        self.env = ObstacleTowerEnv(params['env_path'],
                                    worker_id=1,
                                    retro=False,
                                    realtime_mode=False,
                                    greyscale=False,
                                    config=train_env_reset_config)

        self.action_size = params['action_size']
        self._action_lookup = params['action_lookup']
        self.input_shape = self.env.observation_space[0].shape  # (84, 84, 3)
        self._last_health = 99999.
        self._last_keys = 0

        self.global_model = params['global_model']
        self.mirrored_strategy = tf.distribute.MirroredStrategy()
        with self.mirrored_strategy.scope():
            # self.local_model = CNN(self.action_size, self.input_shape)
            self.local_model = CnnGru(self.action_size, self.input_shape)

        self.current_time = params['log_timestamp']
        train_log_dir = './logs/' + self.current_time + '/worker_1'
        self.worker_summary_writer = tf.summary.create_file_writer(
            train_log_dir)

        self.timesteps = params['timesteps']
        self.batch_size = params['batch_size']
        self.gamma = params['gamma']
        self.lr = params['lr']
        self.opt = params['optimizer']
        self.eps = np.finfo(np.float32).eps.item()
        self.ep_loss = 0.0

    def get_updated_reward(self, reward, new_health, new_keys, done):
        new_health = float(new_health)
        if done:  # penalize when game is terminated
            self._last_health = 99999.
            self._last_keys = 0
            reward = -1
        else:
            # crossing a floor- between [1, 4]
            if reward >= 1:
                reward += (new_health / 10000)

            # found time orb / crossed a floor
            if new_health > self._last_health:
                reward += 0.1

            # found a key
            if new_keys > self._last_keys:
                reward += 0.1

        return reward

    def log_worker_metrics(self, episode_reward, avg_reward, loss, step):
        with self.worker_summary_writer.as_default():
            tf.summary.scalar('reward', episode_reward, step=step)
            tf.summary.scalar('moving_reward', avg_reward, step=step)
            tf.summary.scalar('loss', loss, step=step)
            self.worker_summary_writer.flush()

    def run(self):
        mem = Memory()
        rewards = []
        ep_count = 1
        timestep = 0
        entropy_term = 0
        ep_reward = 0.
        ep_steps = 0
        ep_loss = 0.

        done = False
        obs = self.env.reset()
        state, _, _, _ = obs

        while timestep <= self.timesteps:
            with tf.GradientTape() as tape:
                for i in range(self.batch_size):
                    # collect experience
                    # get action as per policy
                    state = tf.convert_to_tensor(state)
                    state = tf.expand_dims(state, axis=0)
                    action_probs, critic_value = self.local_model(
                        state, training=True)

                    entropy = -np.sum(action_probs * np.log(action_probs))
                    entropy_term += entropy

                    # sample an action from the policy distribution
                    action_index = np.random.choice(self.action_size,
                                                    p=np.squeeze(action_probs))
                    action = self._action_lookup[action_index]

                    # perform action in game env
                    for i in range(4):  # frame skipping
                        obs, reward, done, _ = self.env.step(action)
                        state, new_keys, new_health, cur_floor = obs

                        reward = self.get_updated_reward(
                            reward, new_health, new_keys, done)
                        self._last_health = new_health
                        self._last_keys = new_keys

                        ep_reward += reward
                        ep_steps += 1
                        timestep += 1

                    # store experience
                    mem.store(action_prob=action_probs[0, action_index],
                              value=critic_value[0, 0],
                              reward=reward)

                    if done:
                        break

                # backpropagation
                total_loss = self.local_model.compute_loss(
                    mem, state, done, self.gamma, self.eps, entropy_term)
                ep_loss += total_loss
                Worker.global_steps += ep_steps

            grads = tape.gradient(total_loss,
                                  self.local_model.trainable_variables
                                  )  # calculate local gradients
            # self.opt.apply_gradients(zip(grads, self.global_model.trainable_variables))  # send local gradients to global model
            # self.local_model.set_weights(self.global_model.get_weights())  # update local model with new weights
            mem.clear()

            if done:
                rewards.append(ep_reward)
                Worker.running_reward = sum(rewards[-10:]) / len(rewards[-10:])  # mean of the last (up to) 10 episodes

                self.log_worker_metrics(ep_reward, Worker.running_reward,
                                        ep_loss, ep_count)
                print(
                    "Episode: {} | Average Reward: {:.3f} | Episode Reward: {:.3f} | Loss: {:.3f} | Steps: {} | Total Steps: {} | Worker: {}"
                    .format(Worker.episode_count, Worker.running_reward,
                            ep_reward, ep_loss, ep_steps, Worker.global_steps,
                            1))
                self.result_queue.put((Worker.running_reward, total_loss))
                Worker.episode_count += 1
                ep_count += 1

                obs = self.env.reset()
                state, _, _, _ = obs

                # use a lock to save local model and to print to prevent data races.
                if ep_reward > Worker.best_score:
                    with Worker.save_lock:
                        print(
                            '\nSaving best model to: {}, episode score: {}\n'.
                            format(self.model_path, ep_reward))
                        keras.models.save_model(self.local_model,
                                                self.model_path)
                        Worker.best_score = ep_reward

                entropy_term = 0
                ep_reward = 0.
                ep_steps = 0
                ep_loss = 0.

        keras.models.save_model(self.local_model, self.model_path)
        self.result_queue.put(None)
        self.env.close()
Code Example #18
        # tower 0, floor 10 = second room holds key
        config = {
            'tower-seed': 0,
            'starting-floor': 10,
            'dense-reward': 1,
            'agent-perspective': 1,
            'allowed-rooms': 1,
            'allowed-modules': 0,
            'allowed-floors': 0
        }
        obs = env.reset(config=config)
        next_observe = obs

        for _ in range(random.randint(1, 20)):
            observe = next_observe
            next_observe, _, _, _ = env.step(1)

        state = pre_processing(next_observe, observe)
        history = np.stack((state, state, state, state), axis=2)
        history = np.reshape([history], (1, 84, 84, 4))
        print(history)

        while not done:
            env.render()
            step += 1
            observe = next_observe

            action = agent.get_action(history)
            print("Action:" + str(action))

            fake_action = action
Code Example #19
def main():
    # Parse the command-line parameters
    parser = otc_arg_parser()
    args = parser.parse_args()

    #Challenge environment
    # if args.env == 'ObtRetro-v6':
    #     env = ObstacleTowerEnv(
    #         '/home/home/Data/Carmen/py_workspace/ObstacleTower_v3/ObstacleTower-v3.1/obstacletower.x86_64',
    #         timeout_wait=6000,
    #         retro=args.retro,
    #         realtime_mode=args.test)
    #     env = RetroWrapper(env, args.sample_normal)
    #     env = OTCPreprocessing(env, args.action_reduction)
    #     # if show_obs:
    #     #     env = RenderObservations(env)
    #     #     env = KeyboardControlWrapper(env)
    # else:
    env = ObstacleTowerEnv(
        '/home/home/Data/Carmen/py_workspace/ObstacleTower_v3/ObstacleTower-v3.1/obstacletower.x86_64',
        retro=args.retro,
        realtime_mode=args.test,
        timeout_wait=6000)

    #env = ObstacleTowerEnv('OBSTACLE_TOWER_PATH', retro=args.retro, realtime_mode=args.test, timeout_wait=6000)

    #Dict of actions created by the ObstacleTowerEnv Class of obstacle_tower_env library
    #print("ACTIONS:", env._flattener.action_lookup)

    print('FEATURES :', args.features)

    #Preprocess the environment (Grey Scales and action space reduction)
    env = OTCPreprocessing(env, args.action_reduction, args.features)
    env = DummyVecEnv([lambda: env])
    #env = VecEnv(1, env.observation_space, env.action_space)

    print("ACTION SPACE  ///////////:", env.action_space)
    print("OBSERVATION SPACE ///////////////:", env.observation_space)
    #env = make_vec_env(env, n_envs=4)

    ########Training########

    #Study of the impact of different values of the PPO params
    if args.study:
        params_test(MlpPolicy, env)

    #If no Study Mode
    else:
        #If no Test Mode
        if not args.test:

            seed = 0
            random.seed(seed)

            if args.pretrained_model:

                t = 300000

                model = PPO2.load(args.pretrained_model,
                                  env=env,
                                  tensorboard_log=args.tensorboard_logdir)

            else:

                t = 0

                #If Generalized Advantage Estimator is used
                if args.use_gae:

                    model = PPO2(MlpPolicy,
                                 env,
                                 n_steps=args.num_steps,
                                 verbose=1,
                                 tensorboard_log=args.tensorboard_logdir,
                                 cliprange=args.clip_param,
                                 learning_rate=args.lr,
                                 ent_coef=args.entropy_coef,
                                 vf_coef=args.value_loss_coef,
                                 max_grad_norm=args.max_grad_norm,
                                 gamma=args.gamma,
                                 lam=args.gae_lambda,
                                 noptepochs=args.ppo_epoch,
                                 seed=seed)

                #If Generalized Advantage Estimator is not used
                else:

                    model = PPO2(MlpPolicy,
                                 env,
                                 n_steps=args.num_steps,
                                 verbose=1,
                                 tensorboard_log=args.tensorboard_logdir,
                                 cliprange=args.clip_param,
                                 learning_rate=args.lr,
                                 ent_coef=args.entropy_coef,
                                 vf_coef=args.value_loss_coef,
                                 max_grad_norm=args.max_grad_norm,
                                 gamma=args.gamma,
                                 noptepochs=args.ppo_epoch,
                                 seed=seed)
        else:

            model = PPO2.load(args.pretrained_model, env=env)

        #model.learn(total_timesteps=50000)
        #model.save("ObstacleTower_prueba")

        filename = 'argsparams.txt'
        os.makedirs(args.results_dir, exist_ok=True)
        myfile = open(args.results_dir + filename, 'a')
        myfile.write(
            'clip range: %f \n learning rate: %f \n entropy coefficient: %f \n value loss coefficient: %f \n '
            'max gradient norm: %f \n gamma: %f \n ppo epochs: %f \n' %
            (args.clip_param, args.lr, args.entropy_coef, args.value_loss_coef,
             args.max_grad_norm, args.gamma, args.ppo_epoch))
        myfile.close()

        if not args.test:
            while t < args.num_env_steps:
                #TRAIN MODEL
                if t == 0:
                    model.learn(total_timesteps=args.eval_interval)

                else:
                    model.learn(total_timesteps=args.eval_interval,
                                reset_num_timesteps=False)

                os.makedirs(GLOBAL_PATH, exist_ok=True)
                print("Saving in '" + GLOBAL_PATH + "'")
                model.save(GLOBAL_PATH + args.training_name + "_" +
                           str(int(t)).zfill(10))

                avg_reward, avg_floor = test(
                    t, model, env=env, global_path=args.results_dir)  # Test
                log('T = ' + str(t) + ' / ' + str(args.num_env_steps) +
                    ' | Avg. reward: ' + str(avg_reward) + ' | Avg. floor: ' +
                    str(avg_floor))

                t += args.eval_interval
        else:
            obs = env.reset()
            t = 0
            while t < args.num_env_steps:

                action, _states = model.predict(obs)
                obs, rewards, done, info = env.step(action)
                #print('action :', info)
                env.render('rgb_array')
Code Example #20
import os
import random

from obstacle_tower_env import ObstacleTowerEnv

counter = {}
env = ObstacleTowerEnv(os.environ['OBS_TOWER_PATH'], worker_id=2)
while True:
    env.seed(random.randrange(100))
    env.reset()
    for _ in range(50):
        obs, _, _, _ = env.step(0)
    key = str(obs.flatten().tolist())
    counter[key] = True
    print('got %d start states' % len(counter))
Code Example #21
File: hang.py Project: wwxFromTju/obs-tower2
        21, 24, 24, 24, 24, 24, 24, 18, 18, 30, 30, 30, 30, 30, 30, 24, 24, 24,
        24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 18, 18, 30, 30, 30, 24, 24, 24,
        30, 30, 30, 30, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 30,
        30, 30, 18, 24, 24, 24, 24, 18, 18, 18, 18, 30, 30, 30, 24, 24, 24, 24,
        24, 24, 24, 24, 24, 24, 24, 24, 24, 18, 30, 30, 30, 24, 24, 18, 30, 30,
        30, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 18, 30, 30, 18,
        18, 24, 24, 24, 18, 18, 18, 18, 30, 30, 24, 24, 24, 24, 18, 18, 30, 30,
        30, 30, 30, 30, 30, 18, 18, 18, 30, 30, 30, 30, 30, 30, 30, 30, 18, 24,
        18, 24, 30, 30, 18, 18, 18, 24, 24, 18, 30, 30, 30, 24, 24, 24, 24, 24,
        30, 30, 30, 30, 24, 24, 30, 30, 24, 18, 21, 30, 30, 30, 30, 30, 30, 30,
        30, 30, 30, 30, 18, 18, 18, 18, 24, 30, 18, 24, 30, 24, 24, 24, 30, 30,
        30, 30, 30, 24, 24, 24, 24, 24, 24, 24, 18, 18, 21, 24, 24, 24, 24, 24,
        24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 30, 30, 30,
        30, 30, 30, 30, 18, 18
]:
    env.step(action)

env.seed(58)
env.floor(10)
env.reset()
for action in [
        18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 30, 30, 30, 30, 18, 18, 18,
        18, 30, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 24, 24, 24, 18,
        18, 18, 18, 18, 18, 18, 18, 30, 30, 30, 24, 30, 30, 30, 30, 30, 30, 30,
        24, 24, 30, 30, 30, 30, 24, 24, 24, 24, 30, 30, 30, 30, 30, 30, 30, 24,
        18, 18, 18, 18, 18, 24, 21, 18, 30, 30, 24, 18, 18, 18, 30, 30, 30, 30,
        30, 30, 30, 30, 24, 24, 24, 30, 30, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 30, 24, 24, 24, 24, 18, 18, 18, 18, 18, 24, 18, 18, 24, 18, 18,
        18, 18, 18, 18, 24, 18, 18, 18, 18, 24, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 24, 24, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 6, 6, 6, 6, 6, 6,
コード例 #22
0
import numpy as np
from PIL import Image

from obstacle_tower_env import ObstacleTowerEnv


class WrappedObstacleTowerEnv():

    def __init__(
        self,
        environment_filename=None,
        docker_training=False,
        worker_id=0,
        retro=False,
        timeout_wait=3000,
        realtime_mode=False,
        num_actions=3,
        stack_size=4,
        mobilenet=False,
        gray_scale=False,
        floor=0,
        visual_theme=0
        ):
        '''
        Arguments:
          environment_filename: The file path to the Unity executable.  Does not require the extension.
          docker_training: Whether this is running within a docker environment and should use a virtual
            frame buffer (xvfb).
          worker_id: The index of the worker in the case where multiple environments are running.  Each
            environment reserves port (5005 + worker_id) for communication with the Unity executable.
          retro: Resize visual observation to 84x84 (int8) and flattens action space.
          timeout_wait: Time for python interface to wait for environment to connect.
          realtime_mode: Whether to render the environment window image and run environment at realtime.
        '''

        self._obstacle_tower_env = ObstacleTowerEnv(environment_filename,
                                                    docker_training,
                                                    worker_id,
                                                    retro,
                                                    timeout_wait,
                                                    realtime_mode)
        if floor != 0:
            self._obstacle_tower_env.floor(floor)
        self.start_floor = floor
        self.current_floor = floor

        # mobilenet is expected to be falsy or a callable feature extractor
        # (e.g. a pretrained MobileNet) that maps a 224x224 image to a
        # 1280-dimensional embedding; see mobile_preprocess_observation().
        self.mobilenet = mobilenet
        self.gray_scale = gray_scale
        self.retro = retro
        if mobilenet:
            self.state_size = [1280]
        elif gray_scale:
            self.state_size = [84, 84, 1]
        elif retro:
            self.state_size = [84, 84, 3]
        else:
            self.state_size = [168, 168, 3]

        self.stack_size = stack_size
        self.stack = [np.random.random(self.state_size).astype(np.float32) for _ in range(self.stack_size)]
        self.total_reward = 0
        self.current_reward = 0
        self.max_floor = 25
        self.visual_theme = visual_theme

        self.id = worker_id

    def gray_preprocess_observation(self, observation):
        '''
        Re-sizes obs to 84x84 and compresses to grayscale
        '''
        observation = (observation * 255).astype(np.uint8)
        obs_image = Image.fromarray(observation)
        obs_image = obs_image.resize((84, 84), Image.NEAREST)
        gray_observation = np.mean(np.array(obs_image),axis=-1,keepdims=True)
        return gray_observation / 255

    def mobile_preprocess_observation(self, observation):
        """
        Re-sizes obs to 224x224 for mobilenet
        """
        observation = (observation * 255).astype(np.uint8)
        obs_image = Image.fromarray(observation)
        obs_image = obs_image.resize((224, 224), Image.NEAREST)
        return self.mobilenet(np.array(obs_image))

    def reset(self):
        # Reset env, stack and floor
        # (We save state as an attribute so child objects can access it)
        config = {"total-floors": 15}
        self.state = self._obstacle_tower_env.reset(config)
        self.state, reward, done, info = self._obstacle_tower_env.step(18)
        self.current_floor = self.start_floor
        self.stack = [np.random.random(self.state_size).astype(np.float32) for _ in range(self.stack_size)]
        self.total_reward = 0
        self.current_reward = 0

        # Preprocess current obs and add to stack
        if self.retro:
            observation = (self.state / 255).astype(np.float32)
        else:
            observation, key, time = self.state

        if self.mobilenet:
            observation = self.mobile_preprocess_observation(observation)
        elif self.gray_scale:
            observation = self.gray_preprocess_observation(observation)

        self.stack = self.stack[1:] + [observation]

        # Build our state (MUST BE A TUPLE)
        #one_hot_floor = tf.one_hot(self.current_floor, self.max_floor).numpy()
        one_hot_floor = np.zeros(self.max_floor)
        one_hot_floor[self.current_floor] += 1
        floor_data = np.append(one_hot_floor, self.current_reward).astype(np.float32)
        stacked_state = np.concatenate(self.stack, axis=-1).astype(np.float32)
        if self.retro is True:
            ret_state = (stacked_state, floor_data)
        else:
            # Clip time to 2000, then normalize
            time = (2000. if time > 2000 else time) / 2000.
            key_time_data = np.array([key, time]).astype(np.float32)
            #key_time_data = np.array([key]).astype(np.float32)
            ret_state = (stacked_state, floor_data, key_time_data)

        return ret_state, info

    def step(self, action):
        # Convert int action to vector required by the env
        if self.retro:
            if action == 0: # forward
                action = 18
            elif action == 1: # rotate camera left
                action = 24
            elif action == 2: # rotate camera right
                action = 30
            elif action == 3: # jump forward
                action = 21
            elif action == 4:  # rotate camera left in place (assumed mapping)
                action = 6
            elif action == 5:  # rotate camera right in place (assumed mapping)
                action = 12
        else:
            if action == 0: # forward
                action = [1, 0, 0, 0]
            elif action == 1: # rotate camera left
                action = [1, 1, 0, 0]
            elif action == 2: # rotate camera right
                action = [1, 2, 0, 0]
            elif action == 3: # jump forward
                action = [1, 0, 1, 0]

        # Take the step and record data
        # (We save state as an attribute so child objects can access it)
        self.state, reward, done, info = self._obstacle_tower_env.step(action)

        # Keep track of current floor reward and total reward
        if reward >= 0.95:
            self.current_floor += 1
            self.current_reward = 0
            done = True
        else:
            self.current_reward += reward
        self.total_reward += reward
        
        if (done and reward < 0.95) or self.current_floor == 15:
            # Save info and reset when an episode ends
            info["episode_info"] = {"floor": self.current_floor, "total_reward": self.total_reward}
            ret_state, _ = self.reset()
        else:
            # Preprocess current obs and add to stack
            if self.retro:
                observation = (self.state / 255).astype(np.float32)
            else:
                observation, key, time = self.state

            if self.mobilenet:
                observation = self.mobile_preprocess_observation(observation)
            elif self.gray_scale:
                observation = self.gray_preprocess_observation(observation)

            self.stack = self.stack[1:] + [observation]

            # Build our state (MUST BE A TUPLE)
            #one_hot_floor = tf.one_hot(self.current_floor, self.max_floor).numpy()
            one_hot_floor = np.zeros(self.max_floor)
            one_hot_floor[self.current_floor] += 1
            floor_data = np.append(one_hot_floor, self.current_reward).astype(np.float32)
            stacked_state = np.concatenate(self.stack, axis=-1).astype(np.float32)
            if self.retro is True:
                ret_state = (stacked_state, floor_data)
            else:
                # Clip time to 2000, then normalize
                time = (2000. if time > 2000 else time) / 2000.
                key_time_data = np.array([key, time]).astype(np.float32)
                #key_time_data = np.array([key]).astype(np.float32)
                ret_state = (stacked_state, floor_data, key_time_data)

        return ret_state, reward, done, info

    def close(self):
        self._obstacle_tower_env.close()

    def floor(self, floor):
        self._obstacle_tower_env.floor(floor)
        self.start_floor = floor
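
A short usage sketch for this wrapper (not part of the original example; the executable path, worker id, and the random action choice are placeholders standing in for a real agent):

import numpy as np

# Assumes WrappedObstacleTowerEnv as defined above and a local build of the
# Obstacle Tower executable at the given (placeholder) path.
env = WrappedObstacleTowerEnv(environment_filename='./ObstacleTower/obstacletower',
                              worker_id=0,
                              retro=False,
                              realtime_mode=False,
                              num_actions=4,
                              stack_size=4)

state, info = env.reset()
stacked_obs, floor_data, key_time_data = state  # tuple built by reset()/step()

for _ in range(100):
    action = np.random.randint(0, 4)  # 0: forward, 1: camera left, 2: camera right, 3: jump
    state, reward, done, info = env.step(action)
    if 'episode_info' in info:
        # The wrapper resets itself and reports the episode summary here.
        print('episode finished:', info['episode_info'])

env.close()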