def summarize(agent, env, settings: Dict[str, Any], seeds: List[int], writer: SummaryWriter):
    rewards = [0] * len(seeds)
    for i, seed in enumerate(seeds):
        if seed is not None:
            env.seed(seed)
        state = env.reset()
        goal_point = find(state, 'Goal')
        option = Option(goal_point, depth=0)
        agent.reset(env, option, random_seed=3)
        done = False
        while not done:
            action = agent.act(state, option)
            state, reward, done, _ = env.step(action)
            rewards[i] += reward
    writer.add_hparams(
        {key: value for (key, value) in settings.items() if key not in ['device']},
        {
            'average reward': np.mean(rewards),
            'min reward': np.min(rewards),
            'max reward': np.max(rewards)
        })
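# Illustrative only: one way summarize() might be invoked after training.
# It assumes `agent`, `env`, and `settings` are already constructed; the log
# directory, seed list, and the torch.utils.tensorboard import are assumptions,
# not values taken from this repo.
from torch.utils.tensorboard import SummaryWriter

eval_writer = SummaryWriter(log_dir='runs/smart_eval')  # hypothetical log dir
summarize(agent, env, settings, seeds=[0, 1, 2], writer=eval_writer)
eval_writer.close()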
def visualize(agent: SMARTAgent, ep: int, ts: int):
    images = []
    for seed in seeds:
        if seed is not None:
            env.seed(seed)
        state = env.reset()
        goal_point = find(state, 'Goal')
        option = Option(goal_point, depth=0)
        agent.reset(env, option, random_seed=3)
        visualize_decision(agent, state, writer, f'likelihoods: {seed}', ep, ts)
        images.append(env.render('rgb_array'))
        done = False
        while not done:
            action = agent.act(state, option)
            state, reward, done, _ = env.step(action)
            options = _get_option_tree_(agent)
            print(f"@{onehot2directedpoint(state)} : {reward} => {options}")
            rendered = _render_options_(env.render('rgb_array'), options)
            images.append(rendered)
    gif = np.stack(images, 0)  # np.ndarray [t, imx, imy, 3]
    gif_tensor: torch.Tensor = torch.from_numpy(gif).type(torch.uint8).unsqueeze(0)  # torch.Tensor[uint8] [1, t, imx, imy, 3]
    gif_tensor = gif_tensor.permute(0, 1, 4, 2, 3)  # [1, t, 3, imx, imy], the channels-first layout add_video expects
    writer.add_video('sample trajectory', gif_tensor, global_step=ts)
def generate(self, state: OneHotImg, prev_option: Optional[Option[Point]],
             parent_option: Option[Point]) -> List[Option[Point]]:
    xdim: int = state.shape[0]
    ydim: int = state.shape[1]
    depth: int = parent_option.depth
    if prev_option is not None:
        depth = max(prev_option.depth, depth)
    child_depth: int = depth + 1
    result: List[Option[Point]] = []
    # not quite right, but haven't figured out good solution
    for x in range(xdim):
        for y in range(ydim):
            point: Point = np.asarray([x, y], dtype=np.int8)
            if tile_type(state, point) in ['Empty', 'Goal']:
                result.append(Option(point, child_depth))
    return result
def train(agent, env, settings, testfn=None, vizfn=None, savefn=None):
    seeds = _get_seeds_(settings)
    ts = 0
    test_after_episode = False
    viz_after_episode = False
    for ep in range(settings['N_EPISODES']):
        env.seed(next(seeds))
        state = env.reset()
        goal_point = find(state, 'Goal')
        option = Option(goal_point, depth=0)
        agent.reset(env, option, random_seed=3)
        done = False
        while not done:
            action = agent.act(state, option)
            state, reward, done, _ = env.step(action)
            agent.view(Transition(state, action, reward))
            ts += 1
            # defer testing/visualization to the end of the episode so the
            # current rollout is never interrupted
            if settings['TEST_FREQ'] is not None and ts % settings['TEST_FREQ'] == 0:
                test_after_episode = True
            if settings['VIZ_FREQ'] is not None and ts % settings['VIZ_FREQ'] == 0:
                viz_after_episode = True
        agent.optimize()
        if test_after_episode:
            testfn(agent, ep, ts)
            test_after_episode = False
        if viz_after_episode:
            vizfn(agent, ep, ts)
            viz_after_episode = False
        if savefn is not None:
            savefn(agent)
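# A minimal sketch of wiring train() to the test/visualize callbacks defined
# alongside it. The concrete values, the 'random' key, and the save lambda are
# illustrative assumptions; _get_seeds_ may expect different settings keys.
example_settings = {
    'N_EPISODES': 100,   # episodes to train for
    'TEST_FREQ': 1000,   # call testfn every 1000 env steps (None disables)
    'VIZ_FREQ': 5000,    # call vizfn every 5000 env steps (None disables)
    'random': 42,        # seed consumed when building the episode seed stream
}
train(agent, env, example_settings,
      testfn=test, vizfn=visualize,
      savefn=lambda a: torch.save(a, 'smart_agent.pt'))  # hypothetical checkpoint hook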
def test(agent: SMARTAgent, ep, ts):
    rewards = [0] * len(seeds)
    for i, seed in enumerate(seeds):
        env.seed(seed)
        state = env.reset()
        goal_point = find(state, 'Goal')
        option = Option(goal_point, depth=0)
        agent.reset(env, option, random_seed=3)
        done = False
        while not done:
            action = agent.act(state, option)
            state, reward, done, info = env.step(action)
            rewards[i] += reward  # index by position, not by seed value
    for i, seed in enumerate(seeds):
        writer.add_scalar(f"Test Reward: {seed}", rewards[i], global_step=ts)
def generate(self, state: State, option: Option[OptionData]) -> List[Option[OptionData]]:
    # every empty tile in the environment is a candidate sub-goal,
    # one level deeper in the option tree than its parent
    possibilities = self.env._all_tiles_of_type("Empty")
    return [Option(point, option.depth + 1) for point in possibilities]
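# Sketch of the call shape for a generator of this kind; `current_option` and
# the `generator` instance are assumed to exist and are illustrative names only.
children = generator.generate(state, option=current_option)
# every child option sits one level deeper in the option tree than its parent
assert all(child.depth == current_option.depth + 1 for child in children)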
def get_option_tree(agent):
    # Walk from the agent's current option node up to the root, collecting the
    # value of every node that was entered from its left child (plus the
    # current node itself).
    result = []
    option_node = agent.current_option_node
    prev_option = None
    while option_node is not None:
        if prev_option is None or option_node.left == prev_option:
            result.append(option_node.value)
        prev_option = option_node
        option_node = option_node.parent
    return result


# create the figure before any imshow call so every frame is attached to the
# figure that ArtistAnimation will animate
fig = plt.figure()
images = []
for _ in range(N_EPISODES):
    state = env.reset()
    goal_point = find(state, 'Goal')
    option = Option(goal_point, depth=0)
    agent.reset(env, option, random_seed=3)
    images.append([plt.imshow(env.render('rgb_array'), animated=True)])
    done = False
    while not done:
        action = agent.act(state, option)
        state, reward, done, _ = env.step(action)
        options = get_option_tree(agent)
        print(f"@{onehot2directedpoint(state)} : {reward} => {options}")
        rendered = visualize(env.render('rgb_array'), options)
        images.append([plt.imshow(rendered, animated=True)])

ani = animation.ArtistAnimation(fig, images, interval=200, blit=True)  # interval in ms
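# Persisting the animation is optional; the filename, writer backend, and fps
# below are illustrative choices, not values taken from the repo.
ani.save('sample_trajectory.gif', writer='pillow', fps=5)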
    random_seed=settings['random'])
agent: SMARTAgent = SMARTAgent(evaluator, generator, planning_terminator,
                               policy_terminator, low_level, memory, settings)
step: int = 0
images = []
for seed in [0] * 500:
    env = MazeWorld(cache._get_cached_board(seed))
    total_reward: int = 0
    t: int = 0
    done: bool = False
    state, goal = env.reset(3)
    goal = Option(goal, 0)
    states: List[Transition] = []  # transition history for this episode
    agent.reset(env, goal)
    while not done:
        print('step')
        action: Action = agent.act(state)
        new_state, reward, done, info = env.step(action)
        total_reward += reward
        states.append(Transition(state, action, reward, new_state))
        state = new_state
        agent.optimize(step)
        t += 1
        step += 1


def render(env: MazeWorld, state: State):