def test_policy_iteration(self):
    """Run the policy-iteration planner on the sample grid and print the plan."""
    grid = self.get_sample_grid()
    planner = PolicyIterationPlanner(Environment(grid))
    plan_result = planner.plan()
    print("Policy Iteration")
    for row in plan_result:
        print(row)
def test_value_iteration(self):
    """Run the value-iteration planner on the sample grid and print the plan."""
    grid = self.get_sample_grid()
    planner = ValuteIterationPlanner(Environment(grid))
    plan_result = planner.plan()
    print("Value Iteration")
    for row in plan_result:
        print(row)
def test_run_environment(self):
    """Run 100 random-action episodes, checking the reset position and that
    every visited state stays inside the grid bounds."""
    env = Environment(self.get_sample_grid())
    n_rows = len(env.grid)
    n_cols = len(env.grid[0])
    for episode in range(100):
        state = env.reset()  # initialize agent position
        # Agent must start at the bottom-left cell after reset.
        self.assertEqual(state.row, n_rows - 1)
        self.assertEqual(state.column, 0)
        reached_goal = False
        for step in range(10):
            action = random.choice(env.action_space)
            state, reward, done = env.step(action)
            self.assertTrue(0 <= state.row < n_rows)
            self.assertTrue(0 <= state.column < n_cols)
            if done:
                print("Episode {}: get reward {}, {} timesteps".format(
                    episode, reward, step + 1))
                reached_goal = True
                break
        if not reached_goal:
            print("Episode {}: no reward".format(episode))
def post(self):
    """Plan a route for the posted grid and respond with the planner's log.

    Expects a JSON body with keys "grid" (the environment grid), "plan"
    ("value" or "policy"), and optionally "prob" (move probability,
    default 0.8). Writes {"log": [...]} on success, or a 400 error for
    an unknown plan type.
    """
    data = tornado.escape.json_decode(self.request.body)
    grid = data["grid"]
    plan_type = data["plan"]
    move_prob = 0.8  # default value
    try:
        move_prob = float(data["prob"])
    # BUG FIX: a missing "prob" key raises KeyError and a null value raises
    # TypeError — only ValueError was caught before, crashing the handler
    # instead of falling back to the default.
    except (KeyError, TypeError, ValueError):
        pass
    env = Environment(grid, move_prob=move_prob)
    if plan_type == "value":
        planner = ValuteIterationPlanner(env)
    elif plan_type == "policy":
        planner = PolicyIterationPlanner(env)
    else:
        # BUG FIX: previously an unknown plan type left `planner` unbound,
        # raising NameError; respond with an explicit client error instead.
        self.set_status(400)
        self.write({"error": "unknown plan type: {}".format(plan_type)})
        return
    result = planner.plan()
    planner.log.append(result)
    self.write({"log": planner.log})