def andrea_states():
    """
    Infer state probability grids from a fixed start state and plot each one.

    Builds a 20x20 grid world with default reward -1, runs
    `inf.state.infer_from_start` with `T=5`, `beta=1` and `all_steps=True`,
    and reshapes the result to `(T+1, rows, cols)`, i.e. one grid per index
    t = 0..T. The stacked grids are printed, then each grid is drawn as its
    own heat map with the goal passed as the starred cell.
    """
    horizon = 5
    side = 20
    world = GridWorldMDP(side, side, default_reward=-1)
    start_state = world.coor_to_state(0, 0)
    goal_state = world.coor_to_state(side - 1, side // 2)

    # Per the original notes, these are the exact state probabilities of a
    # beta-irrational agent making softmax action choices over hardmax
    # values. The reshape below fixes the layout to
    # (horizon + 1) x world.rows x world.cols.
    flat_probs = inf.state.infer_from_start(
        world,
        start_state,
        goal_state,
        T=horizon,
        beta=1,
        all_steps=True,
    )
    state_prob = flat_probs.reshape(horizon + 1, world.rows, world.cols)
    print(state_prob)

    # One heat map per grid.
    # Beware: the heat map's color scale changes with each plot.
    for step, grid in enumerate(state_prob):
        plot_heat_maps(
            world,
            start_state,
            [grid],
            ["t={}".format(step)],
            stars_grid=[goal_state],
            auto_logarithm=False,
        )
def experiment(self, N=10, iters=1000, mb_size=128, samples=10000):
    """
    Train and assess the model on synthetic action-prediction data.

    An N x N grid world is created with two goals, the cells (N-1, N-1) and
    (0, 0). Training and test sets are generated with
    `syn.gen_predict_actions` using `self.k`, `self.l` and `beta=1e-3`.

    Params:
        N: Side length of the square grid world.
        iters: Forwarded to `self.train_model` as `iters`.
        mb_size: Forwarded to `self.train_model` as `mb_size`.
        samples: Number of samples requested for the training set. The test
            set always requests 100 samples.

    Raises:
        AssertionError: If `self.G` differs from the number of goals (2).
    """
    g = GridWorldMDP(N, N)
    goals = [g.coor_to_state(N - 1, N - 1), g.coor_to_state(0, 0)]
    assert self.G == len(goals)

    data = syn.gen_predict_actions(g, goals, self.k, self.l,
                                   samples=samples, beta=1e-3)
    test_data = syn.gen_predict_actions(g, goals, self.k, self.l,
                                        samples=100, beta=1e-3)

    with self.graph.as_default():
        # Use the session as a context manager so it is closed (and its
        # resources released) even if training or assessment raises.
        # Previously the session was never closed.
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            self.train_model(sess, data, mb_size=mb_size, iters=iters,
                             test_data=test_data)
            self.assess_model(sess, test_data)
def test_no_crash(self):
    """
    Smoke test for `gen_predict_actions` on a 15x15 grid with three goals.

    Checks only that the call completes and that the dataset's reported
    size `N` matches the length of its `Y` sequence.
    """
    world = GridWorldMDP(15, 15)
    goal_coords = ((9, 9), (1, 1), (3, 3))
    goal_states = [world.coor_to_state(x, y) for x, y in goal_coords]

    dataset = gen_predict_actions(world, goals=goal_states, k=3, l=3)

    self.assertEqual(dataset.N, len(dataset.Y))
def test_no_crash2(self):
    """
    Smoke test for `gen_predict_policy` on a 15x15 grid with three goals.

    Checks that the dataset's reported size `N` matches the length of `Y`,
    that every entry of `Y` lies in `[0, g.A)`, and that every entry of `Z`
    lies in `[0, number of goals)`.
    """
    world = GridWorldMDP(15, 15)
    goal_coords = ((9, 9), (1, 1), (3, 3))
    goal_states = [world.coor_to_state(x, y) for x, y in goal_coords]

    dataset = gen_predict_policy(world, goals=goal_states, samples=30)

    self.assertEqual(dataset.N, len(dataset.Y))

    # Each label must be in range for world.A.
    for label in dataset.Y:
        self.assertTrue(0 <= label < world.A)

    # Each Z entry must be a valid index into the goal list.
    for goal_index in dataset.Z:
        self.assertTrue(0 <= goal_index < len(goal_states))
def benchmark(traj_mode="diag", mode="tri", T=2, N=90, R=-1):
    """
    Profile a single `inf.occupancy.infer` call with cProfile.

    The grid world, start state and destination list all come from
    `_occ_starter(N, R, mode)`. The trajectory comes from
    `_traj_starter(N, start, traj_mode)`, truncated to its first 50 entries.

    Params:
        traj_mode: Forwarded to `_traj_starter`.
        mode: Forwarded to `_occ_starter`.
        T: Forwarded to `inf.occupancy.infer` as `T`.
        N: Forwarded to both starter helpers.
        R: Forwarded to `_occ_starter`.

    Profiling statistics are printed by `cProfile.runctx`; nothing is
    returned.
    """
    import cProfile

    # The grid world comes from _occ_starter. A separate GridWorldMDP used
    # to be built here first and was discarded immediately, so it has been
    # removed.
    g, _, start, dest_list = _occ_starter(N, R, mode)
    traj = _traj_starter(N, start, traj_mode)[:50]

    def test():
        # The result is not needed; only the cost of the call is of interest.
        inf.occupancy.infer(g, traj, dest_list, T=T, verbose=False)

    # One unprofiled run first (presumably a warm-up so one-time setup costs
    # do not show up in the profile), then the profiled run.
    test()
    cProfile.runctx('test()', globals(), locals())
def puddles_world(N=100, p=0.2, puddle_reward=-2):
    """
    Generate an N x N grid world with randomly placed puddle squares.

    Each cell (x, y) independently becomes a puddle with probability `p`
    and is then given reward `puddle_reward` in the `reward_dict` passed to
    `GridWorldMDP`.

    NOTE(review): the original docstring described the rewards as "scaled",
    -2 or -2*sqrt(2). Presumably GridWorldMDP scales diagonal moves by
    sqrt(2); confirm against GridWorldMDP.

    Params:
        N: Side length of the square grid world.
        p: Per-cell probability of being a puddle.
        puddle_reward: Reward assigned to each puddle cell. This parameter
            used to be ignored in favour of a hard-coded -2.

    Returns:
        The constructed GridWorldMDP.
    """
    # Iteration order (x outer, y inner) matches the original loops, so a
    # seeded `random` produces the same puddle layout as before.
    reward_dict = {
        (x, y): puddle_reward
        for x in range(N)
        for y in range(N)
        if random.random() < p
    }
    return GridWorldMDP(N, N, reward_dict=reward_dict)
def build_deterministic_dataset():
    """
    Build a fixed dataset of all nine (state, action) pairs in a 3x3
    gridworld where the goal is (2, 2).

    `X` holds the (x, y) coordinate pairs exactly as listed below (they are
    not converted to state indices), `Y` holds the paired actions, and `Z`
    repeats the state index of the goal cell (2, 2) once per pair.
    """
    world = GridWorldMDP(3, 3)
    act = world.Actions
    goal_state = world.coor_to_state(2, 2)

    # One (coordinate, action) entry for each of the nine cells.
    pairs = [
        ((0, 0), act.UP_RIGHT),
        ((1, 0), act.UP_RIGHT),
        ((2, 0), act.UP),
        ((0, 1), act.UP_RIGHT),
        ((1, 1), act.UP_RIGHT),
        ((2, 1), act.UP),
        ((0, 2), act.RIGHT),
        ((1, 2), act.RIGHT),
        ((2, 2), act.ABSORB),
    ]

    coords = [coordinate for coordinate, _ in pairs]
    actions = [action for _, action in pairs]

    X = np.array(coords)
    Y = np.array(actions)
    Z = np.array([goal_state] * 9)
    return Data(X, Y, Z, name="tiny deterministic")