def getMove(self):
    # Search from the current state; the perspective depends on whose turn it is.
    if self.isBlueBan() or self.isBluePick():
        return self.mcts(StateNode(parent=None, state=self),
                         teamIsBlue=True, n=self.rolloutNumber)
    return self.mcts(StateNode(parent=None, state=self),
                     teamIsBlue=False, n=self.rolloutNumber)
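# getMove above delegates to a self.mcts method that is not shown. The sketch
# below is a hypothetical stand-in, assuming it follows the
# MCTS(tree_policy, default_policy, backup) pattern used in the other
# snippets; the exploration constant and policies here are assumptions, not
# the project's actual choices.
def mcts(self, root, teamIsBlue, n):
    search = MCTS(tree_policy=UCB1(c=1.41),
                  default_policy=immediate_reward,
                  backup=monte_carlo)
    # teamIsBlue would presumably be consumed by the state's reward function,
    # so that rollouts are scored from the searching team's perspective.
    return search(root, n=n)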
import pickle

import numpy as np


def run_experiment(intrinsic_motivation, gamma, c, mc_n, runs, steps):
    # draw_goal, gen_name, the state module, StateNode and mcts_search are
    # provided by the surrounding project.
    trajectories = []
    start = np.array([50, 50])
    true_belief = True
    for _ in range(runs):
        goal = draw_goal(start, 6)
        manual = draw_goal(start, 3)
        print("Goal: {}".format(goal))
        print("Manual: {}".format(manual))
        world = state.ToyWorld([100, 100], intrinsic_motivation, goal, manual)
        belief = None
        if true_belief:
            # Uniform prior: a pseudo-count of 10 per outcome for each of the
            # four moves.
            belief = dict(zip(
                [state.ToyWorldAction(np.array([0, 1])),
                 state.ToyWorldAction(np.array([0, -1])),
                 state.ToyWorldAction(np.array([1, 0])),
                 state.ToyWorldAction(np.array([-1, 0]))],
                [[10, 10, 10, 10], [10, 10, 10, 10],
                 [10, 10, 10, 10], [10, 10, 10, 10]]))
        root_state = state.ToyWorldState(start, world, belief=belief)
        print(root_state.pos)

        next_state = StateNode(None, root_state, 0)
        trajectory = []
        for _ in range(steps):
            try:
                ba = mcts_search(next_state, gamma, c=c, n=mc_n)
                print("")
                print("=" * 80)
                print("State: {}".format(next_state.state))
                print("Belief: {}".format(next_state.state.belief))
                print("Reward: {}".format(next_state.reward))
                print("N: {}".format(next_state.n))
                print("Q: {}".format(next_state.q))
                print("Action: {}".format(ba.action))
                trajectory.append(next_state.state.pos)
                if (next_state.state.pos == np.array(goal)).all():
                    break
                # Step the real world and re-root the tree on the sampled child.
                next_state = next_state.children[ba].sample_state(real_world=True)
                next_state.parent = None
            except KeyboardInterrupt:
                break
        trajectories.append(trajectory)

    # pickle needs a binary file handle, so "wb" rather than "w".
    with open(gen_name("trajectories", "pkl"), "wb") as f:
        pickle.dump(trajectories, f)
    print("=" * 80)
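# run_experiment saves results through a gen_name helper that is not shown.
# A minimal hypothetical stand-in, assuming it simply builds a unique filename
# from a prefix and an extension:
import time

def gen_name(prefix, ext):
    # e.g. gen_name("trajectories", "pkl") -> "trajectories_1700000000.pkl"
    return "{}_{}.{}".format(prefix, int(time.time()), ext)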
def test_ucb1():
    ucb1 = tree_policies.UCB1(1)
    parent = StateNode(None, UCBTestState())
    an = parent.children[0]

    # A visited child with q == 0 and a visited parent scores 0.
    an.n = 1
    parent.n = 1
    assert ucb1(an) == 0

    # An unvisited child is undefined (nan).
    an.n = 0
    parent.n = 1
    assert np.isnan(ucb1(an))

    # So is any child of an unvisited parent.
    an.n = 1
    parent.n = 0
    assert np.isnan(ucb1(an))

    # With the exploration bonus at 0, the score reduces to q.
    an.q = 1
    an.n = 1
    parent.n = 1
    assert ucb1(an) == 1
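# The assertions above are consistent with the standard UCB1 score
# q + c * sqrt(2 * ln(N) / n), where N is the parent's visit count and n the
# child's. A minimal sketch of that formula (illustrative names, not the
# library's internals):
import numpy as np

def ucb1_value(q, n, N, c=1.0):
    # Under numpy, log(0) gives -inf and 0/0 gives nan, which is why the test
    # expects nan whenever either visit count is zero.
    return q + c * np.sqrt(2 * np.log(N) / n)

print(ucb1_value(q=0.0, n=1, N=1))  # 0.0, matching the first assertion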
def Search(self, players, deck, turn):
    # Exploration constant c = sqrt(2), the classic UCB1 choice.
    tree_policy = tree_policies.UCB1(np.sqrt(2))
    # Truncated rollouts: 50 random steps per simulation instead of playing
    # each game of Go Fish out to a terminal state.
    # default_policy = default_policies.random_terminal_roll_out
    default_policy = default_policies.RandomKStepRollOut(50)
    backup = backups.monte_carlo

    current_state = self.set_current_state(deck)
    state = action_and_state.GOFishState(current_state)
    root_node = StateNode(None, state)

    mcts_run = mcts.MCTS(tree_policy, default_policy, backup)
    action = mcts_run(root_node, n=800)
    return {'requestedPlayer': players[action[0]], 'card': action[1]}
    pos[movei[2], movei[3]] = pos[movei[0], movei[1]]
    pos[movei[0], movei[1]] = 0
else:
    raise ValueError("error")
print(np.flipud(pos))

mcts = MCTS(tree_policy=Go(c=5),
            default_policy=RandomKStepRollOut_Value(20, 0.95),
            backup=monte_carlo)
policy_fun = policy_nn()
rollout_fun = rollout_nn()
value_fun = value_nn()
root = StateNode(None, ChessState(pos, 1, policy_fun, rollout_fun,
                                  value_fun, False))
best_action = mcts(root, n=500)

# Dump cProfile statistics for the 500-iteration search.
pr.disable()
s = io.StringIO()
ps = pstats.Stats(pr, stream=s).sort_stats('cumulative')
ps.print_stats()
print(s.getvalue())

print(best_action.action)
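# RandomKStepRollOut_Value(20, 0.95) above is a project-specific default
# policy. The sketch below shows a plain k-step, gamma-discounted random
# rollout against the perform/is_terminal/actions state interface visible in
# these snippets; it is an assumption about the core of that policy and omits
# the value-network bootstrap the _Value suffix implies. The reward(parent)
# signature is likewise assumed.
import random

def random_k_step_rollout(state_node, k=20, gamma=0.95):
    parent = state_node.state
    total = 0.0
    for step in range(k):
        if parent.is_terminal():
            break
        child = parent.perform(random.choice(parent.actions))
        total += (gamma ** step) * child.reward(parent)
        parent = child
    return total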
def run_experiment(intrinsic_motivation, gamma, c, mc_n, runs, steps, problem):
    st1 = time.time()
    start = np.array([50, 50])
    true_belief = True
    # c = 1.41 ~ sqrt(2); rollouts score states by their immediate reward only.
    mcts_search = MCTS(tree_policy=UCB1(c=1.41),
                       default_policy=immediate_reward,
                       backup=monte_carlo)
    rewards = []
    for r in range(runs):
        sta = time.time()
        print("RUN number", r)
        goal = draw_goal(start, 6)
        world = PaintingWorld((100, 100), False, (100, 100), problem)
        root_state = PaintingWorldState((0, 0), (1, 1, 1), world)
        if true_belief:
            # Uniform prior: a pseudo-count of 1 per action outcome.
            belief = {}
            for action in root_state.actions:
                belief[action] = [1] * len(root_state.actions)
            root_state.belief = belief

        next_state = StateNode(None, root_state)
        rew = 0
        for step in range(steps):
            st = time.time()
            ba = mcts_search(next_state, n=mc_n)
            rew = next_state.reward
            if (next_state.state.pos == np.array(goal)).all():
                break
            # Step the real world and re-root the tree on the sampled child.
            next_state = next_state.children[ba].sample_state(real_world=True)
            next_state.parent = None
            en = time.time()
            print("step", step, "time elapsed", en - st)
            # Stop early once the reward is good enough.
            if step >= 5 and rew > 0.5:
                break
        rewards.append(rew)
        end = time.time()
        print("run", r, "time elapsed", end - sta)

    w = max(rewards)
    print("REWARD", w)
    end1 = time.time()
    print("problem time elapsed", end1 - st1)
    return w
print("Video Sizes: %r" % (problem.videoSizes,)) print("Endpoints:\n\t%s" % ("\n\t".join([str(e) for e in problem.endpoints]))) print("Requests: %r" % ([r for r in problem.requests])) # Generate initial state initial_contents = list([(0, []) for _ in range(problem.nCaches)]) initial_score = 0 initial_state = TreeState(caches_contents=initial_contents, score=initial_score, problem=problem) # Generate the optimal end state mcts = MCTS(tree_policy=UCB1(c=1.41), default_policy=immediate_reward, backup=monte_carlo) node = StateNode(parent=None, state=initial_state) while True: if node.state.is_terminal(): print("Terminal node reached.") break print("Finding best action") best_action = mcts(node) print("Performing action") node = StateNode(parent=None, state=node.state.perform(best_action)) print("Score now is: %d" % node.state.score) print("Saving output") print(node.state.caches_contents) contents = node.state.caches_contents