Example 1
def run_experiment(intrinsic_motivation, gamma, c, mc_n, runs, steps):
    trajectories = []
    start = np.array([50, 50])
    true_belief = True

    for _ in range(runs):
        goal = draw_goal(start, 6)
        manual = draw_goal(start, 3)
        print("Goal: {}".format(goal))
        print("Manual: {}".format(manual))

        world = state.ToyWorld([100, 100], intrinsic_motivation, goal, manual)
        belief = None
        if true_belief:
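            # Identical prior counts [10, 10, 10, 10] for each of the four move actions.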
            belief = dict(
                zip(
                    [
                        state.ToyWorldAction(np.array([0, 1])),
                        state.ToyWorldAction(np.array([0, -1])),
                        state.ToyWorldAction(np.array([1, 0])),
                        state.ToyWorldAction(np.array([-1, 0])),
                    ],
                    [[10, 10, 10, 10], [10, 10, 10, 10], [10, 10, 10, 10], [10, 10, 10, 10]],
                )
            )
        root_state = state.ToyWorldState(start, world, belief=belief)
        print(root_state.pos)
        next_state = StateNode(None, root_state, 0)
        trajectory = []
        for _ in range(steps):
            try:
                ba = mcts_search(next_state, gamma, c=c, n=mc_n)
                print("")
                print("=" * 80)
                print("State: {}".format(next_state.state))
                print("Belief: {}".format(next_state.state.belief))
                print("Reward: {}".format(next_state.reward))
                print("N: {}".format(next_state.n))
                print("Q: {}".format(next_state.q))
                print("Action: {}".format(ba.action))
                trajectory.append(next_state.state.pos)
                if (next_state.state.pos == np.array(goal)).all():
                    break
                next_s = next_state.children[ba].sample_state(real_world=True)
                next_state = next_s
                next_state.parent = None
            except KeyboardInterrupt:
                break
        trajectories.append(trajectory)
        with open(gen_name("trajectories", "pkl"), "wb") as f:  # pickle requires binary mode
            pickle.dump(trajectories, f)
        print("=" * 80)
Example 2
def run_experiment(intrinsic_motivation, gamma, c, mc_n, runs, steps, problem):
    st1 = time.time()
    # trajectories = []
    start = np.array([50, 50])
    true_belief = True

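    # Build the planner: UCB1 tree policy (c=1.41), immediate-reward default policy, Monte Carlo backups.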
    mcts_search = MCTS(tree_policy=UCB1(c=1.41),
                       default_policy=immediate_reward,
                       backup=monte_carlo)

    rewards = []
    for r in range(runs):
        sta = time.time()
        print("RUN number", r)
        goal = draw_goal(start, 6)
        # manual = draw_goal(start, 3)
        # print("Goal: {}".format(goal))

        world = PaintingWorld((100, 100), False, (100, 100), problem)
        belief = None
        root_state = PaintingWorldState((0, 0), (1, 1, 1), world)
        if true_belief:
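            # Uniform prior: the same all-ones count vector for every available action.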
            belief = {}
            for action in root_state.actions:
                belief[action] = [1] * len(root_state.actions)
            root_state.belief = belief
        # print(root_state.pos)
        next_state = StateNode(None, root_state)
        # trajectory =[]
        rew = 0
        for step in range(steps):
            st = time.time()
            ba = mcts_search(next_state, n=mc_n)
            # print("=" * 80)
            # print("State: {}".format(next_state.state))
            # # print("Belief: {}".format(next_state.state.belief))
            # print("Reward: {}".format(next_state.reward))
            # print("N: {}".format(next_state.n))
            # print("Q: {}".format(next_state.q))
            # print("Action: {}".format(ba.action))
            # trajectory.append(next_state.state.pos)
            rew = next_state.reward
            if (next_state.state.pos == np.array(goal)).all():
                break
            next_s = next_state.children[ba].sample_state(real_world=True)
            next_state = next_s
            next_state.parent = None

            en = time.time()
            print("step", step, "time elapsed", en - st)

            if step >= 5 and rew > 0.5:
                break

            # except KeyboardInterrupt:
            #     break
        # trajectories.append(trajectory)
        # print (next_state.reward)
        rewards.append(rew)

        # with open(gen_name("trajectories", "pkl"), "w") as f:
        #     pickle.dump(trajectories, f)
        # print("=" * 80)
        end = time.time()
        print("run", r, "time elapsed", end - sta)
        # if rewards[-1] > 0:
        #     break
    w = max(rewards)
    print("REWARD", w)
    end1 = time.time()
    print("problem time elapsed", end1 - st1)
    return w
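For reference, a minimal sketch of how this variant might be invoked; the hyperparameter values and the integer problem identifiers below are illustrative assumptions only:

# Hypothetical driver: sweep a few problems and collect the best reward of each.
if __name__ == "__main__":
    best_rewards = {}
    for problem in range(3):  # assumed problem identifiers
        best_rewards[problem] = run_experiment(
            intrinsic_motivation=False, gamma=0.99, c=1.41,
            mc_n=100, runs=5, steps=50, problem=problem)
    print(best_rewards)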