    def demo(self, demo_count=50):
        # Record demonstration episodes and serialize them for later use by
        # q_learning_supervised.
        all_actions_taken = []
        for _ in range(demo_count):
            block_world = BlockWorld(self.states_x,
                                     self.states_y,
                                     self.blocks_count,
                                     1,
                                     record=False)
            goal = block_world.goal_config
            all_actions_taken.append({
                "goal": goal,
                "actions": block_world.run_environment()
            })
        RLTrainer.serialize_actions(all_actions_taken)
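    # Random-walk baseline: takes random actions (conditioned on the previous
    # action) until the environment reward reaches 0; PICK and DROP do not
    # modify the world in this loop.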
    def random_exploration2(self):
        episode_count = 2
        prev_action = None
        for ep in range(episode_count):
            block_world = BlockWorld(self.states_x,
                                     self.states_y,
                                     self.blocks_count,
                                     1,
                                     record=True)
            print("Goal: ", [
                COLORS_STR[i] for stack in block_world.goal_config
                for i in stack
            ])
            while block_world.get_reward() != 0:
                block_world.pre_render()
                action, block_id = self.get_random_action_from_prev_action(
                    prev_action)
                print("Action chosen :", action, block_id)
                if action != Action.DROP and action != Action.PICK:
                    block_world.move_block_by_action(action, block_id)

                prev_action = action, block_id
                block_world.render()
    def q_learning_supervised(self):
        gamma = 0.5
        alpha = 0.5

        q = defaultdict(lambda: defaultdict(lambda: 0))
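        # Q-table layout: q[state_tuple][(action, block_id)] -> estimated return.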
        demos = RLTrainer.deserialize_actions()

        for demo in demos:
            block_world = BlockWorld(self.states_x,
                                     self.states_y,
                                     self.blocks_count,
                                     1,
                                     record=False)
            block_world.create_goal(demo["goal"])
            for step in demo["actions"]:
                curr_state = BlockWorld.convert_state_dict_to_tuple(
                    step["state"])
                action, sel_id = BlockWorld.parse_action(step["action"])
                if action != Action.FINISHED:
                    # PICK names its block explicitly; other actions act on the
                    # block id stored in the last element of the state tuple.
                    block_id = sel_id if action == Action.PICK else curr_state[-1]
                    next_state = block_world.get_next_state_based_on_state_tuple(
                        curr_state, (action, block_id))
                    new_reward = block_world.get_reward_for_state(
                        next_state, block_world.goal_config)
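                    # Q-learning update on the demonstrated transition:
                    #   Q(s, a) <- (1 - alpha) * Q(s, a)
                    #              + alpha * (r + gamma * max_a' Q(s', a'))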
                    q_i = q[curr_state][(action, block_id)]
                    if len(q[next_state]) > 0:
                        max_q = max([
                            q[next_state][a_dash] for a_dash in q[next_state]
                        ])
                    else:
                        max_q = 0
                    q[curr_state][(action, block_id)] = (
                        (1 - alpha) * q_i) + (alpha *
                                              (new_reward + gamma * max_q))
        return q
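    # Random exploration with a one-step bootstrapped value estimate: each
    # visited (state, action) is scored as reward(state) plus gamma times the
    # best value among the actions allowed next.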
    def random_exploration(self):
        gamma = 0.1
        q = defaultdict(lambda: 0)
        episode_count = 2
        prev_action = None
        for ep in range(episode_count):
            block_world = BlockWorld(self.states_x,
                                     self.states_y,
                                     self.blocks_count,
                                     1,
                                     record=True)
            print("Goal: ", [
                COLORS_STR[i] for stack in block_world.goal_config
                for i in stack
            ])
            while block_world.get_reward() != 0:
                block_world.pre_render()
                state = block_world.get_state_as_tuple()
                action, block_id = self.get_random_action_from_prev_action(
                    prev_action)
                next_state = block_world.get_next_state_based_on_state_tuple(
                    state, (action, block_id))
                q_val = gamma * max([
                    q[next_state, b]
                    for b in self.get_allowed_actions_from_prev_action(
                        (action, block_id))
                ])
                q[(state, action)] = block_world.get_reward_for_state(
                    state, block_world.goal_config.tolist()) + q_val
                block_world.update_state_from_tuple(next_state)
                prev_action = action, block_id
                block_world.render()
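    # Baseline: episodes driven by get_next_action with a fixed high exploration
    # rate (nu = 0.9) and no Q-updates; counts how often the goal is reached
    # within 5000 iterations.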
    def q_learning_random(self):
        episode_count = 100
        success_count = 0
        nu = 0.9
        # q is required by get_next_action but never updated in this method, so
        # action selection is driven mostly by the high exploration rate.
        q = defaultdict(lambda: defaultdict(lambda: 0))

        for ep in range(episode_count):
            block_world = BlockWorld(self.states_x,
                                     self.states_y,
                                     self.blocks_count,
                                     1,
                                     record=False)
            if self.debug:
                print("Goal: %s" % [
                    COLORS_STR[i] for stack in block_world.goal_config
                    for i in stack
                ])
            for iteration in range(5000):
                block_world.pre_render()

                curr_state = block_world.get_state_as_tuple()
                action, block_id = self.get_next_action(curr_state, q, nu)

                next_state = block_world.get_next_state_based_on_state_tuple(
                    curr_state, (action, block_id))
                is_goal_next = block_world.get_reward_for_state(
                    next_state, block_world.goal_config) == 0
                if self.debug:
                    print("Current State: ", curr_state, is_goal_next)

                block_world.update_state_from_tuple(next_state)

                block_world.render()
                if is_goal_next:
                    print("Goal State Reached!!! in %d iterations" % iteration)
                    success_count += 1
                    break

                if iteration % 100 == 1:
                    print(ep, iteration)
                if self.debug: print(iteration)
        if self.debug: print("success_count: ", success_count)
    def q_learning(self,
                   q=None,
                   starting_nu=1.0,
                   decay_nu=True,
                   decay_rate=0.9995):
        gamma = 0.5
        alpha = 0.5
        episode_count = 100
        if not q:
            q = defaultdict(lambda: defaultdict(lambda: 0))
        success_count = 0

        for ep in range(episode_count):
            block_world = BlockWorld(self.states_x,
                                     self.states_y,
                                     self.blocks_count,
                                     1,
                                     record=False)
            nu = starting_nu
            print("Goal: ", [
                COLORS_STR[i] for stack in block_world.goal_config
                for i in stack
            ])
            for iteration in range(self.iteration_count):
                block_world.pre_render()

                curr_state = block_world.get_state_as_tuple()
                if self.debug: print("Current State: ", curr_state)
                action, block_id = self.get_next_action(curr_state, q, nu)
                if self.debug: print("Action: ", action, block_id)

                next_state = block_world.get_next_state_based_on_state_tuple(
                    curr_state, (action, block_id))
                new_reward = block_world.get_reward_for_state(
                    next_state, block_world.goal_config)
                if self.debug: print("next_state: ", next_state)
                if self.debug: print("new_reward: ", new_reward)

                q_i = q[curr_state][(action, block_id)]

                if len(q[next_state]) > 0:
                    max_q = max(
                        [q[next_state][a_dash] for a_dash in q[next_state]])
                else:
                    max_q = 0

                if self.debug: print("max_q:", max_q)

                q[curr_state][(action, block_id)] = (
                    (1 - alpha) * q_i) + (alpha * (new_reward + gamma * max_q))
                if self.debug: print("q:", q[curr_state][(action, block_id)])

                block_world.update_state_from_tuple(next_state)

                block_world.render()
                if new_reward == 1:
                    if self.debug:
                        print("Goal State Reached!!! in %d iterations" %
                              iteration)
                    success_count += 1
                    break

                if decay_nu and iteration > 50:
                    nu = decay_rate * nu

                if iteration % 100 == 1:
                    if self.debug:
                        print("EP[%d]It[%d]: Q[%d], nu:[%f]" %
                              (ep, iteration, len(q), nu))

        if self.debug: print("success_count: ", success_count)
    def q_learning_real(self, starting_nu=0.9, use_old=True):
        alpha = 0.5
        gamma = 0.5
        converged = False
        if use_old:
            q_old = RLTrainer.load_obj(FILE_NAME)
        else:
            q_old = {}
        # q_old = defaultdict(lambda: defaultdict(lambda: 0))
        # goal_config = [np.random.permutation(self.blocks_count).tolist()]
        nu = starting_nu
        block_world = BlockWorld(self.states_x,
                                 self.states_y,
                                 self.blocks_count,
                                 1,
                                 record=False)
        if self.debug:
            print("Goal: ", [
                COLORS_STR[i] for stack in block_world.goal_config
                for i in stack
            ])

        ever_seen_goal = False
        cnt = 0
        q = q_old.copy()
        while cnt < self.iteration_count:
            cnt += 1
            # while not converged:
            block_world.pre_render()
            curr_state = block_world.get_state_as_tuple_pramodith()
            if curr_state not in q:
                q[curr_state] = {}
            if self.debug: print("Current State: ", curr_state)
            action, block_id = self.get_next_action(curr_state, q, nu)
            if self.debug: print("Action: ", action, block_id)
            next_state = block_world.get_next_state_based_on_state_tuple(
                curr_state, (action, block_id))
            new_reward = block_world.get_reward_for_state(next_state)
            new_reward += block_world.get_reward_for_state_action_pramodith(
                curr_state, next_state)
            if new_reward != 0:
                if self.debug: print("next_state: ", next_state)
                if self.debug: print("new_reward: ", new_reward)

            ever_seen_goal = ever_seen_goal or new_reward == 1
            if (action, block_id) in q[curr_state]:
                q_sa = q[curr_state][(action, block_id)]
            else:
                q_sa = 0
                q[curr_state][(action, block_id)] = 0

            if next_state in q and len(q[next_state]) > 0:
                max_q_dash_s_dash_a_dash = max(
                    [q[next_state][a_dash] for a_dash in q[next_state]])
            else:
                max_q_dash_s_dash_a_dash = 0
            if self.debug: print("max_q:", max_q_dash_s_dash_a_dash)

            q[curr_state][(
                action,
                block_id)] += alpha * (new_reward + gamma *
                                       (max_q_dash_s_dash_a_dash) - q_sa)
            if self.debug: print("q:", q[curr_state][(action, block_id)])

            block_world.update_state_from_tuple_pramodith(next_state)

            if cnt > 3000 and cnt % 500 == 0 and nu > 0.05:
                if self.debug: print(cnt)
                nu -= 0.1
            # nu *= 0.9995

            block_world.render()

            converged = ever_seen_goal and q == q_old
            # q_old = q
            # time.sleep(0.1)
        pygame.display.quit()
        # self.test_q_learning_real(q)
        RLTrainer.save_obj(q, FILE_NAME)
        # with open ("Q\q_table1.json",'w') as f:
        #    json.dump(q,f,indent=5)
        if self.debug: print(q)
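    # Rolls out a learned Q-table with low exploration (nu = 0.05 by default) and
    # reports whether the goal configuration is reached within
    # self.iteration_count steps.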
    def test_q_learning_real(self, q_old, starting_nu=0.05):
        # goal_config = [np.random.permutation(self.blocks_count).tolist()]
        nu = starting_nu
        block_world = BlockWorld(self.states_x,
                                 self.states_y,
                                 self.blocks_count,
                                 1,
                                 record=False)
        cnt = 0
        while cnt < self.iteration_count:
            cnt += 1
            block_world.pre_render()

            curr_state = block_world.get_state_as_tuple_pramodith()
            curr_state_s = block_world.get_state_as_state()
            if self.debug:
                print(
                    "State:%s, Q[%s]: %s" %
                    (curr_state_s, curr_state, q_old.get(curr_state, "EMPTY")))
            action, block_id = self.get_next_action(curr_state, q_old, nu)
            if self.debug: print("Action: ", action, block_id)

            next_state = block_world.get_next_state_based_on_state_tuple(
                curr_state, (action, block_id))
            if curr_state_s.goal_reached():
                print("Converged in %d" % cnt)
                return True, cnt
            # if self.debug:
            #     print("q:", q_old.get(str(curr_state), None))
            block_world.update_state_from_tuple_pramodith(next_state)

            block_world.render()
            # time.sleep(0.1)
        return False, self.iteration_count
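
    # Sketch of a possible end-to-end pipeline (the RLTrainer constructor
    # arguments below are assumed, not taken from this file):
    #
    #   trainer = RLTrainer(states_x, states_y, blocks_count)  # hypothetical signature
    #   trainer.demo(demo_count=50)            # record and serialize demonstrations
    #   q = trainer.q_learning_supervised()    # seed a Q-table from the demos
    #   trainer.q_learning(q=q, starting_nu=1.0, decay_nu=True)
    #   trainer.q_learning_real(starting_nu=0.9, use_old=False)  # saves Q to FILE_NAME
    #   trainer.test_q_learning_real(RLTrainer.load_obj(FILE_NAME))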