Example #1
    def random_walk(self):

        print("Robot can perform following actions: {}".format(
            self.action_list))
        while True:

            if api.is_terminal_state(self.current_state):
                print "Goal Reached"
                break

            possible_actions = api.get_possible_actions(self.current_state)
            print("Possible actions in current state: {}".format(
                possible_actions))

            for action in possible_actions:
                print "Action {}".format(action)
                if action == "pick":  # try to pick book 1
                    action_params = {"book_name": "book_1"}
                elif action == "place":
                    action_params = {
                        "book_name": "book_1",
                        "bin_name": "trolly_2"
                    }
                else:
                    action_params = {}

                states = api.get_possible_states(self.current_state, action,
                                                 action_params)
                print "Possible states are:"
                for state in states:
                    next_state = states[state][0]
                    probability = states[state][1]
                    print(state)
                    print("State: ", next_state)
                    print("Probability: ", probability)
                    print("Reward: ", api.get_reward(
                        self.current_state, action, next_state))
                    print()

            idx = random.randint(0, len(possible_actions) - 1)
            chosen_action = possible_actions[idx]
            if chosen_action == "pick":  # try to pick book 1
                action_params = {"book_name": "book_1"}
            elif chosen_action == "place":
                action_params = {"book_name": "book_1", "bin_name": "trolly_2"}
            else:
                action_params = {}

            print "Executing action: {} with params: {}".format(
                chosen_action, action_params)

            success, next_state = api.execute_action(chosen_action,
                                                     action_params)
            if success == 1:
                print "Successfully executed"
            else:
                print "Action failed"

            self.current_state = next_state
            print "updated current state:"
            print self.current_state

            raw_input("\nPress Enter to continue execution...")
Example #2
    def task5(self, episodes):
        '''Run the simulation after training (greedy policy, no learning).'''
        q_values = {}
        # Your code here

        actions_json_file = '/action_config.json'

        with open(self.root_path + actions_json_file) as json_file:
            try:
                self.action_reference = json.load(json_file, parse_float=float)
            except (ValueError, KeyError, TypeError):
                print "JSON error"

        self.book_penalty = -100
        self.bump_penalty = -100

        # =============================================================================
        #         q tables initialized to zero
        # =============================================================================
        # (x, y, orientation, c1, c2, tbot_near, action); c1, c2 will be zero if available, and one if picked up

        q1 = np.load('q_table_r1_6.npy')
        q2 = np.load('q_table_r2_6.npy')
        # =============================================================================
        #       Create Agents
        # =============================================================================

        agent1_books = [1, 2, 3]
        agent2_books = [4, 5, 6]

        agent1 = Agent('robot1', q1, agent1_books, more_books=True)
        agent2 = Agent('robot2', q2, agent2_books, more_books=True)

        tbot_list = [agent1, agent2]

        R_cumulative = {agent1.name: [], agent2.name: []}

        # =============================================================================
        #       acting
        # =============================================================================

        tbot_active = agent1
        tbot_passive = agent2
        epsilon = 0  # greedy evaluation: no exploration
        # a single episode
        api.reset_world()
        R_cumulative_active = 0
        R_cumulative_passive = 0
        initial_state = api.get_current_state()
        current_state = initial_state

        state_active = tbot_active.dict_to_np_state(
            current_state, tbot_passive)  # active bot's state tuple
        state_passive = tbot_passive.dict_to_np_state(
            current_state, tbot_active)  # passive bot's state tuple
        #pdb.set_trace()

        while not api.is_terminal_state(current_state):

            through_api = True  # flag: route this action through the API; if False, the reward is assigned manually

            # =============================================================================
            #               active tbot acts and learns
            # =============================================================================
            #pick action for tbot_active
            #choose either random or exploit, according to epsilon=epsilon_calc(epsilon_initial, epsilon_decay, i)
            # =============================================================================
            #                 if state_active[0]>=5 or state_passive[0]>=5:
            #                     pdb.set_trace()
            # =============================================================================
            action_A, action_items_A, action_params_A, action_string_A = self.choose_action(
                tbot_active, epsilon, state_active)  #selects action

            through_api, next_state, reward = self.reward_prune(
                current_state, state_active, state_passive, action_A,
                action_items_A, tbot_active, tbot_passive
            )  #prunes by checking for invalid actions, in which case we don't run through environment_api
            #pdb.set_trace()
            if through_api:
                success, next_state = api.execute_action(
                    action_A, action_params_A, tbot_active.name)
                reward = api.get_reward(current_state, action_A, next_state)

            R_cumulative_active += reward

            next_state_active = tbot_active.dict_to_np_state(
                next_state, tbot_passive)

            # state_action_idx kept from the training loop; no Q update happens here (evaluation only)

            state_action_idx = tuple(state_active) + tuple(
                [tbot_active.idx_to_action.index(action_string_A)])

            current_state = next_state  # update current state so the other tbot knows the updated state before choosing an action

            state_active = tbot_active.dict_to_np_state(
                current_state, tbot_passive)  # active bot's state tuple
            state_passive = tbot_passive.dict_to_np_state(
                current_state, tbot_active)  # passive bot's state tuple

            # =============================================================================
            #               passive tbot acts and does NOT learn
            # =============================================================================
            action_P, action_items_P, action_params_P, action_string_P = self.choose_action(
                tbot_passive, epsilon, state_passive)

            through_api, next_state, reward = self.reward_prune(
                current_state, state_passive, state_active, action_P,
                action_items_P, tbot_passive,
                tbot_active)  #reward won't be used

            if through_api:
                success, next_state = api.execute_action(
                    action_P, action_params_P, tbot_passive.name)
                reward = api.get_reward(current_state, action_P, next_state)

            R_cumulative_passive += reward
            current_state = next_state  # update current state for the active tbot

            state_active = tbot_active.dict_to_np_state(
                current_state, tbot_passive)  # active bot's state tuple
            state_passive = tbot_passive.dict_to_np_state(
                current_state, tbot_active)  # passive bot's state tuple

        R_cumulative[tbot_active.name].append(R_cumulative_active)
        R_cumulative[tbot_passive.name].append(R_cumulative_passive)
        print(R_cumulative)
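
task5 relies on several helpers that are not shown in this excerpt, most importantly self.choose_action, self.reward_prune and Agent.dict_to_np_state. Below is a rough sketch of an epsilon-greedy choose_action consistent with how it is called above; the helper parse_action_string and the exact return layout are invented for illustration only.

import random
import numpy as np

def choose_action(self, tbot, epsilon, state):
    '''Sketch: epsilon-greedy selection over the agent's Q table.'''
    if random.random() < epsilon:
        # explore: pick a random action index
        action_idx = random.randrange(len(tbot.idx_to_action))
    else:
        # exploit: greedy over the Q values of the current state tuple
        action_idx = int(np.argmax(tbot.q[tuple(state)]))
    action_string = tbot.idx_to_action[action_idx]
    # expanding the string into (action, items, params) depends on
    # action_config.json; parse_action_string is a hypothetical helper
    action, action_items, action_params = self.parse_action_string(action_string, tbot)
    return action, action_items, action_params, action_string

With epsilon = 0, as set in task5, this always exploits the learned Q table, which is exactly what an evaluation run needs.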
Example #3
if __name__ == "__main__":
    args = parser.parse_args()

    if args.command == 'get_current_state':
        print(api.get_current_state())
    elif args.command == 'is_terminal_state':
        current_state = api.get_current_state()
        print(api.is_terminal_state(current_state))
    elif args.command == 'reset_world':
        print(api.reset_world())
    elif args.command == 'get_all_actions':
        print(api.get_all_actions())
    elif args.command == 'get_possible_actions':
        current_state = api.get_current_state()
        print(api.get_possible_actions(current_state))
    elif args.command == 'get_possible_states':
        current_state = api.get_current_state()
        for robot in current_state['robots']:
            for action in api.get_possible_actions(current_state, robot):
                print(
                    api.get_possible_states(current_state, action,
                                            {'robot': robot}))
    elif args.command == 'execute_action':
        current_state = api.get_current_state()
        success, next_state = api.execute_action(args.action, {})
        print(success)
        print(api.get_reward(current_state, args.action, next_state))
    elif args.command == 'get_path':
        print(api.get_path('robot0', (0.0, -2.0)))
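
The __main__ block above references a module-level parser that is not part of this excerpt. A plausible argparse setup matching the commands dispatched above is sketched below; the exact shape of the action argument is an assumption.

import argparse
import environment_api as api  # assumed alias, as in the other examples

parser = argparse.ArgumentParser(description='Thin CLI around the environment API')
parser.add_argument('command',
                    choices=['get_current_state', 'is_terminal_state', 'reset_world',
                             'get_all_actions', 'get_possible_actions',
                             'get_possible_states', 'execute_action', 'get_path'],
                    help='API function to invoke')
parser.add_argument('--action', default='',
                    help='action name forwarded to execute_action')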
Example #4
    def task4(self, episodes):
        '''Train with 3 books for each tbot.'''
        #pdb.set_trace()
        q_values = {}

        actions_json_file = '/action_config.json'

        with open(self.root_path + actions_json_file) as json_file:
            try:
                self.action_reference = json.load(json_file, parse_float=float)
            except (ValueError, KeyError, TypeError):
                print "JSON error"


        # =============================================================================
        #       episode parameters
        # =============================================================================
        # there are actually 2*episodes episodes, since there are two tbots
        episode_update = 2  # number of episodes one tbot trains while the other tbot's policy stays constant; must be a divisor of episodes
        episode_blocks = int(
            episodes / episode_update
        )  # number of episode blocks; in each block one tbot updates its Q table while the other only acts
        # =============================================================================
        #       epsilon parameters & set penalty values
        # =============================================================================
        epsilon_initial = .95
        epsilon_decay = .002
        epsilon_calc = lambda epsilon_initial, epsilon_decay, i: max(
            0.05, epsilon_initial - epsilon_decay * i)
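        # e.g. with epsilon_initial = 0.95 and epsilon_decay = 0.002, epsilon
        # decays linearly and reaches the 0.05 floor after 450 episodes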

        self.book_penalty = -100
        self.bump_penalty = -100

        # =============================================================================
        #         q tables initialized to zero
        # =============================================================================
        q1 = np.zeros(
            (7, 7, 4, 2, 2, 2, 2, 2, 2, 5, 9)
        )  # (x, y, orientation, c1..c6, tbot_near, action); ci is 0 while book i is still available and 1 once picked up
        q2 = np.zeros((7, 7, 4, 2, 2, 2, 2, 2, 2, 5, 9))

        # =============================================================================
        #       Create Agents
        # =============================================================================

        agent1_books = [1, 2, 3]
        agent2_books = [4, 5, 6]

        agent1 = Agent('robot1', q1, agent1_books, more_books=True)
        agent2 = Agent('robot2', q2, agent2_books, more_books=True)

        tbot_list = [agent1, agent2]

        R_cumulative = {agent1.name: [], agent2.name: []}

        # =============================================================================
        #       acting and training
        # =============================================================================

        for i in range(episode_blocks):
            epsilon = epsilon_calc(
                epsilon_initial, epsilon_decay,
                i)  # block-level epsilon (recomputed per episode below)
            for tbot in tbot_list:  # determines which tbot is learning: the active one updates its Q table, the passive one does not
                tbot_active = tbot
                tbot_passive_set = set(tbot_list) - set([tbot])
                tbot_passive = tbot_passive_set.pop()
                for e in range(episode_update):  # cycle through the episodes inside an episode block
                    epsilon = epsilon_calc(epsilon_initial, epsilon_decay,
                                           i * episode_update + e)
                    # a single episode
                    api.reset_world()
                    R_cumulative_active = 0
                    R_cumulative_passive = 0
                    initial_state = api.get_current_state()
                    current_state = initial_state

                    state_active = tbot_active.dict_to_np_state(
                        current_state, tbot_passive)  # active bot's state tuple
                    state_passive = tbot_passive.dict_to_np_state(
                        current_state, tbot_active)  # passive bot's state tuple
                    #pdb.set_trace()
                    print('episode_block {0} episode {1} for tbot {2}'.format(
                        i, e, tbot_active.name))
                    while not api.is_terminal_state(current_state):

                        through_api = True  # flag: route this action through the API; if False, the reward is assigned manually

                        # =============================================================================
                        #               active tbot acts and learns
                        # =============================================================================
                        #pick action for tbot_active
                        #choose either random or exploit, according to epsilon=epsilon_calc(epsilon_initial, epsilon_decay, i)
                        # =============================================================================
                        #                 if state_active[0]>=5 or state_passive[0]>=5:
                        #                     pdb.set_trace()
                        # =============================================================================
                        action_A, action_items_A, action_params_A, action_string_A = self.choose_action(
                            tbot_active, epsilon,
                            state_active)  #selects action

                        through_api, next_state, reward = self.reward_prune(
                            current_state, state_active, state_passive,
                            action_A, action_items_A, tbot_active, tbot_passive
                        )  #prunes by checking for invalid actions, in which case we don't run through environment_api
                        #pdb.set_trace()
                        if through_api:
                            success, next_state = api.execute_action(
                                action_A, action_params_A, tbot_active.name)
                            reward = api.get_reward(current_state, action_A,
                                                    next_state)

                        R_cumulative_active += reward

                        next_state_active = tbot_active.dict_to_np_state(
                            next_state, tbot_passive)

                        #update q_values of tbot_active ONLY

                        state_action_idx = tuple(state_active) + tuple(
                            [tbot_active.idx_to_action.index(action_string_A)])
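                        # Q-learning update: Q(s,a) <- (1 - alpha) * Q(s,a) + alpha * (reward + gamma * max_a' Q(s', a'))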
                        tbot_active.q[state_action_idx] = (
                            1 - self.alpha
                        ) * tbot_active.q[state_action_idx] + self.alpha * (
                            reward +
                            self.gamma * max(tbot_active.q[next_state_active]))

                        current_state = next_state  # update current state so the other tbot knows the updated state before choosing an action

                        state_active = tbot_active.dict_to_np_state(
                            current_state,
                            tbot_passive)  # active bot's state tuple
                        state_passive = tbot_passive.dict_to_np_state(
                            current_state,
                            tbot_active)  # passive bot's state tuple

                        # =============================================================================
                        #               passive tbot acts and does NOT learn
                        # =============================================================================
                        action_P, action_items_P, action_params_P, action_string_P = self.choose_action(
                            tbot_passive, epsilon, state_passive)

                        through_api, next_state, reward = self.reward_prune(
                            current_state, state_passive, state_active,
                            action_P, action_items_P, tbot_passive,
                            tbot_active)  #reward won't be used

                        if through_api:
                            success, next_state = api.execute_action(
                                action_P, action_params_P, tbot_passive.name)
                            reward = api.get_reward(current_state, action_P,
                                                    next_state)

                        R_cumulative_passive += reward
                        current_state = next_state  # update current state for the active tbot

                        state_active = tbot_active.dict_to_np_state(
                            current_state,
                            tbot_passive)  # active bot's state tuple
                        state_passive = tbot_passive.dict_to_np_state(
                            current_state,
                            tbot_active)  # passive bot's state tuple

                    R_cumulative[tbot_active.name].append(R_cumulative_active)
                    R_cumulative[tbot_passive.name].append(
                        R_cumulative_passive)
                    print(R_cumulative)
                    print('epsilon on the {0}th episode is {1}'.format(
                        i * episode_update + e, epsilon))
        np.save('q_table_r1_6.npy', agent1.q)
        np.save('q_table_r2_6.npy', agent2.q)

        import pickle
        with open("robot1_rewards_6.txt", "wb") as f:
            pickle.dump(R_cumulative['robot1'], f)

        with open("robot2_rewards_6.txt", "wb") as f:
            pickle.dump(R_cumulative['robot2'], f)
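
The artifacts written at the end of task4 can be reloaded later, for example by task5 above or for plotting the learning curves. A small sketch follows; the file names are taken from the np.save / pickle.dump calls above, everything else is only a suggestion.

import pickle
import numpy as np

# reload the trained Q tables
q1 = np.load('q_table_r1_6.npy')
q2 = np.load('q_table_r2_6.npy')

# reload the per-episode cumulative rewards
with open('robot1_rewards_6.txt', 'rb') as f:
    rewards_robot1 = pickle.load(f)
with open('robot2_rewards_6.txt', 'rb') as f:
    rewards_robot2 = pickle.load(f)

print(q1.shape, len(rewards_robot1))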