Example #1
import numpy as np

# GridWorld, BOARD_ROWS and BOARD_COLS are assumed to be provided by the
# surrounding module (the grid environment class and the board dimensions).


class Agent:

    def __init__(self):
        self.actions = ["up", "down", "left", "right"]
        self.num_actions = len(self.actions)
        self.grid_world = GridWorld()

        # state-value table: every grid cell starts with value 0
        self.state_values = {}
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                self.state_values[(i, j)] = 0  # initial value estimate
        # map each grid cell (i, j) to a flat index for the numpy arrays below
        self.state_indices = {}
        k = 0
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                self.state_indices[(i, j)] = k
                k += 1

        self.num_states = len(self.state_values)
        self.state_values_vec = np.zeros(self.num_states)  # current V(s) estimate, flat
        self.rewards = np.zeros(self.num_states)            # empirical mean reward r(s)

        # empirical transition model p(s'|s, a), shape (num_states, num_actions, num_states)
        self.state_transition_prob = np.zeros((self.num_states, self.num_actions, self.num_states))
        #self.get_observation_by_random(20000)

        self.discount = 0.99

    def count_base_reward(self, state):
        """
        Estimate the average reward observed at `state` from the recorded
        observation sequence.
        """
        rewards_at_s = [exp[2] for exp in self.observation if exp[0] == state]
        try:
            r = sum(rewards_at_s) / len(rewards_at_s)
        except ZeroDivisionError:
            r = 0  # state never visited in the observations
        return r

    def count_base_prob(self, state, action, nxtState):
        """
        Estimate the transition probability p(nxtState | state, action) by
        counting transitions in the recorded observation sequence.
        """
        # only the first m-1 observations have a successor at index i+1
        count_s_a = [exp for exp in self.observation[:-1]
                     if exp[0] == state and exp[1] == action]
        count_s_a_nxt = [exp for i, exp in enumerate(self.observation[:-1])
                         if exp[0] == state and exp[1] == action
                         and self.observation[i + 1][0] == nxtState]
        try:
            p = len(count_s_a_nxt) / len(count_s_a)
        except ZeroDivisionError:
            p = 0  # (state, action) pair never observed
        return p


    def get_observation_by_random(self, num_of_samples):
        """
        Collect an observation sequence by letting the agent move at random.
        Each entry is a (state, action, reward) triple: s1, a1, r1, ..., sm, am, rm.
        """
        observation = []
        for _ in range(num_of_samples):
            s_a_r = list()
            r_action = self.choose_random_action()
            s_a_r.append(self.grid_world.state)
            s_a_r.append(r_action)
            reward = self.grid_world.giveReward()
            s_a_r.append(reward)

            # by taking the action, the agent reaches the next state
            self.grid_world = self.takeAction(r_action)
            observation.append(s_a_r)

        self.observation = observation
        self.make_transition_matrix()

    def make_transition_matrix(self):
        """
        Build the empirical transition matrix p(s'|s,a) and the reward
        vector r(s) from the recorded observations.
        """
        for state in self.state_values.keys():
            self.rewards[self.state_indices[state]] = self.count_base_reward(state)
            for action in self.actions:
                for nxtState in self.state_values.keys():
                    # p, r
                    p = self.count_base_prob(state, action, nxtState)
                    self.state_transition_prob[
                        self.state_indices[state], self.actions.index(action), self.state_indices[nxtState]] += p

    def takeAction(self, action):
        position = self.grid_world.nxtPosition(action)
        # update GridWorld
        return GridWorld(state=position)

    def choose_random_action(self):
        # pick one of the four actions uniformly at random
        action = np.random.choice(self.actions)
        return action


    def update(self):

        # Compute the action values with one Bellman backup:
        #   Q(s, a) = r(s) + discount * sum_{s'} p(s'|s, a) * V(s')
        _action_values = np.repeat(self.rewards, self.num_actions).reshape(
            (self.num_states, self.num_actions)
        ) + self.discount * np.sum(
            self.state_transition_prob * self.state_values_vec, axis=2, keepdims=False
        )

        # Evaluate the deterministic policy $\pi(s)$
        self.policy = np.argmax(_action_values, axis=1)

        # Compute the values $V(s)$
        values = np.max(_action_values, axis=1, keepdims=False)

        # Compute the value difference $\|V_{k} - V_{k+1}\|$ to check convergence
        diff = np.linalg.norm(self.state_values_vec[:] - values[:])

        # Update the current value estimate
        self.state_values_vec = values

        return diff, values

    def nxtPosition(self, state, action):
        if action == "up":
            nxtState = (state[0] - 1, state[1])
        elif action == "down":
            nxtState = (state[0] + 1, state[1])
        elif action == "left":
            nxtState = (state[0], state[1] - 1)
        else:
            nxtState = (state[0], state[1] + 1)
        # stay inside the board (rows 0..2, columns 0..3) and avoid the blocked cell (1, 1)
        if (nxtState[0] >= 0) and (nxtState[0] <= 2):
            if (nxtState[1] >= 0) and (nxtState[1] <= 3):
                if nxtState != (1, 1):
                    return nxtState
        return state

    # def giveReward(self, state):
    #     if state == WIN_STATE:
    #         return 1
    #     elif state == LOSE_STATE:
    #         return -1
    #     else:
    #         return 0

    def fit(self, max_iteration=1e3, tolerance=1e-3, verbose=False, logging=False):

        if logging:
            history = []

        # Value iteration loop
        for _iter in range(1, int(max_iteration + 1)):

            # Update the value estimate
            diff, values = self.update()
            if logging:
                history.append(diff)
            if verbose:
                print('Iteration: {0}\tValue difference: {1}'.format(_iter, diff))

            # Check the convergence
            if diff < tolerance:
                if verbose:
                    print('Converged at iteration {0}.'.format(_iter))
                break

        if logging:
            return diff, history, values, self.policy
        else:
            return diff
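
A minimal usage sketch for this example (not part of the original snippet), assuming GridWorld, BOARD_ROWS and BOARD_COLS are defined as above; the sample count and tolerance are illustrative values:

agent = Agent()
agent.get_observation_by_random(20000)  # estimate p(s'|s,a) and r(s) from random moves
diff, history, values, policy = agent.fit(max_iteration=1000, tolerance=1e-3,
                                          verbose=True, logging=True)
print(policy.reshape(BOARD_ROWS, BOARD_COLS))  # greedy action index for each cell
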
Example #2
class Agent:

    def __init__(self):
        self.states = []  # record position and action taken at the position
        self.actions = ["up", "down", "left", "right"]
        self.grid_world = GridWorld()
        self.isEnd = self.grid_world.isEnd
        self.lr = 0.2
        self.exp_rate = 0.3
        self.decay_gamma = 0.9

        # initial Q values
        self.Q_values = {}
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                self.Q_values[(i, j)] = {}
                for a in self.actions:
                    self.Q_values[(i, j)][a] = 0  # Q value is a dict of dict

    def chooseAction(self):
        # epsilon-greedy: explore with probability exp_rate, otherwise act greedily.
        # Start the running maximum at -inf so that negative Q-values can still be selected.
        mx_nxt_reward = -np.inf
        action = ""

        if np.random.uniform(0, 1) <= self.exp_rate:
            action = np.random.choice(self.actions)
        else:
            # greedy action
            for a in self.actions:
                current_position = self.grid_world.state
                nxt_reward = self.Q_values[current_position][a]
                if nxt_reward >= mx_nxt_reward:
                    action = a
                    mx_nxt_reward = nxt_reward
            # print("current pos: {}, greedy aciton: {}".format(self.grid_world.state, action))
        return action

    def takeAction(self, action):
        position = self.grid_world.nxtPosition(action)
        # update GridWorld
        return GridWorld(state=position)

    def reset(self):
        self.states = []
        self.grid_world = GridWorld()
        self.isEnd = self.grid_world.isEnd

    def play(self, rounds=10):
        i = 0
        while i < rounds:
            # at the end of an episode, propagate the terminal reward
            # backwards through the visited (state, action) pairs
            if self.grid_world.isEnd:
                reward = self.grid_world.giveReward()
                for a in self.actions:
                    self.Q_values[self.grid_world.state][a] = reward
                print("Game End Reward", reward)
                for s in reversed(self.states):
                    current_q_value = self.Q_values[s[0]][s[1]]
                    reward = current_q_value + self.lr * (self.decay_gamma * reward - current_q_value)
                    self.Q_values[s[0]][s[1]] = round(reward, 3)
                self.reset()
                i += 1
            else:
                action = self.chooseAction()
                # append trace
                self.states.append([(self.grid_world.state), action])
                print("current position {} action {}".format(self.grid_world.state, action))
                # by taking the action, it reaches the next state
                self.grid_world = self.takeAction(action)
                # update the end-of-episode flag after the move
                self.grid_world.isEndFunc()
                print("nxt state", self.grid_world.state)
                print("---------------------")
                self.isEnd = self.grid_world.isEnd
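
A minimal usage sketch (not part of the original snippet); the number of rounds is an illustrative value and the inspected cell assumes the usual 3x4 board:

agent = Agent()
agent.play(rounds=50)          # run 50 episodes with the backward reward update
print(agent.Q_values[(2, 0)])  # learned action values for one grid cell
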
Example #3
class Agent:

    def __init__(self):
        self.states = []  # record position and action taken at the position
        self.actions = ["up", "down", "left", "right"]
        self.grid_world = GridWorld()
        self.isEnd = self.grid_world.isEnd
        self.lr = 0.2
        self.exp_rate = 0.3
        self.decay_gamma = 0.9

        # initial Q values
        self.Q_values = {}
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                self.Q_values[(i, j)] = {}
                for a in self.actions:
                    self.Q_values[(i, j)][a] = 0  # Q value is a dict of dict
        self.state_values_vec = np.zeros((len(self.Q_values)))
        self.policy_list = list()

    def chooseAction(self):
        # epsilon-greedy: explore with probability exp_rate, otherwise act greedily
        mx_nxt_reward = -np.inf
        action = ""

        if np.random.uniform(0, 1) <= self.exp_rate:
            action = np.random.choice(self.actions)
        else:
            # greedy action
            for a in self.actions:
                current_position = self.grid_world.state
                nxt_reward = self.Q_values[current_position][a]
                if nxt_reward >= mx_nxt_reward:
                    action = a
                    mx_nxt_reward = nxt_reward
            # print("current pos: {}, greedy aciton: {}".format(self.grid_world.state, action))
        return action

    def takeAction(self, action):
        position = self.grid_world.nxtPosition(action)
        # update GridWorld
        return GridWorld(state=position)

    def reset(self):
        self.states = []
        self.grid_world = GridWorld()
        self.isEnd = self.grid_world.isEnd

    def play(self, rounds=10):
        i = 0
        histories = list()
        data_sample = 0
        while i < rounds:
            # at the end of an episode, update Q backwards along the visited states
            if self.grid_world.isEnd:
                # back up from the terminal state
                reward = self.grid_world.giveReward()
                for a in self.actions:
                    self.Q_values[self.grid_world.state][a] = reward
                #print("Game End Reward", reward)
                next_state = (self.grid_world.state, 'right')
                for s in reversed(self.states):
                    next_q_value = self.Q_values[next_state[0]][next_state[1]]
                    current_q_value = self.Q_values[s[0]][s[1]]
                    #reward = current_q_value + self.lr * (self.decay_gamma * reward - current_q_value)
                    reward = current_q_value + self.lr * (self.decay_gamma * next_q_value - current_q_value)
                    self.Q_values[s[0]][s[1]] = round(reward, 3)
                    next_state = s

                self.policy_list.append(self.make_policy_table_from_q())
                self.reset()

                i += 1
                # collect max_a Q(s, a) for every state into a flat numpy vector
                aa = [list(action_values.values()) for action_values in self.Q_values.values()]
                ab = np.array(aa, dtype=float)
                q_value_numpy = np.max(ab, axis=1, keepdims=False)
                diff = np.linalg.norm(self.state_values_vec[:] - q_value_numpy[:])
                self.state_values_vec = q_value_numpy
                #print("iter {0}".format(i))
                #print("diff {0}".format(diff))
                #print("qvalue {0}".format(np.linalg.norm(q_value_numpy[:])))
                print("{0}".format(data_sample))
                value_scalar= np.linalg.norm(q_value_numpy[:])
                histories.append((value_scalar, diff, int(data_sample)))

                data_sample = 0
            else:
                action = self.chooseAction()
                # append trace
                self.states.append([(self.grid_world.state), action]) #s, a
                #print("current position {} action {}".format(self.grid_world.state, action))
                # by taking the action, it reaches the next state
                self.grid_world = self.takeAction(action)
                # update the end-of-episode flag after the move
                self.grid_world.isEndFunc()
                #print("nxt state", self.grid_world.state)
                #print("---------------------{0}".format(i))
                self.isEnd = self.grid_world.isEnd
                data_sample += 1

        return histories, self.policy_list

    def make_policy_table_from_q(self):
        # greedy policy: index of the best action (argmax over Q) for every grid cell
        convert_list = list()
        for k, v in self.Q_values.items():
            convert_list.append(np.argmax(np.array(list(v.values()), float)))
        return convert_list
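
A minimal usage sketch (not part of the original snippet); the number of rounds is illustrative:

agent = Agent()
histories, policy_list = agent.play(rounds=100)
value_norms = [h[0] for h in histories]  # ||V|| after each episode
value_diffs = [h[1] for h in histories]  # change in V between episodes
steps_used = [h[2] for h in histories]   # environment steps taken in each episode
final_policy = np.array(policy_list[-1]).reshape(BOARD_ROWS, BOARD_COLS)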