Example #1
0
    def reward(self, state, action):
        """Step reward: 0.0 at the terminal state (state 0), -1.0 everywhere else.

        The action argument is accepted for interface uniformity but does not
        affect the reward.
        """
        return 0.0 if state == 0 else -1.0

    def get_actions(self, state):
        """Return all action indices; every action is available in every state.

        The state argument is unused here (uniform action set) but kept for
        interface consistency with agents that restrict actions per state.
        """
        # list(range(...)) is the idiomatic form of the identity comprehension.
        return list(range(self.action_size))

    def print_evaluation(self):
        """Print the state values self.v as a 4x4 grid.

        Cell (row, col) maps to state (row*4 + col) % 15, so the two terminal
        corners of the grid share value index 0.
        """
        print("value matrix")
        for row in range(4):
            line = ' '.join(
                "{:.3f}".format(self.v[(row * 4 + col) % 15]) for col in range(4)
            )
            # Original emitted a trailing space after the last cell of each row.
            print(line, end=' \n')
    
    def print_improvement(self):
        """Print the policy entries self.p as a 4x4 grid.

        Uses the same (row*4 + col) % 15 state indexing as print_evaluation.
        """
        print("policy matrix")
        for row in range(4):
            cells = [str(self.p[(row * 4 + col) % 15]) for col in range(4)]
            # Trailing space before the newline matches the original output.
            print(' '.join(cells), end=' \n')

if __name__ == "__main__":
    # 15-state environment, 4 actions, discount factor 1.0 (undiscounted).
    agent = myAgent(15, 4, 1.0)
    solver = DP.algo(agent, threshold=0.0001)
    solver.policy_evaluation(show=True)
    # solver.policy_iteration(show=True)
    # solver.value_iteration(show=True)
Example #2
0
        num1 = state // 21
        num2 = state % 21
        actions = []
        for a in range(self.action_size):
            moved = a - 5
            if num1 + moved < 0 or num2 - moved < 0: continue
            actions.append(a)
        return actions

    def print_evaluation(self):
        """Print the state values self.v as a 21x21 grid.

        Cell (row, col) maps directly to state row*21 + col.
        """
        print("value matrix")
        for row in range(21):
            line = ' '.join(
                "{:.3f}".format(self.v[row * 21 + col]) for col in range(21)
            )
            # Trailing space before the newline matches the original output.
            print(line, end=' \n')

    def print_improvement(self):
        """Print the policy entries self.p as a 21x21 grid.

        Uses the same row*21 + col state indexing as print_evaluation.
        """
        print("policy matrix")
        for row in range(21):
            cells = [str(self.p[row * 21 + col]) for col in range(21)]
            # Trailing space before the newline matches the original output.
            print(' '.join(cells), end=' \n')


if __name__ == "__main__":
    # 21*21-state environment, 11 actions, discount factor 0.9.
    agent = Agent(21 * 21, 11, 0.9)
    solver = DP.algo(agent, 0.0001)
    solver.policy_iteration()  #show=True)