# NOTE(review): every function below takes `self`, so they are presumably
# methods of the agent class whose header lies outside this view -- indent
# them under that class if so (TODO confirm).


def reward(self, state, action):
    """Return the immediate reward for taking *action* in *state*.

    State 0 yields 0.0; every other state yields -1.0. *action* is accepted
    but does not influence the result.
    """
    return 0.0 if state == 0 else -1.0


def get_actions(self, state):
    """Return every action index, ``0 .. self.action_size - 1``.

    *state* is accepted but not consulted: the same list is returned for
    every state.
    """
    return list(range(self.action_size))


def print_evaluation(self):
    """Print the entries of ``self.v`` as a 4x4 grid with three decimals."""
    print("value matrix")
    for row in range(4):
        for col in range(4):
            # Cell 15 wraps around to index 0, so the last cell of the grid
            # shows the same entry as the first one.
            state_idx = (row * 4 + col) % 15
            print("{:.3f}".format(self.v[state_idx]), end=' ')
        print()


def print_improvement(self):
    """Print the entries of ``self.p`` as a 4x4 grid."""
    print("policy matrix")
    for row in range(4):
        for col in range(4):
            # Same wrap-around indexing as print_evaluation.
            state_idx = (row * 4 + col) % 15
            print(self.p[state_idx], end=' ')
        print()


if __name__ == "__main__":
    test = myAgent(15, 4, 1.0)
    method = DP.algo(test, threshold=0.0001)
    method.policy_evaluation(show=True)
    # Alternative runs, left disabled:
    # method.policy_iteration(show=True)
    # method.value_iteration(show=True)
# NOTE(review): the physical line below holds several statements and
# definitions whose line breaks and indentation have been lost; it cannot be
# parsed as written and needs its structure restored.
#
# It opens with the tail of a function whose `def` line is not visible here
# (presumably an action-listing method such as get_actions -- TODO confirm).
# That tail splits `state` into num1 = state // 21 and num2 = state % 21, then
# walks a over range(self.action_size) with moved = a - 5, skipping any a for
# which num1 + moved < 0 or num2 - moved < 0, and returns the remaining values
# of a as a list. No upper bound is checked in the visible code.
#
# print_evaluation(self): prints "value matrix", then self.v[i * 21 + j] for
# i, j in 0..20, formatted to three decimals, one output row per i.
#
# print_improvement(self): prints "policy matrix", then self.p[i * 21 + j] in
# the same 21x21 layout, unformatted.
#
# __main__ guard: builds Agent(21 * 21, 11, 0.9), passes it to
# DP.algo(agent, 0.0001) and calls policy_iteration() with no arguments; the
# trailing "#show=True)" is a leftover comment. The 11 is presumably the
# action count, which would make moved range over -5..5 -- verify against
# Agent's constructor.
num1 = state // 21 num2 = state % 21 actions = [] for a in range(self.action_size): moved = a - 5 if num1 + moved < 0 or num2 - moved < 0: continue actions.append(a) return actions def print_evaluation(self): print("value matrix") for i in range(21): for j in range(21): state_idx = (i * 21 + j) print("{:.3f}".format(self.v[state_idx]), end=' ') print() def print_improvement(self): print("policy matrix") for i in range(21): for j in range(21): state_idx = (i * 21 + j) print(self.p[state_idx], end=' ') print() if __name__ == "__main__": agent = Agent(21 * 21, 11, 0.9) method = DP.algo(agent, 0.0001) method.policy_iteration() #show=True)