Example #1
    def value_iteration(self):
        # initialize a deterministic policy that always takes action 0
        policy: Dict[FarmState, List[float]] = {}
        for state in self.states:
            policy[state] = [1.0, 0.0, 0.0, 0.0]

        def _update():
            # redraw state values and policy arrows in the GUI
            self._update_state_vals_color()
            self._update_state_vals_text()
            self._update_policy(policy)
            self.window.update()

        _update()

        change: float = np.inf
        itr: int = 0

        # iterate until the value function stops changing between sweeps
        while change > 0:
            # one synchronous Bellman backup over all states
            change, self.state_vals = value_iteration_step(
                self.env, self.states, self.state_vals, self.discount)
            itr += 1

            # make the visualized policy greedy with respect to the new values
            for state in self.states:
                action: int = get_action(self.env, state, self.state_vals,
                                         self.discount)
                policy[state] = [0.0, 0.0, 0.0, 0.0]
                policy[state][action] = 1.0

            if self.wait > 0.0:
                time.sleep(self.wait)
                _update()

            print("VI Itr: %i, Delta: %E" % (itr, change))

        _update()

        # save answer: record the greedy action for every state under the final values
        actions: List[int] = []
        for state in self.states:
            action: int = get_action(self.env, state, self.state_vals,
                                     self.discount)
            actions.append(action)

        # pickle.dump((self.state_vals, actions, itr), open("value_iteration.pkl", "wb"), protocol=-1)

        print("DONE")
Example #2
    def policy_iteration(self, num_eval_itrs: int, wait_eval: float):
        # initialize a uniform random policy over the four actions
        policy: Dict[FarmState, List[float]] = {}
        for state in self.states:
            policy[state] = [0.25, 0.25, 0.25, 0.25]

        def _update():
            # redraw state values and policy arrows in the GUI
            self._update_state_vals_color()
            self._update_state_vals_text()
            self._update_policy(policy)
            self.window.update()

        _update()

        state_vals_all: List[Dict] = []

        policy_changed: bool = True
        itr: int = 0
        while policy_changed:
            # policy evaluation
            self.policy_evaluation(num_eval_itrs, policy, wait_eval)

            # policy improvement
            policy_new: Dict[FarmState, List[float]] = {}
            for state in self.states:
                action: int = get_action(self.env, state, self.state_vals,
                                         self.discount)
                policy_new[state] = [0.0, 0.0, 0.0, 0.0]
                policy_new[state][action] = 1.0

            # check for convergence
            policy_changed = policy != policy_new
            policy = policy_new
            itr += 1

            # visualize
            if self.wait > 0.0:
                _update()
                time.sleep(self.wait)

            print("Policy iteration itr: %i" % itr)
            state_vals_all.append(self.state_vals.copy())

        _update()

        # save the value function recorded after each policy-iteration step
        import pickle
        file_name: str = ("policy_itr_state_vals_all_rand_right%.1f.pkl" %
                          self.env.rand_right_prob)
        with open(file_name, "wb") as pkl_file:
            pickle.dump(state_vals_all, pkl_file, protocol=-1)

        print("DONE")
Example #3
        def _update():
            # query the network for the value estimate of every state
            nnet.eval()
            for state_i in self.states:
                nnet_input_np_state = self.env.state_to_nnet_input(state_i)
                nnet_input = torch.tensor(nnet_input_np_state, device=device)
                state_val: float = nnet(
                    nnet_input.float()).detach().cpu().numpy()[0, 0]
                self.state_vals[state_i] = state_val

            # make the displayed policy greedy with respect to those values
            for state_i in self.states:
                action: int = get_action(self.env, state_i, self.state_vals,
                                         self.discount)
                policy[state_i] = [0.0, 0.0, 0.0, 0.0]
                policy[state_i][action] = 1.0

            self._update_state_vals_color()
            self._update_state_vals_text()
            self._update_policy(policy)
            self.window.update()
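Example #3 shows only the _update closure from a neural-network (approximate) value iteration routine; the enclosing training loop is not included on this page. One plausible continuation of that method is sketched below. The Adam optimizer, MSE regression onto Bellman-backup targets, the fixed number of outer iterations, and the reuse of value_iteration_step from the first sketch are all assumptions; numpy and torch are assumed to be imported at module level as in the other examples.

        # --- hypothetical continuation of the enclosing method ---
        optimizer = torch.optim.Adam(nnet.parameters(), lr=1e-3)
        criterion = torch.nn.MSELoss()

        _update()

        for train_itr in range(100):  # number of outer iterations (assumed)
            # one synchronous Bellman backup on the current network-derived
            # value estimates gives the regression targets
            _, target_vals = value_iteration_step(self.env, self.states,
                                                  self.state_vals, self.discount)

            # assumes state_to_nnet_input returns an array with a batch
            # dimension of 1, matching the [0, 0] indexing in _update
            inputs_np = np.concatenate(
                [self.env.state_to_nnet_input(s) for s in self.states], axis=0)
            targets_np = np.array([[target_vals[s]] for s in self.states],
                                  dtype=np.float32)

            # regress the network onto the backed-up values
            nnet.train()
            optimizer.zero_grad()
            loss = criterion(
                nnet(torch.tensor(inputs_np, device=device).float()),
                torch.tensor(targets_np, device=device))
            loss.backward()
            optimizer.step()

            # refresh self.state_vals and the display from the retrained network
            _update()
            print("Approx VI itr: %i, loss: %.3E" % (train_itr + 1, loss.item()))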