def update(self, state, action, nextState, reward):
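        # RTDP-style backup: queue the observed transition, then run a full
        # Bellman backup over every queued (state, action, nextState, reward).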
        self.states_to_backup.add((state, action, nextState, reward))

        print(len(self.mdp.getStates()))  # debug: total number of states in the MDP

        changed_states = []
        for state, action_, next_state_, reward in self.states_to_backup:
            # compute max q-val over all actions
            max_qval = float("-inf")  # running maximum Q-value for this state
            self.action_vals[state] = []
            for action in self.mdp.getPossibleActions(state):
                qval = self.computeQValueFromValues(
                    state,
                    action,
                    # pass the observed reward only for the action actually taken
                    reward=reward if action == action_ else None)
                if qval > max_qval:
                    max_qval = qval
                    #self.best_actions[state] = action # store best action for policy
                    if self.values[state] != max_qval:
                        changed_states.append((state, action_, next_state_, reward))
                    self.values[state] = max_qval  # update value of current state to be the max
                self.action_vals[state].append((qval, action))

            self.num_state_updates += 1

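            # every state_update_epoch backups, log the current average return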
            if self.num_state_updates == self.state_update_epoch:
                avg_return = self.compute_avg_return()
                fileio.append(avg_return, "avg_returns_RTDP")
                self.num_state_updates = 0

        self.states_to_backup = set()
Example #2
def battery(capacity, status):
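    # assumes datetime is imported and that fileio, config and the
    # LOG_BATTERY_* constants are defined at module level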
    t = datetime.datetime.now().isoformat()
    line = '{0} {1}% {2}\n'.format(t, capacity, status)

    fileio.append(line, LOG_BATTERY_ALL_FILE)
    fileio.append(line, LOG_BATTERY_FILE)

    lines_threshold = config.get_entry('log_capacity_lines_limit',
                                       default_value=None)
    fileio.remove_front_lines_if_too_many(LOG_BATTERY_FILE, lines_threshold)
Example #3

    def __init__(self,
                 mdp,
                 environment,
                 discount=0.9,
                 iterations=100,
                 display=None):
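        # relies on util.Counter and collections.defaultdict being imported
        # at module level in the original source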
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()
        self.environment = environment

        self.action_vals = defaultdict(list)
        for state in self.mdp.getStates():
            self.action_vals[state] = [
                (0, action) for action in self.mdp.getPossibleActions(state)
            ]

        self.num_state_updates = 0
        self.state_update_epoch = 10  #len(self.mdp.getStates())

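        # synchronous value iteration: sweep every state each iteration and
        # write the updated values into a fresh Counter before swapping it in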
        for i in range(self.iterations):
            print("Iteration", i)
            # copy the current values.
            new_values = util.Counter()

            # for each state
            for j, state in enumerate(self.mdp.getStates()):
                # compute max q-val over all actions
                max_qval = float("-inf")  # running maximum Q-value for this state
                self.action_vals[state] = []
                for action in self.mdp.getPossibleActions(state):
                    qval = self.computeQValueFromValues(state, action)
                    if qval > max_qval:
                        max_qval = qval
                        new_values[state] = max_qval  # update value of current state to be the max
                    self.action_vals[state].append((qval, action))
                if display is not None:
                    display.displayValues(self,
                                          state,
                                          "CURRENT VALUES",
                                          showActions=False)
                if i == 0 and j == 0:
                    input()  # pause on the first state so the initial values can be inspected

                self.num_state_updates += 1
                if self.num_state_updates == self.state_update_epoch:
                    avg_return = self.compute_avg_return()
                    fileio.append(avg_return, "avg_returns_Value Iteration")
                    self.num_state_updates = 0

            self.values = new_values
Example #4
        # now do a time-bounded episode without updating any state-values
        #ret = runEpisode(a, env, opts.discount, decisionCallback, displayCallback, messageCallback, pauseCallback, episode, update=False, bounded=True)
        #agent_name = opts.agent
        #fileio.append((timestep, ret), "results/returns_learning_" + agent_name + "_" + str(opts.learningRate) + "_" + str(opts.epsilon) + "_" + str(opts.stepn) + "_" + str(opts.sigma))
        
    if opts.episodes > 0:
        print()
        print("AVERAGE RETURNS FROM START STATE: " + str((returns + 0.0) / opts.episodes))
        print()
        print()
    
    # now gather stats
    #print("Gathering stats...")
    for episode in range(1, opts.post_eps+1):
        timestep, ret = runEpisode(a, env, opts.discount, decisionCallback, None, messageCallback, pauseCallback, episode, update=False)
        agent_name = opts.agent
        fileio.append((timestep, ret), "results/returns_post_" + agent_name + "_" + str(opts.learningRate) + "_" + str(opts.epsilon) + "_" + str(opts.stepn) + "_" + str(opts.sigma))

    '''
    # DISPLAY POST-LEARNING VALUES / Q-VALUES
    if (opts.agent in ['q', 'n_step_sarsa', 'n_step_expected_sarsa', 'tree_backup', 'qsigma']) and not opts.manual:
        try:
            display.displayQValues(a, message = "Q-VALUES AFTER "+str(opts.episodes)+" EPISODES")
            display.pause()
            input("")
            display.displayValues(a, message = "VALUES AFTER "+str(opts.episodes)+" EPISODES")
            display.pause()
            input("")
        except KeyboardInterrupt:
            sys.exit(0)
    '''
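All of the snippets above funnel their output through a small fileio helper module that is not shown on this page. As a rough, hypothetical sketch only (behaviour inferred from the calls above, not taken from the actual module): append writes one line per call, and remove_front_lines_if_too_many trims a log file to a maximum line count.

# Hypothetical sketch of the fileio helpers used above; the real module may differ.
import os


def append(data, filename):
    # Append one line per call; non-string data (tuples, numbers) is written via str().
    line = data if isinstance(data, str) else str(data)
    with open(filename, "a") as f:
        f.write(line if line.endswith("\n") else line + "\n")


def remove_front_lines_if_too_many(filename, lines_threshold):
    # Drop the oldest lines so the file keeps at most lines_threshold lines.
    if lines_threshold is None or not os.path.exists(filename):
        return
    with open(filename) as f:
        lines = f.readlines()
    if len(lines) > lines_threshold:
        with open(filename, "w") as f:
            f.writelines(lines[-lines_threshold:])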