def update(self, state, action, nextState, reward):
    # Queue the observed transition, then back up every queued transition (RTDP-style).
    self.states_to_backup.add((state, action, nextState, reward))
    print(len(self.mdp.getStates()))  # debug output: total number of MDP states
    changed_states = []
    for state, action_, next_state_, reward in self.states_to_backup:
        # compute max q-value over all actions available in this state
        max_qval = float('-inf')
        self.action_vals[state] = []
        for action in self.mdp.getPossibleActions(state):
            # use the observed reward only for the action that was actually taken
            qval = self.computeQValueFromValues(
                state, action, reward=reward if action == action_ else None)
            if qval > max_qval:
                max_qval = qval
                #self.best_actions[state] = action  # store best action for policy
            if self.values[state] != max_qval:
                changed_states.append((state, action_, next_state_, reward))
            self.values[state] = max_qval  # update value of current state to be the max
            self.action_vals[state].append((qval, action))
        self.num_state_updates += 1
        if self.num_state_updates == self.state_update_epoch:
            avg_return = self.compute_avg_return()
            fileio.append(avg_return, "avg_returns_RTDP")
            self.num_state_updates = 0
    self.states_to_backup = set()
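# The backup above relies on computeQValueFromValues accepting an optional observed
# reward; that method is not shown in this excerpt. The version below is a minimal
# sketch of one plausible implementation, assuming the usual one-step Bellman backup
# over self.mdp's transition model, with the observed reward substituted for the
# model reward when one is passed in. It is an assumption, not the repository's code.
def computeQValueFromValues(self, state, action, reward=None):
    qval = 0
    for next_state, prob in self.mdp.getTransitionStatesAndProbs(state, action):
        # fall back to the model reward unless an observed reward was provided
        r = reward if reward is not None else self.mdp.getReward(state, action, next_state)
        qval += prob * (r + self.discount * self.values[next_state])
    return qval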
def battery(capacity, status):
    t = datetime.datetime.now().isoformat()
    line = '{0} {1}% {2}\n'.format(t, capacity, status)
    fileio.append(line, LOG_BATTERY_ALL_FILE)
    fileio.append(line, LOG_BATTERY_FILE)
    lines_threshold = config.get_entry('log_capacity_lines_limit', default_value=None)
    fileio.remove_front_lines_if_too_many(LOG_BATTERY_FILE, lines_threshold)
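# Minimal usage sketch for battery(): read the current charge level and status and
# log one line. The sysfs path and the _read helper are illustrative assumptions
# (a typical Linux laptop exposes the battery under /sys/class/power_supply/BAT0);
# they are not part of this module.
def _read(path):
    with open(path) as f:
        return f.read().strip()

def log_battery_once():
    capacity = _read('/sys/class/power_supply/BAT0/capacity')
    status = _read('/sys/class/power_supply/BAT0/status')
    battery(capacity, status)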
def __init__(self, mdp, environment, discount=0.9, iterations=100, display=None):
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()
    self.environment = environment
    self.action_vals = defaultdict(list)
    for state in self.mdp.getStates():
        self.action_vals[state] = [
            (0, action) for action in self.mdp.getPossibleActions(state)
        ]
    self.num_state_updates = 0
    self.state_update_epoch = 10  # len(self.mdp.getStates())

    for i in range(self.iterations):
        print("Iteration", i)
        # fresh Counter for the updated values (synchronous / batch update)
        new_values = util.Counter()
        # for each state
        for j, state in enumerate(self.mdp.getStates()):
            # compute max q-value over all actions
            max_qval = float('-inf')
            self.action_vals[state] = []
            for action in self.mdp.getPossibleActions(state):
                qval = self.computeQValueFromValues(state, action)
                if qval > max_qval:
                    max_qval = qval
                new_values[state] = max_qval  # update value of current state to be the max
                self.action_vals[state].append((qval, action))
            display.displayValues(self, state, "CURRENT VALUES", showActions=False)
            if i == 0 and j == 0:
                input()
            self.num_state_updates += 1
            if self.num_state_updates == self.state_update_epoch:
                avg_return = self.compute_avg_return()
                fileio.append(avg_return, "avg_returns_Value Iteration")
                self.num_state_updates = 0
        self.values = new_values
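# compute_avg_return() is called above but not defined in this excerpt. The sketch
# below shows one plausible implementation, assuming a gridworld-style environment
# with reset(), getCurrentState(), getPossibleActions(state), and doAction(action)
# returning (nextState, reward), and a greedy policy read from self.action_vals.
# All of those details are assumptions, not the repository's actual code.
def compute_avg_return(self, episodes=10, max_steps=100):
    total = 0.0
    for _ in range(episodes):
        self.environment.reset()
        ret, discount = 0.0, 1.0
        for _ in range(max_steps):
            state = self.environment.getCurrentState()
            if not self.environment.getPossibleActions(state):
                break  # terminal state
            # greedy action w.r.t. the current q-value estimates
            _, action = max(self.action_vals[state])
            _, reward = self.environment.doAction(action)
            ret += discount * reward
            discount *= self.discount
        total += ret
    return total / episodes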
# now do a time-bounded episode without updating any state-values
#ret = runEpisode(a, env, opts.discount, decisionCallback, displayCallback, messageCallback, pauseCallback, episode, update=False, bounded=True)
#agent_name = opts.agent
#fileio.append((timestep, ret), "results/returns_learning_" + agent_name + "_" + str(opts.learningRate) + "_" + str(opts.epsilon) + "_" + str(opts.stepn) + "_" + str(opts.sigma))

if opts.episodes > 0:
    print()
    print("AVERAGE RETURNS FROM START STATE: " + str((returns + 0.0) / opts.episodes))
    print()
    print()

# now gather stats
#print("Gathering stats...")
for episode in range(1, opts.post_eps + 1):
    timestep, ret = runEpisode(a, env, opts.discount, decisionCallback, None,
                               messageCallback, pauseCallback, episode, update=False)
    agent_name = opts.agent
    fileio.append((timestep, ret),
                  "results/returns_post_" + agent_name + "_" + str(opts.learningRate) +
                  "_" + str(opts.epsilon) + "_" + str(opts.stepn) + "_" + str(opts.sigma))

'''
# DISPLAY POST-LEARNING VALUES / Q-VALUES
if (opts.agent in ['q', 'n_step_sarsa', 'n_step_expected_sarsa', 'tree_backup', 'qsigma']) and not opts.manual:
    try:
        display.displayQValues(a, message="Q-VALUES AFTER " + str(opts.episodes) + " EPISODES")
        display.pause()
        input("")
        display.displayValues(a, message="VALUES AFTER " + str(opts.episodes) + " EPISODES")
        display.pause()
        input("")
    except KeyboardInterrupt:
        sys.exit(0)
'''