import numpy as np
from numpy import zeros


def convert(self, observations=True):
    if self.verbose and observations:
        # 5 observation blocks: the current cell plus its 4 neighbors.
        print('Number of features:', 5 * len(self.env._obstypes))
    # Enumerate every state reachable from the initial state.
    initSet = [self.env._initstate]
    self.states = sorted(flood(self.tryMoves, None, initSet))
    dim = len(self.states)
    if self.verbose:
        print('Actual states:', dim)
        print('Non-zero rewards:', self.rewards)
        print('Initial state:', initSet[0])
    # One transition matrix per action, plus one reward entry per state.
    Ts = [zeros((dim, dim)) for _ in self.env._actionset]
    R = zeros(dim)
    # Index lookups for states and actions.
    statedic = {}
    actiondic = {}
    for si, pos in enumerate(self.states):
        statedic[pos] = si
    for ai, a in enumerate(self.env._actionset):
        actiondic[a] = ai
    for pos, val in self.rewards.items():
        R[statedic[pos]] += val
    # Accumulate empirical transition counts from the (s, a, s') samples.
    for pos, a, dest in self.sas_tuples:
        ai = actiondic[a]
        si = statedic[pos]
        di = statedic[dest]
        Ts[ai][si, di] += 1. / self.avgOver
    if self.verbose:
        print('Built Ts.')
    # Normalize each row into a probability distribution; rows of
    # unvisited state-action pairs become self-loops.
    for T in Ts:
        for ti, row in enumerate(T):
            if row.sum() > 0:
                row /= row.sum()
            else:
                row[ti] = 1
    if self.verbose:
        print('Normalized Ts.')
    if observations:
        # One observation for the current position and each of the 4
        # neighbors; each column holds one state's sensor reading.
        fMap = zeros((len(self.env._obstypes) * 5, dim))
        for si, state in enumerate(self.states):
            fMap[:, si] = self.env.getSensors(state)
        if self.verbose:
            print('Built features.')
        return Ts, R, fMap
    return Ts, R
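# Both converters rely on a `flood` helper that enumerates every state
# reachable from an initial set. The helper itself is not shown here; the
# following is a minimal breadth-first sketch, assuming the propagation
# function maps a state to an iterable of successor states and that a
# non-None `pool` restricts the search to a known set of states (both call
# sites in this file pass None). The real helper's signature may differ.
from collections import deque


def flood(propagate, pool, initial):
    """Return the set of states reachable from `initial` via `propagate`."""
    seen = set(initial)
    frontier = deque(initial)
    while frontier:
        state = frontier.popleft()
        for neighbor in propagate(state):
            # When a pool is given, ignore states outside of it.
            if pool is not None and neighbor not in pool:
                continue
            if neighbor not in seen:
                seen.add(neighbor)
                frontier.append(neighbor)
    return seen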
def convert_task_to_mdp(self):
    # Finds all states, all the while logging transitions and rewards.
    self.states = sorted(
        flood(self.get_neighbors, None, [self.env.init_game_state]))
    state_dict = {
        state: state_i for state_i, state in enumerate(self.states)
    }
    # Reward function R(s'), indexed by the successor state.
    R = np.fromiter((self.rewards[state] for state in self.states),
                    dtype=np.double)
    # Transition matrix A x S x S; entries are set to 1, so the logged
    # dynamics are treated as deterministic.
    T = np.zeros((self.env.numActions, len(self.states), len(self.states)))
    for state, action_i, next_state in self.transitions:
        # Careful: states are actual states, but action_i is an index.
        T[action_i, state_dict[state], state_dict[next_state]] = 1
    return T, R
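# Either converter yields a complete tabular MDP, so standard dynamic
# programming can consume the result directly. The sketch below is not part
# of the original code: a minimal value iteration over the A x S x S tensor
# returned by convert_task_to_mdp (for the list of matrices returned by
# convert, np.asarray(Ts) gives the same shape). `gamma` and `tol` are
# illustrative parameters, and R is read on the successor state s',
# matching the R(s') convention above.
def value_iteration(T, R, gamma=0.9, tol=1e-8):
    """Compute optimal state values V for a tabular MDP.

    T has shape (A, S, S); R has shape (S,) and is defined on s'.
    """
    V = np.zeros(len(R))
    while True:
        # Q[a, s] = sum_{s'} T[a, s, s'] * (R[s'] + gamma * V[s'])
        Q = T @ (R + gamma * V)
        V_new = Q.max(axis=0)
        if np.max(np.abs(V_new - V)) < tol:
            return V_new
        V = V_new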