def val_cal(self, state, action, gradient=False):
    """Compute the tile-coded value of a (state, action) pair.

    Two steps: (1) build the feature — the set of active tile indices for
    the scaled (state, action); (2) sum the weights at those indices.

    :param state: observation; state[0] and state[1] are scaled by the
        mountain-car position range (0.6 + 1.2) and velocity range
        (0.07 + 0.07) respectively
    :param action: int, discrete action id
    :param gradient: bool; if True, return the active tile indices
        (the feature / gradient support) instead of the value
    :return: list of tile indices if ``gradient`` else the summed weight value
    """
    # NOTE(review): the classic scaling adds the +1.2 / +0.07 offsets (see the
    # removed commented-out variants); the live code omits them. The IHT hash
    # makes a constant shift harmless as long as it is applied consistently
    # everywhere this IHT is used — confirm against the other callers.
    feature = tc.tiles(
        self.iht, NUM_OF_TILINGS,
        [8 * state[0] / (0.6 + 1.2), 8 * state[1] / (0.07 + 0.07)],
        [action])
    if gradient:
        return feature
    return sum(self.weights[feature])
def estimate(self, state, action):
    """Return the approximate action-value Q(state, action).

    The inputs are validated/unpacked by ``_test_input``, scaled into tile
    space, and the value is the sum of the weights of the active tiles.
    """
    pos, vel, action = self._test_input(state, action)
    indices = tile_coding.tiles(
        self._hashtable,
        self._num_of_tillings,
        [self._pos_scale * pos, self._vel_scale * vel],
        [action],
    )
    return np.sum(self._weights[indices])
def __init__(self):
    """Set up the Dyna-Q learner: hyper-parameters, tiling, Q table, model."""
    self.actionSet = action.ActionSet
    self.dynaQUpdates = 50   # planning updates per real step
    self.alpha = 1           # learning rate (previously 0.5)
    self.gamma = 0.9         # future-reward discount

    # Tile-coding setup based on a standard random seed.
    random.seed()
    self.numTilings = 3      # previously 8
    self.numDimensions = 2   # previously 6
    maxVal = 1000
    self.iht = tc.IHT(maxVal)
    # Prime the hash table with the all-zero state so the randomness of the
    # hashing is standardized across experiments.
    tc.tiles(self.iht, self.numTilings, [0 for _ in range(self.numDimensions)])

    self.history = History()
    # State-action values are keyed by the tiling indices; a dense array over
    # the full joint tiling would be enormous (a full Q array was measured at
    # roughly 2.4 GB), so dictionary-backed structures are used instead.
    self.Q = self.setupQ(maxVal)
    self.model = self.setupModel(maxVal)
    self.visitedStates = set()
    self.normFirst = 0
    self.dynaFirst = 0
def __call__(self, s):
    """Return the estimated state value V_hat(s).

    Looks up the active tile indices for ``s`` and sums the corresponding
    entries of the weight vector.
    """
    active_tiles = tiles(self.w.shape[0], self.num_tilings, s)
    return np.sum(self.w[active_tiles])
def get_feature(state, action):
    """Return a one-hot feature vector of length ``feat_n`` for (state, action).

    The state is scaled into tile space, the active tile indices are looked
    up, and the corresponding entries of the feature vector are set to 1.
    """
    position, velocity = state
    pos_scale = tile_n / (max_position - min_position)
    vel_scale = tile_n / (max_velocity - min_velocity)
    active = tiles(iht, tile_n,
                   [pos_scale * position, vel_scale * velocity],
                   [action])
    feature = [0] * feat_n
    for idx in active:
        feature[idx] = 1
    return feature
def update(self, state, action, target):
    """Move the weights of the active tiles toward ``target``.

    Semi-gradient update: each active tile's weight receives the same share
    of the step-size-scaled TD error.
    """
    pos, vel, action = self._test_input(state, action)
    # Internal invariant: never update a terminal state (pos >= 0.5).
    assert pos < 0.5
    indices = tile_coding.tiles(
        self._hashtable,
        self._num_of_tillings,
        [self._pos_scale * pos, self._vel_scale * vel],
        [action],
    )
    prediction = np.sum(self._weights[indices])
    correction = self._step_size * (target - prediction)
    for idx in indices:
        self._weights[idx] += correction
def getActiveTiles(self, d_state, s_action, agent):
    '''
    get indices of active tiles for given state and action

    :param d_state: dictionary. the intern state representation of an agent
    :param s_action: string. a valid action
    :param agent: Agent object. the agent using the value function
    '''
    action = agent.d_translate_to_valuefun[s_action]
    scaled_values = []
    for s_key in agent.features_names:
        bounds = self.d_normalizers[s_key]
        raw = d_state[s_key]
        # Shift by the feature minimum, flooring at zero.
        value = max(0., raw - bounds['MIN'])
        # NOTE(review): assuming the MIN subtraction belongs to the clamp
        # branch, i.e. out-of-range values are clamped to MAX - MIN — confirm.
        if raw > bounds['MAX']:
            value = bounds['MAX'] - bounds['MIN']
        scaled_values.append(value * self.featuresScale[s_key])
    return tile_coding.tiles(self.hashTable, self.numOfTilings,
                             scaled_values, [action])
def getActiveTiles(self, d_state, s_action, agent):
    '''
    get indices of active tiles for given state and action

    :param d_state: dictionary. the intern state representation of an agent
    :param s_action: string. a valid action
    :param agent: Agent object. the agent using the value function
    '''
    action = agent.d_translate_to_valuefun[s_action]
    normalized = []
    for s_key in agent.features_names:
        limits = self.d_normalizers[s_key]
        observed = d_state[s_key]
        # Offset by the feature minimum; never below zero.
        feature = max(0., observed - limits['MIN'])
        # NOTE(review): interpreting the original as clamping overshoot to
        # the normalized range top (MAX - MIN) — confirm with callers.
        if observed > limits['MAX']:
            feature = limits['MAX'] - limits['MIN']
        normalized.append(feature * self.featuresScale[s_key])
    return tile_coding.tiles(self.hashTable, self.numOfTilings,
                             normalized, [action])
def getTile(self, state):
    """Map a state vector to its active tile indices.

    Returns None (after printing an error) when the state does not match the
    configured dimensionality — callers may rely on the None sentinel.
    """
    if len(state) != self.numDimensions:
        print("ERROR: unexpected state size for tiling")
        return None
    return tc.tiles(self.iht, self.numTilings, state)
def get_tile(x, y, action=None):
    """Return the active tile indices for a scaled (x, y[, action]) point.

    :param x: position-like coordinate, scaled by the position range
    :param y: velocity-like coordinate, scaled by the velocity range
    :param action: optional list of ints appended to the tiling key;
        defaults to no action component
    :return: list of active tile indices from the shared ``iht``
    """
    # Fix: the original used a mutable default argument (``action=[]``),
    # a shared-list pitfall; a None sentinel is the safe equivalent.
    if action is None:
        action = []
    return tiles(iht, numTilings,
                 [numTilings * x / (position_max - position_min),
                  numTilings * y / (velocity_max - velocity_min)],
                 action)
def tile_code(S, A, iht=IHT(4096), num_tilings=8):
    """Return the active tile indices for mountain-car state ``S`` and action ``A``.

    Position is scaled by the range (0.5 + 1.2) and velocity by (0.07 + 0.07).

    NOTE(review): the default ``iht=IHT(4096)`` is evaluated once at import
    time and shared across calls — presumably intentional so indices stay
    consistent between calls; confirm before changing.
    """
    position, velocity = S
    scaled_state = [num_tilings * position / (0.5 + 1.2),
                    num_tilings * velocity / (0.07 + 0.07)]
    return tiles(iht, num_tilings, scaled_state, [A])
def s2f(self, s):
    """Return the active-tile feature indices for state ``s``."""
    return tiles(self.w.shape[0], self.num_tilings, s)