def explore_and_exploit(self, ant):
    '''
    Update weights and decide whether to explore or exploit here.
    Where all the magic happens.
    YOUR CODE HERE
    '''
    actions = self.world.get_passable_directions(ant.location, AIM.keys())
    random.shuffle(actions)
    if len(actions) == 0:
        return 'halt'

    # if we have a newborn baby ant, init its rewards and quality fcns
    if 'prev_value' not in ant.__dict__:
        ant.prev_value = 0
        ant.previous_reward_events = RewardEvents()
        ant.prev_features = self.features.extract(self.world, self.state, ant.location, actions[0])
        return actions[0]

    # step 1, update Q(s,a) based on going from the last state, taking
    # the action issued last round, and arriving in the current state
    R = self.get_reward(ant.previous_reward_events)

    # step size. it's good to make this inversely proportional to the
    # number of features, so you don't bounce out of the bowl we're trying
    # to descend via gradient descent
    alpha = 0.00001

    # near-zero discount: totally greedy, future rewards count for almost nothing
    discount = 0.00001

    # compute max_a' Q(s',a') and argmax_a' Q(s',a'), where we are now in
    # state s' and the previous state was s, using self.value(...)
    max_next_action = actions[0]
    max_next_value = self.value(self.state, ant.location, actions[0])
    for action in actions:
        value = self.value(self.state, ant.location, action)
        if value > max_next_value:
            max_next_value = value
            max_next_action = action

    # now that we have all the quantities needed, adjust the weights
    self.update_weights(alpha, discount, R, max_next_value, ant.prev_value, ant.prev_features)

    # step 2, explore or exploit? explore with probability 0.7 / self.ngames,
    # so exploration decays as more games are played
    explore = 0.7 / self.ngames
    decision = random.random()
    if explore >= decision:
        return actions[0]
    else:
        return max_next_action
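# A minimal sketch of what update_weights presumably does: the standard
# temporal-difference update for a linearly-approximated Q function,
#   w_i <- w_i + alpha * (R + discount * max_a' Q(s',a') - Q(s,a)) * f_i(s,a).
# This assumes self.weights and prev_features are parallel lists of floats;
# the actual representation in the starter code may differ.
def update_weights(self, alpha, discount, R, max_next_value, prev_value, prev_features):
    # TD error: how far the previous estimate Q(s,a) was from the
    # one-step lookahead target R + discount * max_a' Q(s',a')
    delta = R + discount * max_next_value - prev_value
    # for a linear approximator, the gradient of Q with respect to w_i is
    # just the feature value, so each weight moves by alpha * delta * f_i
    for i in range(len(self.weights)):
        self.weights[i] += alpha * delta * prev_features[i]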
def explore_and_exploit(self, ant):
    '''
    Update weights and decide whether to explore or exploit here.
    Where all the magic happens.
    YOUR CODE HERE
    '''
    actions = self.world.get_passable_directions(ant.location, AIM.keys())
    random.shuffle(actions)
    if len(actions) == 0:
        return 'halt'

    # if we have a newborn baby ant, init its rewards and quality fcns
    if 'prev_value' not in ant.__dict__:
        ant.prev_value = 0
        ant.previous_reward_events = RewardEvents()
        ant.prev_features = self.features.extract(self.world, self.state, ant.location, actions[0])
        return actions[0]

    # step 1, update Q(s,a) based on going from the last state, taking
    # the action issued last round, and arriving in the current state
    R = self.get_reward(ant.previous_reward_events)

    # step size. keep it inversely proportional to the number of features,
    # so gradient descent doesn't bounce out of the bowl we're descending
    alpha = 0.01 / len(self.weights)

    # full discount: future rewards count as much as immediate ones
    discount = 1.0

    # max_a' Q(s',a') and argmax_a' Q(s',a'), where we are now in state s'
    # and the previous state was s
    (max_next_value, max_next_action) = max_by(actions, lambda x: self.value(self.state, ant.location, x))

    # now that we have all the quantities needed, adjust the weights
    self.update_weights(alpha, discount, R, max_next_value, ant.prev_value, ant.prev_features)

    # step 2, explore or exploit? always explore for the first explore_start
    # games, then linearly anneal the exploration probability down to zero
    # by explore_stop games
    if self.ngames < explore_start:
        decide_to_explore = True
    elif self.ngames < explore_stop:
        p = 1.0 * (explore_stop - self.ngames) / (explore_stop - explore_start)
        decide_to_explore = random.random() < p
    else:
        decide_to_explore = False

    if decide_to_explore:
        return actions[0]
    else:
        return max_next_action
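# max_by, explore_start, and explore_stop are not defined in the snippet
# above. The sketch below shows one plausible implementation of each,
# assuming max_by returns a (best_value, best_item) pair and the two
# constants are module-level game counts bounding the annealing window;
# the concrete values are illustrative only.
explore_start = 10   # always explore for the first 10 games (assumed value)
explore_stop = 100   # stop exploring entirely after 100 games (assumed value)

def max_by(items, key):
    '''Return (best_value, best_item) for the item that maximizes key(item).'''
    best_item = None
    best_value = float('-inf')
    for item in items:
        value = key(item)
        if value > best_value:
            best_value = value
            best_item = item
    return (best_value, best_item)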
def get_direction(self, ant):
    '''Finds a direction for this ant to move in according to the
    food, enemy, exploration routine.'''
    # Build a list of directions: towards food, then towards the nearest
    # enemy, then a random fallback ordering.
    rand_dirs = AIM.keys()
    random.shuffle(rand_dirs)
    dirs = (ant.toward(ant.closest_food()) +
            ant.toward(ant.closest_enemy()) +
            rand_dirs)
    # Return the first passable direction from that combined list.
    d = ant.get_passable_direction(dirs)
    return d
def get_successors(self, loc):
    '''
    Returns a list of valid next reachable locations from the input LOC.
    All derived classes should use this function, otherwise testing your
    implementation might fail.
    '''
    alldirs = AIM.keys()
    s = []
    for d in alldirs:
        l = self.world.next_position(loc, d)
        if self.world.passable(l):
            s.append(l)
    return s
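# A small illustration (not from the original code) of how a derived search
# class might rely on get_successors, here as a breadth-first search from
# start to goal. The method name bfs_path and its return convention are
# hypothetical.
from collections import deque

def bfs_path(self, start, goal):
    '''Return a list of locations from start to goal, or None if unreachable.'''
    frontier = deque([start])
    came_from = {start: None}
    while frontier:
        loc = frontier.popleft()
        if loc == goal:
            # walk the parent pointers back to reconstruct the path
            path = []
            while loc is not None:
                path.append(loc)
                loc = came_from[loc]
            path.reverse()
            return path
        # expand only through the shared successor function, as the
        # docstring above requires
        for nxt in self.get_successors(loc):
            if nxt not in came_from:
                came_from[nxt] = loc
                frontier.append(nxt)
    return None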
def get_direction(self, ant):
    """Evaluates each of the currently passable directions and picks the
    one with maximum value."""
    # get the passable directions, in random order to break ties
    rand_dirs = self.world.get_passable_directions(ant.location, AIM.keys())
    random.shuffle(rand_dirs)

    # evaluate the value function for each passable direction and keep the best
    max_value = float("-inf")
    max_dir = None
    for direction in rand_dirs:
        value = self.value(self.state, ant.location, direction)
        if value > max_value:
            max_value = value
            max_dir = direction

    # take the direction with maximum value
    self.world.L.info("Chose: %s, value: %.2f" % (max_dir, max_value))
    return max_dir
def get_direction(self, state, ant):
    '''Returns the ant's least-visited adjacent location, prioritizing by
    food direction when multiple adjacent locations are equally explored.'''
    # Of the 4 possible squares to move to, determine which don't currently
    # contain an ant and are the least-visited.
    min_visits = float('Inf')
    min_visits_directions = []
    for direction in AIM.keys():
        test_position = ant.world.next_position(ant.location, direction)

        # Ignore water.
        if not ant.world.passable(test_position):
            continue

        # Don't move to a currently occupied location;
        # this helps somewhat mitigate collisions.
        if ant.world.ant_lookup[test_position] != -1:
            continue

        # Check how frequently this candidate location has been visited
        # in the past.
        num_visits = state[test_position] if test_position in state else 1
        if num_visits < min_visits:
            min_visits = num_visits
            min_visits_directions = [direction]
        elif num_visits == min_visits:
            min_visits_directions.append(direction)

    if not min_visits_directions:
        # Only reached if the ant is boxed in on all sides by water or other ants.
        return None
    elif len(min_visits_directions) > 1:
        # Try to break ties by considering food direction.
        food_directions = ant.toward(ant.closest_food())
        for fd in food_directions:
            if fd in min_visits_directions:
                return fd
    return min_visits_directions[0]
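# The visit counts read above via state[test_position] have to be maintained
# somewhere by the caller. A minimal sketch of that bookkeeping, assuming
# state is a plain dict mapping location -> number of times an ant has
# occupied it (the real turn loop may differ); record_visit is a hypothetical
# helper name.
def record_visit(state, location):
    '''Increment the visit count for LOCATION, starting from the default of 1.'''
    state[location] = state.get(location, 1) + 1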