def _forwardImplementation(self, inbuf, outbuf):
    """ Draws a random number between 0 and 1. If the number is less than
    epsilon, the action is selected according to the Boltzmann exploration
    strategy. If it is equal or larger than epsilon, the greedy action is
    returned. """
    assert self.module
    if random.random() < self.epsilon:
        values = self.module.getActionValues(self._state)
        action = drawGibbs(values, self.tau)
        outbuf[:] = array([action])
    else:
        outbuf[:] = inbuf
def _forwardImplementation(self, inbuf, outbuf):
    """ Draws an action from the Boltzmann (Gibbs) distribution over the
    current action values, using the temperature tau, and then decays tau
    by the decay factor. """
    assert self.module
    values = self.module.getActionValues(self._state)
    action = drawGibbs(values, self.tau)
    self.tau *= self.decay
    outbuf[:] = array([action])
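# Both exploration policies above delegate the actual sampling to drawGibbs.
# In PyBrain that helper lives in pybrain.utilities (treated here as an
# assumption, since the imports are not shown): with temperature > 0 it draws
# an index from the softmax of values / temperature, and with temperature == 0
# it returns the argmax, breaking ties at random.  The sketch below is an
# illustrative stand-in with those assumed semantics, not the library code;
# the name gibbs_draw_sketch is hypothetical.
import numpy as np


def gibbs_draw_sketch(values, temperature):
    """Draw an index from a Gibbs/Boltzmann distribution over `values`."""
    values = np.asarray(values, dtype=float)
    if temperature == 0:
        # Greedy: pick uniformly among all indices attaining the maximum.
        best = np.flatnonzero(values == values.max())
        return int(np.random.choice(best))
    # Shift by the max before exponentiating for numerical stability.
    scaled = values / temperature
    probs = np.exp(scaled - scaled.max())
    probs /= probs.sum()
    return int(np.random.choice(len(values), p=probs))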
def _legalizeIt(self, a):
    """ Draw an index from an array of values, filtering out illegal moves. """
    if not min(a) >= 0:
        print(a)
        print(min(a))
        print(self.module.params)
        print(self.module.inputbuffer)
        print(self.module.outputbuffer)
        raise Exception('Non-positive value in array?')
    legals = self.game.getLegals(self.color)
    vals = ones(len(a)) * (-100) * (1 + self.temperature)
    for i in map(self._convertPosToIndex, legals):
        vals[i] = a[i]
    drawn = self._convertIndexToPos(drawGibbs(vals, self.temperature))
    assert drawn in legals
    return drawn
def _legalizeIt(self, a):
    """ Draw an index from an array of values, filtering out illegal moves. """
    if not min(a) >= 0:
        print a
        print min(a)
        print self.module.params
        print self.module.inputbuffer
        print self.module.outputbuffer
        raise Exception('Non-positive value in array?')
    legals = self.game.getLegals(self.color)
    vals = ones(len(a)) * (-100) * (1 + self.temperature)
    for i in map(self._convertPosToIndex, legals):
        vals[i] = a[i]
    drawn = self._convertIndexToPos(drawGibbs(vals, self.temperature))
    assert drawn in legals
    return drawn
def _legalizeIt(self, a):
    """ Draw an index from an array of values, filtering out illegal moves. """
    if not min(a) >= 0:
        #print a
        #print min(a)
        #print self.module.params
        #print self.module.inputbuffer
        #print self.module.outputbuffer
        raise Exception('Non-positive value in array?')
    legals = self.game.getLegals(self.colour)
    vals = ones(len(a)) * (-100) * (1 + self.temperature)
    for i in legals:
        vals[i] = a[i]
    drawn = drawGibbs(vals, self.temperature)
    assert drawn in legals
    return drawn
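# All three _legalizeIt variants above mask illegal moves by overwriting their
# scores with a large negative constant before the Gibbs draw, so the softmax
# assigns them effectively zero probability.  A small self-contained
# illustration of that masking idea follows; the raw values, legal-move
# indices and temperature are made-up placeholders, not values from the game
# code above.
import numpy as np

raw_values = np.array([0.2, 0.9, 0.4, 0.7])   # e.g. one network output per board cell
legal_indices = [0, 2]                         # indices of the currently legal moves
temperature = 0.5

masked = np.ones_like(raw_values) * (-100) * (1 + temperature)
for i in legal_indices:
    masked[i] = raw_values[i]

# Boltzmann probabilities over the masked scores: illegal entries end up with
# probability ~exp(-300), i.e. numerically negligible, so the drawn index is legal.
scaled = masked / temperature
probs = np.exp(scaled - scaled.max())
probs /= probs.sum()
drawn = int(np.random.choice(len(masked), p=probs))
assert drawn in legal_indices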
def performAction(self, action):
    """ Send the chosen action to the environment to be performed. """
    if type(action) == np.ndarray:
        # Take the max, or random among them if there are several equal maxima
        action = drawGibbs(action, temperature=0)
    self.steps += 1

    # Important to note that we immediately update beliefs after performing action
    # (maybe not needed but it would make credit-assignment harder)
    # self.current_location = self.env.performAction(action)

    # Get sensors and do belief updates before calling reward
    location_beliefs = self.env.sense(action)
    self.action_counts[self.env.current_location, action] += 1  # min(6, self.action_counts[self.env.current_location,action]+1)
    self.state_counts[self.env.current_location, self.state_ids[self.env.current_state]] += 1

    if location_beliefs.success:
        # print 'selected action: %s' % self.env.action_names[action]
        self.success = True
        self.update_beliefs(location_beliefs, action)
    else:
        self.success = False

    # predictions = np.array([np.argmax(obj.joint_prob) for obj in self.objects])
    # pred_correct = (predictions == self.env.objects)
    # self.percent_correct = pred_correct.sum() / float(pred_correct.size)
    # self.percent_correct /= (self.samples+1)
    self.percent_correct = np.array(
        [self.objects[idx].joint_prob[obj_id] for idx, obj_id in enumerate(self.env.objects)]
    ).sum() / len(self.objects)

    # Below two lines were cut and pasted from EpisodicTask.performAction()
    self.updateReward()
    self.addReward()
    self.samples += 1
def performAction(self, action):
    """ send chosen action to environment to be performed """
    if type(action) == np.ndarray:
        # Take the max, or random among them if there are several equal maxima
        action = drawGibbs(action, temperature=0)
    self.steps += 1

    #Important to note that we immediately update beliefs after performing action
    # (maybe not needed but it would make credit-assignment harder)
    #self.current_location = self.env.performAction(action)

    # Get sensors and do belief updates before calling reward
    location_beliefs = self.env.sense(action)
    self.action_counts[self.env.current_location,action] += 1 #min(6, self.action_counts[self.env.current_location,action]+1)
    self.state_counts[self.env.current_location,self.state_ids[self.env.current_state]] += 1

    if location_beliefs.success:
        #print 'selected action: %s' % self.env.action_names[action]
        self.success = True
        self.update_beliefs(location_beliefs, action)
    else:
        self.success = False

    #predictions = np.array([np.argmax(obj.joint_prob) for obj in self.objects])
    #pred_correct = (predictions == self.env.objects)
    #self.percent_correct = pred_correct.sum() / float(pred_correct.size)
    #self.percent_correct /= (self.samples+1)
    self.percent_correct = np.array([self.objects[idx].joint_prob[obj_id] for idx,obj_id in enumerate(self.env.objects)]).sum() / len(self.objects)

    # Below two lines were cut and pasted from EpisodicTask.performAction()
    self.updateReward()
    self.addReward()
    self.samples += 1
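# In both performAction variants, percent_correct averages, over all object
# locations, the probability mass that the current belief assigns to the true
# object identity.  A toy sketch of just that line follows; the belief arrays
# and ground-truth ids are made-up placeholders, not data from the task.
import numpy as np

joint_probs = [np.array([0.7, 0.2, 0.1]),   # belief over object classes at location 0
               np.array([0.1, 0.1, 0.8])]   # belief over object classes at location 1
true_objects = [0, 2]                        # ground-truth class id per location

percent_correct = np.array(
    [joint_probs[idx][obj_id] for idx, obj_id in enumerate(true_objects)]
).sum() / len(joint_probs)
print(percent_correct)  # (0.7 + 0.8) / 2 == 0.75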
for test_run in range(num_best_test_runs):
    #print "RUNNING EP", test_run
    # perform one episode with trained policy
    agent.newEpisode()
    task.reset()
    task.env.verbose = False
    rewards = []
    step_counter = 0
    steps = []
    #print 'executing learned policy...'
    while not task.isFinished():
        actionIdx = drawGibbs(agent.learner.wrappingEvaluable.activate(task.getObservation()),
                              temperature=0)
        task.performAction(actionIdx)
        rewards.append(task.getReward())
        for cur_loc in range(len(task.objects)):
            joint_probs_learned[test_run, cur_loc, step_counter] = task.objects[cur_loc].joint_alphas
            avg_prob_learned[test_run, cur_loc, step_counter] = (
                np.argmax(task.objects[cur_loc].joint_prob) == task.env.objects[cur_loc])
            #print 'l', test_run, cur_loc, step_counter, action_names[actionIdx], object_names[task.env.objects[cur_loc]], avg_prob_learned[test_run,cur_loc,step_counter], avg_prob_learned[test_run,cur_loc]
            #print 'l', test_run, cur_loc, step_counter, action_names[actionIdx], object_names[task.env.objects[cur_loc]], joint_probs_learned[test_run,cur_loc]
for test_run in range(num_best_test_runs):
    #print "RUNNING EP", test_run
    # perform one episode with trained policy
    agent.newEpisode()
    task.reset()
    task.env.verbose = False
    rewards = []
    step_counter = 0
    steps = []
    #print 'executing learned policy...'
    while not task.isFinished():
        actionIdx = drawGibbs(agent.learner.wrappingEvaluable.activate(task.getObservation()), temperature=0)
        task.performAction(actionIdx)
        rewards.append(task.getReward())
        for cur_loc in range(len(task.objects)):
            joint_probs_learned[test_run,cur_loc,step_counter] = task.objects[cur_loc].joint_alphas
            avg_prob_learned[test_run,cur_loc,step_counter] = (np.argmax(task.objects[cur_loc].joint_prob) == task.env.objects[cur_loc])
            #print 'l', test_run, cur_loc, step_counter, action_names[actionIdx], object_names[task.env.objects[cur_loc]], avg_prob_learned[test_run,cur_loc,step_counter], avg_prob_learned[test_run,cur_loc]
            #print 'l', test_run, cur_loc, step_counter, action_names[actionIdx], object_names[task.env.objects[cur_loc]], joint_probs_learned[test_run,cur_loc]
        steps.append(action_names[actionIdx])
        step_counter += 1
    learned_true.append(task.env.objects)
    learned_steps.append(steps)
    Ep_rewards.append(rewards)
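# The evaluation loop above stores one list of per-step rewards per test
# episode in Ep_rewards.  As a follow-up, one simple way to summarise the runs
# is the mean episode return; the aggregation below is a suggestion, not part
# of the original script, and reuses the Ep_rewards name from above.
import numpy as np

episode_returns = [np.sum(r) for r in Ep_rewards]
print('mean return over %d test runs: %.3f'
      % (len(episode_returns), float(np.mean(episode_returns))))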