Example #1
 def _forwardImplementation(self, inbuf, outbuf):
     """ Draws a random number between 0 and 1. If the number is less
         than epsilon, the action is selected according to the boltzmann exploration strategy. If it is equal or
         larger than epsilon, the greedy action is returned.
     """
     assert self.module
     if random.random() < self.epsilon:
         values = self.module.getActionValues(self._state)
         action = drawGibbs(values, self.tau)
         outbuf[:] = array([action])
     else:
         outbuf[:] = inbuf
 def _forwardImplementation(self, inbuf, outbuf):
     """ Draws a random number between 0 and 1. If the number is less
         than epsilon, a random action is chosen. If it is equal or
         larger than epsilon, the greedy action is returned.
     """
     assert self.module 
     
     values = self.module.getActionValues(self._state)
     action = drawGibbs(values, self.tau)
     
     self.tau *= self.decay
     
     outbuf[:] = array([action])
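The explorer snippets above delegate the actual sampling to drawGibbs, a helper these snippets presumably import from pybrain.utilities. Its assumed behaviour: draw an index with probability proportional to exp(value / temperature), and at temperature 0 fall back to a greedy pick with random tie-breaking. A minimal NumPy sketch of that assumed behaviour (not the library implementation) follows:

import numpy as np

def draw_gibbs(values, temperature):
    """Sketch: sample an index with probability proportional to
    exp(value / temperature); at temperature 0, return the index of the
    maximum, breaking ties at random."""
    values = np.asarray(values, dtype=float)
    if temperature == 0:
        best = np.flatnonzero(values == values.max())
        return int(np.random.choice(best))
    # subtract the max before exponentiating, for numerical stability
    scaled = (values - values.max()) / temperature
    probs = np.exp(scaled)
    probs /= probs.sum()
    return int(np.random.choice(len(values), p=probs))

The epsilon check and the tau decay in the snippets above then only control how often, and how sharply, this draw is applied.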
Example #4
 def _legalizeIt(self, a):
     """ draw index from an array of values, filtering out illegal moves. """
     if not min(a) >= 0:
         print(a)
         print(min(a))
         print(self.module.params)
         print(self.module.inputbuffer)
         print(self.module.outputbuffer)
         raise Exception('Non-positive value in array?')
     legals = self.game.getLegals(self.color)
     vals = ones(len(a)) * (-100) * (1 + self.temperature)
     for i in map(self._convertPosToIndex, legals):
         vals[i] = a[i]
     drawn = self._convertIndexToPos(drawGibbs(vals, self.temperature))
     assert drawn in legals
     return drawn
 def _legalizeIt(self, a):
     """ draw index from an array of values, filtering out illegal moves. """
     if not min(a) >= 0:
         print a
         print min(a)
         print self.module.params
         print self.module.inputbuffer
         print self.module.outputbuffer
         raise Exception('Non-positive value in array?')
     legals = self.game.getLegals(self.color)
     vals = ones(len(a))*(-100)*(1+self.temperature)
     for i in map(self._convertPosToIndex, legals):
         vals[i] = a[i]
     drawn = self._convertIndexToPos(drawGibbs(vals, self.temperature))
     assert drawn in legals
     return drawn
 def _legalizeIt(self, a):
     """ draw index from an array of values, filtering out illegal moves. """
     if not min(a) >= 0:
         #print a
         #print min(a)
         #print self.module.params
         #print self.module.inputbuffer
         #print self.module.outputbuffer
         raise Exception('Non-positive value in array?')
     legals = self.game.getLegals(self.colour)
     vals = ones(len(a)) * (-100) * (1 + self.temperature)
     for i in legals:
         vals[i] = a[i]
     drawn = drawGibbs(vals, self.temperature)
     assert drawn in legals
     return drawn
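The three _legalizeIt variants share one masking pattern: every entry that corresponds to an illegal move is overwritten with a large negative constant, so the subsequent Gibbs draw gives it essentially zero probability and the assertion on the drawn move holds. A small self-contained illustration of that pattern; the values, legal indices, and temperature below are made up for the example:

import numpy as np

temperature = 0.5
values = np.array([0.2, 0.9, 0.4, 0.7])   # hypothetical action values
legal = [0, 2]                            # hypothetical legal move indices

# mask illegal moves with a large negative constant, as _legalizeIt does
masked = np.ones(len(values)) * (-100) * (1 + temperature)
for i in legal:
    masked[i] = values[i]

# Gibbs draw over the masked values (same idea as the draw_gibbs sketch above)
probs = np.exp((masked - masked.max()) / temperature)
probs /= probs.sum()
move = int(np.random.choice(len(masked), p=probs))
assert move in legal   # illegal entries carry effectively zero probability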
Example #7
    def performAction(self, action):
        """
        send chosen task to environment to be performed
        """
        if type(action) == np.ndarray:
            # Take the max, or random among them if there are several equal maxima
            action = drawGibbs(action, temperature=0)

        self.steps += 1
        # Important to note that we immediately update beliefs after performing action
        # (maybe not needed but it would make credit-assignment harder)
        # self.current_location = self.env.performAction(action)

        # Get sensors and do belief updates before calling reward
        location_beliefs = self.env.sense(action)

        self.action_counts[
            self.env.current_location, action
        ] += 1  # min(6, self.action_counts[self.env.current_location,action]+1)
        self.state_counts[self.env.current_location, self.state_ids[self.env.current_state]] += 1

        if location_beliefs.success:
            # print 'selected action: %s' % self.env.action_names[action]

            self.success = True
            self.update_beliefs(location_beliefs, action)
        else:
            self.success = False

        # predictions = np.array([np.argmax(obj.joint_prob) for obj in self.objects])
        # pred_correct = (predictions == self.env.objects)
        # self.percent_correct = pred_correct.sum() / float(pred_correct.size)
        # self.percent_correct /= (self.samples+1)
        self.percent_correct = np.array(
            [self.objects[idx].joint_prob[obj_id] for idx, obj_id in enumerate(self.env.objects)]
        ).sum() / len(self.objects)

        # Below two lines were cut and pasted from EpisodicTask.performAction()
        self.updateReward()
        self.addReward()
        self.samples += 1
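With temperature=0, the drawGibbs call above degenerates to greedy action selection: it is assumed to return the index of the largest activation and to break exact ties at random, which is why a raw network output vector can be passed in as action. A tiny illustration of that assumed behaviour (the activations are made up):

import numpy as np

activations = np.array([0.1, 0.8, 0.8, 0.3])   # hypothetical network output with a tie

# greedy selection with random tie-breaking, matching the assumed
# temperature=0 behaviour of drawGibbs
best = np.flatnonzero(activations == activations.max())
action = int(np.random.choice(best))           # yields 1 or 2, chosen at random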
Example #8
 def performAction(self, action):
     """
     send chosen task to environment to be performed
     """
     if type(action) == np.ndarray:
         # Take the max, or random among them if there are several equal maxima
         action = drawGibbs(action, temperature=0)
         
     self.steps += 1
     #Important to note that we immediately update beliefs after performing action
     # (maybe not needed but it would make credit-assignment harder)
     #self.current_location = self.env.performAction(action)
     
     # Get sensors and do belief updates before calling reward
     location_beliefs = self.env.sense(action)
     
     self.action_counts[self.env.current_location,action] += 1 #min(6, self.action_counts[self.env.current_location,action]+1)
     self.state_counts[self.env.current_location,self.state_ids[self.env.current_state]] += 1
     
     if location_beliefs.success:
         #print 'selected action: %s' % self.env.action_names[action]
         
         self.success = True
         self.update_beliefs(location_beliefs, action)
     else:
         self.success = False
         
     #predictions = np.array([np.argmax(obj.joint_prob) for obj in self.objects])
     #pred_correct = (predictions == self.env.objects)
     #self.percent_correct = pred_correct.sum() / float(pred_correct.size)
     #self.percent_correct /= (self.samples+1)
     self.percent_correct = np.array([self.objects[idx].joint_prob[obj_id] for idx,obj_id in enumerate(self.env.objects)]).sum() / len(self.objects)
     
     # Below two lines were cut and pasted from EpisodicTask.performAction()
     self.updateReward()
     self.addReward()
     self.samples += 1
 for test_run in range(num_best_test_runs):
     #print "RUNNING EP", test_run
     
     # perform one episode with trained policy
     agent.newEpisode()
     task.reset()
     task.env.verbose = False
     rewards = []
     step_counter = 0
     steps = []
     
     #print 'executing learned policy...'
     
     while not task.isFinished():
         actionIdx = drawGibbs(agent.learner.wrappingEvaluable.activate(task.getObservation()), temperature=0)
         task.performAction(actionIdx)
         rewards.append(task.getReward())
         
         for cur_loc in range(len(task.objects)):
             joint_probs_learned[test_run,cur_loc,step_counter] = task.objects[cur_loc].joint_alphas
             avg_prob_learned[test_run,cur_loc,step_counter] = (np.argmax(task.objects[cur_loc].joint_prob) == task.env.objects[cur_loc])
             #print 'l', test_run, cur_loc, step_counter, action_names[actionIdx], object_names[task.env.objects[cur_loc]], avg_prob_learned[test_run,cur_loc,step_counter], avg_prob_learned[test_run,cur_loc]
             #print 'l', test_run, cur_loc, step_counter, action_names[actionIdx], object_names[task.env.objects[cur_loc]], joint_probs_learned[test_run,cur_loc]
             
         steps.append(action_names[actionIdx])
         step_counter += 1
         
     learned_true.append(task.env.objects)
     learned_steps.append(steps)
     Ep_rewards.append(rewards)