def iter(cls, model, Q):
        V = util.classes.NumMap()
        # Compute V(s) = max_{a} Q(s,a)
        for s in model.S():
            V_s = util.classes.NumMap()
            for a in model.A(s):
                V_s[a] = Q[ (s,a) ]
            if len(V_s) > 0:
                V[s] = V_s.max()
            else:
                V[s] = 0.0
        
        # QQ(s,a) = R(s,a) + gamma*sum_{s'} T(s,a,s')*V(s') 
        QQ = util.classes.NumMap()
        for s in model.S():
            for a in model.A(s):
                value = model.R(s,a)
                T = model.T(s,a)
                value += sum( [model.gamma*t*V[s_prime] for (s_prime,t) in  T.items()] )
                QQ[ (s,a) ] = value

        # To recover the log policy, take the maximum Q-value at each state and
        # build a new Q with each (s,a) = Q(s,a) - max_a' Q(s,a').
        return QQ
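# A minimal standalone sketch of the backup above using plain dicts instead of
# util.classes.NumMap; the two-state MDP, rewards, and transition probabilities
# here are made-up toy values, not part of the original code.
def bellman_backup_sketch(gamma=0.9):
    states = ['s0', 's1']
    actions = ['left', 'right']
    R = {(s, a): (1.0 if s == 's1' else 0.0) for s in states for a in actions}
    T = {(s, a): {'s0': 0.5, 's1': 0.5} for s in states for a in actions}
    Q = {(s, a): 0.0 for s in states for a in actions}
    # V(s) = max_a Q(s, a)
    V = {s: max(Q[(s, a)] for a in actions) for s in states}
    # QQ(s, a) = R(s, a) + gamma * sum_{s'} T(s, a, s') * V(s')
    return {(s, a): R[(s, a)] + gamma * sum(p * V[sp] for sp, p in T[(s, a)].items())
            for s in states for a in actions}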
def QValueSoftMaxSolve(model, thresh = 1):
    '''Soft (log-sum-exp) value iteration: returns a map (state, action) => log pi(a|s).'''
    v = util.classes.NumMap()
    for s in model.S():
        v[s] = 0.0

    diff = 100.0
    
    while diff >= thresh:
        vp = v    # previous value estimate; v is rebound to a fresh NumMap below
        
        Q = util.classes.NumMap()
        for s in model.S():
            for a in model.A(s):
                value = model.R(s,a)
                T = model.T(s,a)
                value += sum( [model.gamma*t*v[s_prime] for (s_prime,t) in  T.items()] )
                Q[ (s,a) ] = value            
        
        v = util.classes.NumMap()

        # Soft state-value backup: v[s] = log sum_a exp(Q[(s,a)]), computed
        # stably with the log-sum-exp trick (subtract the per-state max).
        for s in model.S():
            maxx = None
            for a in model.A(s):
                if (maxx is None) or Q[(s,a)] > maxx:
                    maxx = Q[(s,a)]

            e_sum = 0
            for a in model.A(s):
                e_sum += math.exp(Q[(s,a)] - maxx)

            v[s] = maxx + math.log(e_sum)
        
        diff = max(abs(value - vp[s]) for (s, value) in v.iteritems())
        
        
    # log pi(a|s) = Q(s,a) - v(s), since v(s) = log sum_a exp(Q(s,a))
    logp = util.classes.NumMap()
    for (sa, value) in Q.iteritems():
        logp[sa] = value - v[sa[0]]
    return logp
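# The state-value update above is the log-sum-exp trick: v(s) = log sum_a exp(Q(s,a)),
# computed stably as max_a Q(s,a) + log sum_a exp(Q(s,a) - max).  A quick sketch
# checking the identity on made-up Q-values (toy numbers, not from the original code):
import math

def logsumexp_sketch():
    q_values = [3.2, 1.0, 2.7]   # hypothetical Q(s, a) values for a single state
    naive = math.log(sum(math.exp(q) for q in q_values))
    m = max(q_values)
    stable = m + math.log(sum(math.exp(q - m) for q in q_values))
    assert abs(naive - stable) < 1e-9
    return stable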
    def solve(self, model, true_samples, s_r_prior, gridsize, max_states):
        '''
        Returns a policy (agent) that attempts to generalize from the behavior
        observed in true_samples, found by a Metropolis-style walk over Bayesian
        patrol rewards.

        model: an MDP with a linear reward function.  Parameters WILL be overwritten.
        true_samples: a list of sample trajectories [ (s_t,a_t) ] of the (supposedly)
            optimal policy.
        s_r_prior: prior over (policy, reward) pairs, used in the acceptance ratio.
        gridsize: grid resolution passed to BayesianPatrolReward.
        max_states: only states with location[0] in [0, max_states) are checked
            against the demonstrated actions.
        '''
        # Initial weight vector
#        w_0 = model.feature_function.params

        R = patrol.rewardbayesian.BayesianPatrolReward(len(model.S())*len(model.A()), gridsize)
        
        model.reward_function = R
        
        pi = self._solver.solve(model)
        
        for i in range(self._max_iter):
            R_tilde = R.randNeighbor()
            
            model.reward_function = R_tilde
            
            Q_pi = self._q_value_solver.solve(model)
            
            found_worse = False
            for history in true_samples:
                for (s, a) in history:
                    
                    if s.location[0] >= 0 and s.location[0] < max_states and Q_pi[(s, pi.actions(s).keys()[0])] < Q_pi[(s, a)]:
#                        print(a, Q_pi[(s, a)], pi.actions(s).keys()[0], Q_pi[(s, pi.actions(s).keys()[0])])
                        found_worse = True
                        break
                
            if found_worse:
                pi_tilde = self._solver.solve(model)
                
                chance = min(1, s_r_prior.prior(pi_tilde, R_tilde) / s_r_prior.prior(pi, R))

                if random.random() < chance:
                    pi = pi_tilde
                    R = R_tilde
            else:
                chance = min(1, s_r_prior.prior(pi, R_tilde) / s_r_prior.prior(pi, R))
                if random.random() < chance:
                    R = R_tilde
                
        model.reward_function = R
        return pi
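# The accept/reject logic above is a Metropolis-style update: a proposed neighboring
# reward R_tilde is accepted with probability min(1, prior(proposed) / prior(current)).
# A minimal sketch of just that acceptance rule, with scalar prior scores standing in
# for s_r_prior.prior(...) (hypothetical values, not the original prior):
import random

def metropolis_accept_sketch(prior_current, prior_proposed):
    chance = min(1.0, float(prior_proposed) / float(prior_current))
    return random.random() < chance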
 def maxEntObjGradient(self, w, model, initial, mu_E, true_samples_len, sa_freq):
     '''Gradient of the MaxEnt IRL objective: returns -(mu_E - mu), the negated
     difference between expert and current-policy feature expectations.'''
     if (self.Q_value is None):
         model.reward_function.params = w
         agent = self._solver.solve(model)  # shouldn't be doing this!
         
     else:
         # Greedy policy with respect to the cached Q-values
         policy = {}
         for s in model.S():
             actions = util.classes.NumMap()
             for a in model.A(s):
                 actions[a] = self.Q_value[ (s,a) ]
             policy[s] = actions.argmax()
         agent = mdp.agent.MapAgent(policy)
         
     samples = self.generate_samples(model, agent, initial, true_samples_len)
     _mu = self.feature_expectations(model, samples)
                 
     print(w, mu_E - _mu)
     return -( mu_E - _mu)
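# The gradient above is -(mu_E - mu): the negated gap between expert and learner
# feature expectations, in the form a numerical minimizer expects.  A minimal sketch
# of one plain gradient step on the reward weights (toy vectors and a hypothetical
# learning rate, not part of the original optimizer):
import numpy

def maxent_weight_step_sketch(w, mu_E, mu, lr=0.1):
    grad = -(mu_E - mu)          # what maxEntObjGradient returns
    return w - lr * grad         # step against the gradient when minimizing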
 def solve(self, model):
     '''Returns a map of (state, action) => q-value determined by this solver'''
     Q = util.classes.NumMap()
     for i in range(self._max_iter):
         Q = self.iter(model, Q)

     returnQ = util.classes.NumMap()
     
     V = util.classes.NumMap()
     # Compute V(s) = max_{a} Q(s,a)
     for s in model.S():
         V_s = util.classes.NumMap()
         for a in model.A(s):
             V_s[a] = Q[ (s,a) ]
         if len(V_s) > 0:
             V[s] = V_s.max()
         else:
             V[s] = 0.0
     
     # Shift each Q(s,a) by V(s) = max_a Q(s,a) so the greedy action has value 0
     for (sa, value) in Q.iteritems():
         returnQ[sa] = value - V[sa[0]]
     
     return returnQ
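# Subtracting V(s) = max_a Q(s,a) from every Q(s,a), as done above, yields
# advantage-style values: 0 for the greedy action and negative otherwise.  A tiny
# sketch on plain dicts (toy Q-values, not from the original code):
def advantage_sketch():
    Q = {('s0', 'left'): 1.5, ('s0', 'right'): 2.0}
    V = {'s0': max(Q[('s0', 'left')], Q[('s0', 'right')])}
    # -> {('s0', 'left'): -0.5, ('s0', 'right'): 0.0}
    return {sa: q - V[sa[0]] for sa, q in Q.items()}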
    def solve(self, model, initial, true_samples, other_policy):
        '''
        Returns a pair (agent, weights) where the agent attempts to generalize
        from the behavior observed in true_samples and weights is what was
        combined with the MDP to generate agent.

        model: an MDP with a linear reward function.  Parameters WILL be overwritten.
        initial: initial distribution over states
        true_samples: a list of sample trajectories [ (s_t,a_t) ] of the (supposedly)
            optimal policy.
        other_policy: stored on self for use elsewhere in the solver.
        '''
        # Initial weight vector
#        w_0 = model.feature_function.params
        self.other_policy = other_policy
        self.full_initial = util.classes.NumMap()
        for s in model.S():
            self.full_initial[s] = 1.0
        self.full_initial = self.full_initial.normalize()
                
        # Compute feature expectations of agent = mu_E from samples
        mu_E = self.feature_expectations2(model, true_samples)
        print("True Samples", mu_E)
        # Pick random policy pi^(0)
        agent = mdp.agent.RandomAgent( model.A() )
        
        # Calculate feature expectations of pi^(0) = mu^(0)
        samples = self.generate_samples(model, agent, initial, len(true_samples[0]))
        mu = self.feature_expectations(model, samples )

#        mu = self.feature_expectations2(model, initial, agent )
        lastT = 0

        for i in range(self._max_iter):
            # Perform projections to new weights w^(i)
            if i == 0:
                mu_bar = mu
            else:
                mmmb = mu - mu_bar
                mu_bar = mu_bar + numpy.dot( mmmb, mu_E-mu_bar )/numpy.dot( mmmb,mmmb )*mmmb
            w = mu_E - mu_bar
            t = numpy.linalg.norm(mu_E - mu_bar)
            w[0] = abs(w[0])
            print(w)
            model.reward_function.params = w
            
            print 'IRLApproxSolver Iteration #{},t = {:4.4f}'.format(i,t)
            if t < self._epsilon:
                break
            if abs(t - lastT) < .000001:
                break

            lastT = t
            
            # Compute optimal policy used R(s,a) = dot( feature_f(s,a), w^(i) )
            if (numpy.linalg.norm(mu) == 0):
                agent = mdp.agent.RandomAgent( model.A() )
            else:
                agent = self._solver.solve(model)
                        
            # Compute feature expectations of pi^(i) = mu^(i)
            samples = self.generate_samples(model, agent, initial, len(true_samples[0]))
            mu = self.feature_expectations(model, samples)
            print(mu)
#            mu = self.feature_expectations2(model, initial, agent)
            
        # Restore initial weight vector
#        model.feature_function.params = w_0
        return (agent, w)
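# The mu_bar update above is the Abbeel-and-Ng projection step: mu_bar is moved toward
# the expert feature expectations mu_E along the direction (mu - mu_bar), then
# w = mu_E - mu_bar and t = ||mu_E - mu_bar||.  A minimal numpy sketch of a single
# update (toy vectors, not values from the original run; assumes mu != mu_bar):
import numpy

def projection_step_sketch(mu, mu_bar, mu_E):
    d = mu - mu_bar
    mu_bar_new = mu_bar + (numpy.dot(d, mu_E - mu_bar) / numpy.dot(d, d)) * d
    w = mu_E - mu_bar_new
    t = numpy.linalg.norm(mu_E - mu_bar_new)
    return w, t, mu_bar_new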
def stupidPythonIdiots():
    global resetPub
    global perceptPub
    global calcPub
    global mdpId
    global states
    global actions

    print(mdpId)
    rospy.wait_for_service('irlsimulate')

    initService()
    initNode()

    print(mdpId)
    p_fail = 0.05
    longHallway = 10
    shortSides = 4
    patrolAreaSize = longHallway + shortSides + shortSides
    observableStateLow = 7
    observableStateHigh = 8

    # calculate farness for each node in the patrolled area
    farness = np.zeros(patrolAreaSize)
    for i in range(patrolAreaSize):
        total = 0
        for j in range(patrolAreaSize):
            total += abs(i - j)

        farness[i] = total

    ## Create reward function
    reward = patrol.reward.PatrolReward(patrolAreaSize, farness,
                                        observableStateLow,
                                        observableStateHigh)
    reward_weights = np.zeros(reward.dim)
    reward_weights[0] = .2
    reward_weights[1] = .35
    reward_weights[2] = .45
    reward_weights[3] = 0
    reward_weights[4] = 0

    reward.params = reward_weights

    ## Create Model
    model = patrol.model.PatrolModel(p_fail, longHallway, shortSides)
    model.reward_function = reward
    model.gamma = 0.999

    states = model.S()
    actions = model.A()

    ## Create initial distribution
    initial = util.classes.NumMap()
    for s in model.S():
        initial[s] = 1.0
    initial = initial.normalize()

    ## Define feature function (approximate methods only)
    #    feature_function = mdp.etc.StateActionFeatureFunction(model)
    #    feature_function = mdp.etc.StateFeatureFunction(model)
    #    feature_function = gridworld.etc.GWLocationFF(model)

    ## Define player
    #    policy = mdp.agent.HumanAgent(model)
    opt_policy = mdp.solvers.ValueIteration(50).solve(model)

    j = 0
    for (s, a, r) in mdp.simulation.simulate(model, opt_policy, initial, 68):
        if (s.location[0] < observableStateLow):
            pass
        elif (s.location[0] > observableStateHigh):
            pass
        else:
            perceptPub.publish(
                percept(mdpId=mdpId,
                        state=stateToId(s),
                        action=actionToId(a),
                        time=j))
        j += 1

    centerObs = util.classes.NumMap()
    for s in model.S():
        centerObs[s] = 0
        if (s.location[0] == (observableStateLow + observableStateHigh) / 2):
            centerObs[s] = 1
    centerObs = centerObs.normalize()
    s = mdpId
    calcPub.publish(String(s))

    raw_input("Percepts Sent, Press Enter to continue...")

    policyPxy = rospy.ServiceProxy('irlpolicy', policy)
    est_p = policyPxy(policyRequest(mdpId))

    est_policy = util.classes.NumMap()
    for (i, a) in enumerate(est_p.policy):
        est_policy[idToState(i)] = idToAction(a)

    mdp.etc.policy_report(opt_policy, est_policy,
                          mdp.solvers.ExactPolicyEvaluator(), model, centerObs)

    for s in model.S():
        print 's = %s, pi*(s) = %s, pi_E(s) = %s' % (s, opt_policy.actions(s),
                                                     est_policy.actions(s))
    print 'pi* and pi_E disagree on {} of {} states'.format(
        len([
            s for s in model.S()
            if opt_policy.actions(s) != est_policy.actions(s)
        ]), len(model.S()))

    simulatePxy = rospy.ServiceProxy('irlsimulate', simulate)
    enc_policy = simulatePxy(simulateRequest(mdpId)).state_actions