コード例 #1
ファイル: TeamPrey.py プロジェクト: nagyistge/UvA-MasterAI-AA
class TeamPrey():
    """Prey-side agent that couples a ``Prey`` with a ``TeamQLearning`` learner.

    For every observed transition a linear program is solved with Gurobi
    (``grb``) to refresh the policy stored for the visited state, after which
    the learner's own update is invoked.
    """

    def __init__(self, Environment, location):
        """Create the wrapped prey and its learner.

        Environment -- environment object; stored and handed to ``Prey``.
        location    -- forwarded to ``Prey`` (presumably its start position).
        """
        self.Environment = Environment
        # Hyper-parameters handed to TeamQLearning. epsilon is the exploration
        # probability used by getActionEpsilonGreedy; alpha and gamma are
        # presumably learning rate and discount factor -- confirm in
        # TeamQLearning.
        alpha = 0.3
        gamma = 0.7
        epsilon = 0.1
        self.Prey = Prey(Environment, location)
        self.actions = Prey.actions
        self.TeamQLearning = TeamQLearning(self, alpha, gamma, epsilon)

    def updateQ(self, s, a, o, s_prime, r):
        """Update this team's Q and V.

        s       -- state in which the actions were taken.
        a       -- action forwarded to the learner (presumably this agent's).
        o       -- second action forwarded to the learner (presumably the
                   opponent's).
        s_prime -- resulting state.
        r       -- reward forwarded to the learner.

        The policy for ``s`` is recomputed first; a Gurobi failure is
        reported and the previous policy is kept. The learner update always
        runs afterwards.
        """
        O = self.actions
        A = self.actions

        # Use linear programming to obtain optimal policy for this state.
        # Loop variables are deliberately NOT called ``a``/``o``: reusing
        # those names would overwrite the parameters and make the learner
        # update below use the last enumerated actions instead of the ones
        # that were actually passed in.
        try:
            # Create a new model
            m = grb.Model("MultiAgentMinimax")
            m.setParam("OutputFlag", 0)

            # Create variables: one probability in [0, 1] per action
            pi = dict()
            for action in A:
                pi[action] = m.addVar(0.0, 1.0,
                                      vtype=grb.GRB.CONTINUOUS,
                                      name=str(action))

            # Integrate new variables
            m.update()

            # Set objective
            m.setObjective(
                grb.LinExpr([(self.TeamQLearning.Q[s][(action, other)],
                              pi[action])
                             for other in O for action in A]),
                grb.GRB.MAXIMIZE)

            # Add constraint: Sum_a pi(a) = 1
            expr = grb.quicksum(m.getVars())
            m.addConstr(expr == 1, "Total probability")

            # Add more constraints: one per entry of O
            for other in O:
                expr = grb.LinExpr(
                    [(self.TeamQLearning.Q[s][(action, other)], pi[action])
                     for action in A])
                m.addConstr(expr >= 0)

            # Solve; variable values (.x) are only available afterwards
            m.optimize()

            for action in A:
                self.TeamQLearning.policy[s][action] = pi[action].x

        except grb.GurobiError:
            # Best effort: keep the previous policy and continue learning.
            print('Error reported')

        # Update Q and V
        self.TeamQLearning.updateQ(s, a, o, s_prime, r)

    def getActionEpsilonGreedy(self, s):
        """Sample an action for state ``s`` epsilon-greedily.

        Every action receives probability ``epsilon / len(actions)``; the
        action selected by ``argmax`` over the stored policy for ``s``
        receives the remaining ``1 - epsilon`` on top of that.
        """
        prob_actions = dict()
        uniform_epsilon = self.TeamQLearning.epsilon / (len(self.actions))

        for possible_a in self.actions:
            # Set probabilities of all actions uniformly
            prob_actions[possible_a] = uniform_epsilon

        best_a = argmax(self.TeamQLearning.policy[s])
        prob_actions[best_a] += 1 - self.TeamQLearning.epsilon

        # For every action, check if the cumulative probability exceeds a
        # random number.
        random_number = random.random()
        cumulative_prob = 0.0

        for action in self.actions:
            cumulative_prob += prob_actions[action]
            if cumulative_prob >= random_number:
                return action

        # Floating-point rounding can leave the total marginally below 1.0;
        # fall back to the last action instead of implicitly returning None.
        return action

    def performAction(self, a):
        """Perform action ``a``.

        NOTE(review): the body of this method is missing from the source, so
        the intended behaviour cannot be recovered here. It fails loudly
        rather than silently doing nothing; restore the real implementation
        (possibly a delegation to ``self.Prey``) before use.
        """
        raise NotImplementedError("TeamPrey.performAction body is missing")

    def permutations(self, iterable, r=None):
        """iterator <- permutations(iterable, r)

        Finds permutations of iterable of length r, with duplicate entries,
        i.e. every length-``r`` tuple over the elements of ``iterable``.
        ``r`` defaults to the number of elements.
        """
        pool = tuple(iterable)
        r = len(pool) if r is None else r

        # product() already yields only length-r tuples, in the same order
        # as enumerating index tuples over range(len(pool)).
        for combination in product(pool, repeat=r):
            yield combination
コード例 #2
ファイル: TeamPrey.py プロジェクト: camielv/UvA-MasterAI-AA
class TeamPrey():
    """Prey-side agent that couples a ``Prey`` with a ``TeamQLearning`` learner.

    For every observed transition a linear program is solved with Gurobi
    (``grb``) to refresh the policy stored for the visited state, after which
    the learner's own update is invoked.
    """

    def __init__(self, Environment, location):
        """Create the wrapped prey and its learner.

        Environment -- environment object; stored and handed to ``Prey``.
        location    -- forwarded to ``Prey`` (presumably its start position).
        """
        self.Environment = Environment
        # Hyper-parameters handed to TeamQLearning. epsilon is the exploration
        # probability used by getActionEpsilonGreedy; alpha and gamma are
        # presumably learning rate and discount factor -- confirm in
        # TeamQLearning.
        alpha = 0.3
        gamma = 0.7
        epsilon = 0.1
        self.Prey = Prey(Environment, location)
        self.actions = Prey.actions
        self.TeamQLearning = TeamQLearning(self, alpha, gamma, epsilon)

    def updateQ(self, s, a, o, s_prime, r):
        """Update this team's Q and V.

        s       -- state in which the actions were taken.
        a       -- action forwarded to the learner (presumably this agent's).
        o       -- second action forwarded to the learner (presumably the
                   opponent's).
        s_prime -- resulting state.
        r       -- reward forwarded to the learner.

        The policy for ``s`` is recomputed first; a Gurobi failure is
        reported and the previous policy is kept. The learner update always
        runs afterwards.
        """
        O = self.actions
        A = self.actions

        # Use linear programming to obtain optimal policy for this state.
        # Loop variables are deliberately NOT called ``a``/``o``: reusing
        # those names would overwrite the parameters and make the learner
        # update below use the last enumerated actions instead of the ones
        # that were actually passed in.
        try:
            # Create a new model
            m = grb.Model("MultiAgentMinimax")

            # Create variables: one probability in [0, 1] per action
            pi = dict()
            for action in A:
                pi[action] = m.addVar(0.0, 1.0,
                                      vtype=grb.GRB.CONTINUOUS,
                                      name=str(action))

            # Integrate new variables
            m.update()

            # Set objective
            m.setObjective(
                grb.LinExpr([(self.TeamQLearning.Q[s][(action, other)],
                              pi[action])
                             for other in O for action in A]),
                grb.GRB.MAXIMIZE)

            # Add constraint: Sum_a pi(a) = 1
            expr = grb.quicksum(m.getVars())
            m.addConstr(expr == 1, "Total probability")

            # Add more constraints: one per entry of O
            for other in O:
                expr = grb.LinExpr(
                    [(self.TeamQLearning.Q[s][(action, other)], pi[action])
                     for action in A])
                m.addConstr(expr >= 0)

            # Solve; variable values (.x) are only available afterwards
            m.optimize()

            for action in A:
                self.TeamQLearning.policy[s][action] = pi[action].x

        except grb.GurobiError:
            # Best effort: keep the previous policy and continue learning.
            print('Error reported')

        # Update Q and V
        self.TeamQLearning.updateQ(s, a, o, s_prime, r)

    def getActionEpsilonGreedy(self, s):
        """Sample an action for state ``s`` epsilon-greedily.

        Every action receives probability ``epsilon / len(actions)``; the
        action selected by ``argmax`` over the stored policy for ``s``
        receives the remaining ``1 - epsilon`` on top of that.
        """
        prob_actions = dict()
        uniform_epsilon = self.TeamQLearning.epsilon / (len(self.actions))

        for possible_a in self.actions:
            # Set probabilities of all actions uniformly
            prob_actions[possible_a] = uniform_epsilon

        best_a = argmax(self.TeamQLearning.policy[s])
        prob_actions[best_a] += 1 - self.TeamQLearning.epsilon

        # For every action, check if the cumulative probability exceeds a
        # random number.
        random_number = random.random()
        cumulative_prob = 0.0

        for action in self.actions:
            cumulative_prob += prob_actions[action]
            if cumulative_prob >= random_number:
                return action

        # Floating-point rounding can leave the total marginally below 1.0;
        # fall back to the last action instead of implicitly returning None.
        return action

    def performAction(self, a):
        """Perform action ``a``.

        NOTE(review): the body of this method is missing from the source, so
        the intended behaviour cannot be recovered here. It fails loudly
        rather than silently doing nothing; restore the real implementation
        (possibly a delegation to ``self.Prey``) before use.
        """
        raise NotImplementedError("TeamPrey.performAction body is missing")

    def permutations(self, iterable, r=None):
        """iterator <- permutations(iterable, r)

        Finds permutations of iterable of length r, with duplicate entries,
        i.e. every length-``r`` tuple over the elements of ``iterable``.
        ``r`` defaults to the number of elements.
        """
        pool = tuple(iterable)
        r = len(pool) if r is None else r

        # product() already yields only length-r tuples, in the same order
        # as enumerating index tuples over range(len(pool)).
        for combination in product(pool, repeat=r):
            yield combination