def __init__(self, policy):
		# world object (the start positions passed to the constructor do not matter here)
		world = World((0,0),(1,1))
		value = {}
		for state in world.allStates():
			value[state] = 0
		discount = 0.9
		delta = 1
		while abs(delta) > 0.00001:
			delta = 0
			for state in world.allStates():
				world.setState(state)
				old = value[state]
				# we can set the minimum to 0 since we know every value will be 0 or positive
				curMax = 0
				for move in world.moveList():
					if world.posAfterMove(move) == (0,0):
						probSum = 10
					else:
						probSum = 0
						for nextState,prob in world.nextPreyStates():
							probSum += prob*discount*value[nextState]
					curMax = max(curMax,probSum)
				value[state] = curMax
				delta = max(delta,abs(old - curMax))
		value[(0,0)] = 10
		self.value        = value
		self.actionList   = []
		self.allList      = []
		self.bottomPolicy = policy
		self.discount     = discount
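
The loop above is plain value iteration: each sweep applies the backup max over moves of the expected discounted value of the successor states, with a reward of 10 only when the predator reaches the prey. Once the table has converged, a greedy policy can be read off it with the same one-step backup. The helper below is only a hypothetical sketch, not part of the original class; it assumes nothing beyond the World methods already used above (moveList, posAfterMove, nextPreyStates).

def greedyMove(world, value, discount=0.9):
	# Hypothetical helper: return the move whose one-step backup under the
	# converged value table is largest (mirrors the backup in the sweep above).
	best, bestScore = None, float('-inf')
	for move in world.moveList():
		if world.posAfterMove(move) == (0,0):
			score = 10  # catching the prey is the only rewarded transition
		else:
			score = sum(prob*discount*value[nextState] for nextState, prob in world.nextPreyStates())
		if score > bestScore:
			best, bestScore = move, score
	return best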
Example #2

import numpy as np  # np.mean/np.array are used below; World and epsGreedyPolicy come from the project's own modules

def MCon(episodes, initValue=15, epsilon=0.1, alpha=0.5, discount=0.9):
	# world object (the start positions passed to the constructor do not matter; the state is reset each episode)
	world = World((0,0),(1,1))

	# initialize Q value table and Return list for every (s,a)-pair
	Q = {}
	R = {}
	for state in world.allStates():
		for move in world.moveList():
			Q[state,move] = initValue # some value
			R[state,move] = [] # empty list; return = cumulative discounted reward
	steps = [0]*episodes # list counting number of iterations

	for i in range(episodes):
		iterations = 0
		# initialize world
		world.setState((-5,-5))
		stateActionPairs = {}
		# generate an episode using current policy
		while True:
			state = world.position
			# move the predator according to policy
			action = epsGreedyPolicy(state, world, Q, epsilon)
			world.move(action)
			if (state,action) not in stateActionPairs: # store first occurrence
				stateActionPairs[(state,action)] = iterations # will be used for discounting
			iterations += 1
			# check if predator caught the prey
			if world.stopState():
				break
			# move the prey (stochastically)
			world.performPreyMove()
			newState = world.position
		steps[i] = iterations # save amount of iterations needed to catch the prey
		# update Q and R
		for pair in stateActionPairs.keys():
			firstReturn = 10.0*discount**(iterations-stateActionPairs[pair]) # the only reward is the 10 at the end of the episode, discounted back to the pair's first occurrence
			R[pair].append(firstReturn)
			Q[pair] = np.mean(np.array(R[pair]))
		# update policy done in epsilon greedy policy code
	return steps
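
MCon above (and Qlearning below) call an epsGreedyPolicy helper that is not shown on this page. A minimal sketch of what such a helper presumably does, i.e. explore with probability epsilon and otherwise act greedily with respect to Q, breaking ties at random:

import random

def epsGreedyPolicy(state, world, Q, epsilon):
	# Sketch of the assumed helper: with probability epsilon take a random
	# move, otherwise take a move with the highest Q-value in this state.
	moves = world.moveList()
	if random.random() < epsilon:
		return random.choice(moves)
	best = max(Q[state, move] for move in moves)
	return random.choice([move for move in moves if Q[state, move] == best])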
def Qlearning(episodes, policy, startState=(-5,-5), initValue=15, policyParam=0.1, alpha=0.4, discount=0.9):
	# world object (the start positions passed to the constructor do not matter; the state is reset each episode)
	world = World((0,0),(1,1))

	# Q value table
	Q = {}
	for state in world.allStates():
		for move in world.moveList():
			Q[state,move] = initValue

	steps = [0]*episodes

	for i in range(episodes):
		iterations = 0
		# initialize world
		world.setState(startState)
		while True:
			state = world.position
			# move the predator according to a policy with one parameter (epsilon for epsilon-greedy or tau for softmax)
			action = policy(state, world, Q, policyParam)
			world.move(action)
			iterations += 1
			# check if predator caught the prey
			if world.stopState():
				# the Q(s,a) update rule (note that the next state is the absorbing state)
				Q[state,action] = Q[state,action] + alpha * (10 - Q[state,action])
				break
			# move the prey (stochastically)
			world.performPreyMove()
			newState = world.position
			# the maximum value the agent can have after another move
			maxQ = max([Q[newState,nextAction] for nextAction in world.moveList()])
			# the Q(s,a) update rule (note that the immediate reward is zero)
			Q[state,action] = Q[state,action] + alpha * ( discount*maxQ - Q[state,action])
		# print the number of steps the predator took
		steps[i] = iterations
	return steps
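
A possible way to run the two learners above and compare how quickly the predator learns to catch the prey (fewer steps per episode means a better policy). The call signatures match the definitions above; the episode count and parameter values are arbitrary, and the run assumes the project's World class and the epsGreedyPolicy helper sketched earlier are available.

mcSteps = MCon(1000, epsilon=0.1, discount=0.9)
qSteps  = Qlearning(1000, epsGreedyPolicy, policyParam=0.1, alpha=0.4, discount=0.9)
# average number of steps over the last 100 episodes of each run
print sum(mcSteps[-100:])/100.0, sum(qSteps[-100:])/100.0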
Example #4

from cvxopt import matrix, solvers  # LP solver used for the mixed-strategy updates below (see the cvxopt link in the code)

def minimax(episodes, initial_state, epsilon, decay, gamma, alpha_pred=1.0, alpha_prey=1.0):
    # initialization might be too expensive
    Q_pred = dict()
    Q_prey = dict()
    V_pred = dict()
    V_prey = dict()
    pi_pred = dict()
    pi_prey = dict()
    initValue = 1.0
    # initialisation
    world = World((5,5),initial_state)
    for state in world.allStates():
      V_pred[state] = 1.0
      V_prey[state] = 1.0
      for action in world.allMoveList():
          pi_pred[(state,action)]=1.0/len(world.allMoveList())
          for prey_move in world.singleMoveList():
              Q_pred[(state, action, prey_move)]=1.0
              Q_prey[(state, action, prey_move)]=1.0
      for action in world.singleMoveList():
          pi_prey[(state,action)]=1.0/len(world.singleMoveList())
    # absorbing states
    terminal_state = tuple([(0,0)] * len(initial_state))
    V_pred[terminal_state] = 0.0
    V_prey[terminal_state] = 0.0

    steps = [0]*episodes
    rewards = [0]*episodes
    for epi in range(episodes):
        
        # initialize world
        world = World((5,5),initial_state)

        # print "Begin Pred", V_pred[world.position]
        # print "End   Prey", V_prey[world.position]
        # for s in world.singleMoveList():
        #     print s, "Pred", V_pred[(s,)]
        #     print s, "Prey", V_pred[(s,)]
        #     for a in world.allMoveList():
        #         for a2 in world.singleMoveList():
        #             print s, "Q", a, a2, Q_pred[(state,a,a2)]

        iterations =0
        while not world.stopState():
            state = world.position
            # choose action
            action_pred = minimax_policy(epsilon, pi_pred, state, world.allMoveList())
            action_prey = minimax_policy(epsilon, pi_prey, state, world.singleMoveList())
            
            reward = world.move(action_prey,action_pred)
            iterations +=1
            new_state = world.position

            # update Q
            # if (state,action_prey) not in Q_prey:
            #     Q_prey[state,action_prey] = initValue
            # if (state,action_pred) not in Q_pred:
            #     Q_pred[state,action_pred] = initValue 
            Q_pred[(state,action_pred,action_prey)] = (1.0-alpha_pred)*Q_pred[(state,action_pred,action_prey)] + alpha_pred*(reward[1]+ gamma* V_pred[new_state])
            Q_prey[(state,action_pred,action_prey)] = (1.0-alpha_prey)*Q_prey[(state,action_pred,action_prey)] + alpha_prey*(reward[0]+ gamma* V_prey[new_state])

            # update pi
            # adapted from example: http://abel.ee.ucla.edu/cvxopt/examples/tutorial/lp.html

            ##  PREDATOR update
            # one constraint per prey action: the predator's mixed strategy must earn at least V against it
            minConstr   = [[1.0] + [-Q_pred[(state,a_pred,a_prey)] for a_pred in world.allMoveList()] for a_prey in world.singleMoveList()]
            # constraint to keep every pi(a) non-negative
            posConstr   = []
            for i in range(1,len(world.allMoveList())+1):
                new_row    = [0.0] * (len(world.allMoveList())+1)
                new_row[i] = -1.0
                posConstr.append(new_row)

            normGreater = [0.0] + [1.0] * len(world.allMoveList())
            normSmaller = [0.0] + [-1.0] * len(world.allMoveList())

            A = matrix([normGreater, normSmaller] + minConstr + posConstr).trans()
            b = matrix([ 1.0, -1.0] + [0.0] * (len(world.singleMoveList()) + len(world.allMoveList())) )
            # objective: coefficient -1 for V (so the LP maximizes V) and 0 for every pi(s,a)
            c = matrix([ -1.0 ] + [0.0] * len(world.allMoveList()))

            sol=solvers.lp(c,A,b)

            V_pred[state] = sol['x'][0]
            for a_pred, x in zip(world.allMoveList(),sol['x'][1:]):
                pi_pred[(state,a_pred)] = x

            # ## PREY update
            # one constraint per predator action: the prey's mixed strategy must earn at least V against it
            minConstr   = [[1.0] + [-Q_prey[(state,a_pred,a_prey)] for a_prey in world.singleMoveList()] for a_pred in world.allMoveList()]
            # constraint to keep every pi(a) non-negative
            posConstr   = []
            for i in range(1,len(world.singleMoveList())+1):
                new_row    = [0.0] * (len(world.singleMoveList())+1)
                new_row[i] = -1.0
                posConstr.append(new_row)

            normGreater = [0.0] + [ 1.0] * len(world.singleMoveList())
            normSmaller = [0.0] + [-1.0] * len(world.singleMoveList())

            A = matrix([normGreater, normSmaller] + minConstr + posConstr).trans()
            b = matrix([ 1.0, -1.0] + [0.0] * (len(world.allMoveList()) + len(world.singleMoveList())) )
            # objective: coefficient -1 for V (so the LP maximizes V) and 0 for every pi(s,a)
            c = matrix([ -1.0 ] + [0.0] * len(world.singleMoveList()))

            sol=solvers.lp(c,A,b)
        
            V_prey[state] = sol['x'][0]
            for a_prey, x in zip(world.singleMoveList(),sol['x'][1:]):
                pi_prey[(state,a_prey)] = x


            alpha_pred *= decay
            alpha_prey *= decay
        if epi > 0 and epi % 50 == 0:
            print "Episode",epi
        steps[epi]   = iterations
        if reward[1] > 0:
            rewards[epi] = 1
    return steps, rewards
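
minimax_policy is another project helper that is not shown on this page; presumably it samples an action from the current mixed strategy pi(state, .) and explores uniformly with probability epsilon. A sketch under that assumption:

import random

def minimax_policy(epsilon, pi, state, moves):
    # Sketch of the assumed helper: explore uniformly with probability epsilon,
    # otherwise sample an action according to the mixed strategy pi(state, .).
    if random.random() < epsilon:
        return random.choice(moves)
    r = random.random()
    cumulative = 0.0
    for move in moves:
        cumulative += pi[(state, move)]
        if r <= cumulative:
            return move
    return moves[-1]  # guard against rounding error in the cumulative sum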
Example #5

import random       # random.choice for tie-breaking below
import numpy as np  # np.prod over the importance-sampling weights below

def MCoff(episodes, behaPolicy, matches=[], initValue=15, discount=0.9):
	# behaPolicy = dictionary with keys (state,action) and value P(action|state)


	world = World((0,0),(1,1))
	movelist = world.moveList()
	def policy(world):
		return world.pickElementWithProbs([(move,behaPolicy[(world.position,move)]) for move in movelist])

	# initialize Q value table and Return list for every (s,a)-pair
	Q = {}
	R = {}
	num = {}
	denum = {} 
	for state in world.allStates():
		for move in world.moveList():
			num[state,move] = 0.0
			denum[state,move] = 0.0
			Q[state,move] = float(initValue) # some value
			R[state,move] = [] # empty list; return = cumulative discounted reward
	steps = [0]*episodes # list counting number of iterations
	for epi in range(episodes):
		time = 0
		totalTime =0
		# initialize world
		world.setState((-5,-5))
		
		episode = []
		while True:
			action = policy(world)
			episode.append((world.position, action))
			if action is None: # the behaviour policy should always return a move
				print action, world.position
			world.move(action)
			if world.stopState():
				break
			world.performPreyMove()

		# save the pairs that match, and their first occurrence
		matchingHistory = {}
		# index just after the last action in the episode that disagreed with the current greedy policy
		last = 0
		for i, (state, action) in enumerate(episode[::-1]):
			actionValues = [(Q[state,maction],maction) for maction in world.moveList()]
			bestActions = [actionValues[j][1] for j in maxIndices(actionValues)]
			matchingHistory[(state, action)] = len(episode)-i - 1
			if action not in bestActions:
				last = len(episode)-i
				break
			
		
		matches.append(len(episode)-last)
		
		for (state, action) in matchingHistory:
			if matchingHistory[(state, action)] >= last-1:
				w = np.prod([ 1.0/behaPolicy[episode[j]] for j in range(matchingHistory[(state, action)],len(episode))])
				num[(state,action)]   += w * (10.0*discount**matchingHistory[(state, action)]) # return is gamma^{T-t}*10
				denum[(state,action)] += w
				Q[(state,action)] = num[(state,action)]/float(denum[(state,action)])

		world.setState((-5,-5))
		iterations = 0
		while True:
			iterations += 1
			# evaluate Q at the current position; (value, action) order matches the maxIndices call above
			actionValues = [(Q[world.position,maction], maction) for maction in world.moveList()]
			bestAction = random.choice([actionValues[j][1] for j in maxIndices(actionValues)])
			world.move(bestAction)
			if world.stopState() or iterations > 2000:
				break
			world.performPreyMove()
		steps[epi] = iterations
		
			
	return steps
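
MCoff relies on a maxIndices helper (indices of all maximal (value, action) pairs, so ties can be broken at random) and on a behaviour-policy dictionary supplied by the caller. The sketch below shows one plausible version of the helper plus a uniform random behaviour policy as a possible way to call MCoff; it reuses the imports added at the top of this example.

def maxIndices(pairs):
	# Sketch of the assumed helper: indices of every pair whose first element
	# (the Q-value) equals the maximum, so that ties can be broken at random.
	best = max(value for value, _ in pairs)
	return [i for i, (value, _) in enumerate(pairs) if value == best]

# Hypothetical usage: a uniform random behaviour policy over every (state, move) pair.
world = World((0,0),(1,1))
uniform = 1.0/len(world.moveList())
behaPolicy = {}
for state in world.allStates():
	for move in world.moveList():
		behaPolicy[(state, move)] = uniform
steps = MCoff(5000, behaPolicy, discount=0.9)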