def __init__(self, policy):
    """Precompute state values by value iteration on the predator-prey world.

    Runs synchronous (in-place) value iteration with discount 0.9 until the
    largest per-sweep change drops below 1e-5, then stores the value table
    and bookkeeping attributes on the instance.
    """
    # world object; the starting state is irrelevant for enumerating states
    grid = World((0, 0), (1, 1))
    gamma = 0.9
    V = dict.fromkeys(grid.allStates(), 0)
    change = 1
    while abs(change) > 0.00001:
        change = 0
        for s in grid.allStates():
            grid.setState(s)
            previous = V[s]
            # every value is 0 or positive, so 0 is a safe floor for the max
            best = 0
            for a in grid.moveList():
                # reward 10 for a move that reaches the catch position (0,0)
                expected = 10 if grid.posAfterMove(a) == (0, 0) else 0
                # expectation over the stochastic prey response
                expected += sum(p * gamma * V[s2] for s2, p in grid.nextPreyStates())
                if expected > best:
                    best = expected
            V[s] = best
            change = max(change, abs(previous - best))
    V[(0, 0)] = 10
    self.value = V
    self.actionList = []
    self.allList = []
    self.bottomPolicy = policy
    self.discount = gamma
def MCon(episodes, initValue=15, epsilon=0.1, alpha=0.5, discount=0.9):
    """First-visit on-policy Monte Carlo control with an epsilon-greedy policy.

    episodes  -- number of episodes to run
    initValue -- optimistic initial Q value for every (state, action) pair
    epsilon   -- exploration rate of the epsilon-greedy behaviour policy
    alpha     -- accepted for interface compatibility but unused: Q is the
                 plain average of the observed first-visit returns
    discount  -- discount factor gamma

    Returns steps: steps[i] is the number of moves episode i took until the
    predator caught the prey.
    """
    # world object; the starting state is irrelevant for enumerating states
    world = World((0, 0), (1, 1))
    # Q value table and return list for every (s, a) pair
    Q = {}
    R = {}
    for state in world.allStates():
        for move in world.moveList():
            Q[state, move] = initValue   # optimistic init drives exploration
            R[state, move] = []          # returns = cumulative discounted rewards
    steps = [0] * episodes
    for i in range(episodes):
        iterations = 0
        world.setState((-5, -5))
        firstVisit = {}  # (state, action) -> time step of first occurrence
        # generate one episode under the current epsilon-greedy policy
        while True:
            state = world.position
            action = epsGreedyPolicy(state, world, Q, epsilon)
            world.move(action)
            if (state, action) not in firstVisit:
                firstVisit[(state, action)] = iterations  # used for discounting
            iterations += 1
            if world.stopState():  # predator caught the prey
                break
            world.performPreyMove()  # prey moves stochastically
        steps[i] = iterations
        # every reward is 0 except the terminal +10, so the first-visit
        # return is simply 10 * discount^(T - t_first)
        for pair, t in firstVisit.items():
            R[pair].append(10.0 * discount ** (iterations - t))
            Q[pair] = np.mean(R[pair])  # policy improvement happens inside epsGreedyPolicy
    return steps
def Qlearning(episodes, policy, startState=(-5,-5), initValue=15, policyParam=0.1, alpha=0.4, discount=0.9):
    """Tabular Q-learning for the predator-prey world.

    episodes    -- number of episodes to run
    policy      -- behaviour policy callable: policy(state, world, Q, policyParam)
                   (epsilon for E-greedy, tau for softmax)
    startState  -- predator start position for every episode
    initValue   -- initial Q value for every (state, action) pair
    policyParam -- single parameter forwarded to the behaviour policy
    alpha       -- learning rate
    discount    -- discount factor gamma

    Returns steps: steps[i] is the number of moves episode i needed.
    """
    # world object; the starting state is irrelevant for enumerating states
    world = World((0, 0), (1, 1))
    Q = {(s, a): initValue for s in world.allStates() for a in world.moveList()}
    steps = [0] * episodes
    for epi in range(episodes):
        world.setState(startState)
        count = 0
        caught = False
        while not caught:
            s = world.position
            a = policy(s, world, Q, policyParam)
            world.move(a)
            count += 1
            caught = world.stopState()
            if caught:
                # terminal transition: immediate reward 10, successor value 0
                target = 10
            else:
                # prey moves stochastically before we bootstrap
                world.performPreyMove()
                s2 = world.position
                # immediate reward is zero on non-terminal transitions
                target = discount * max(Q[s2, a2] for a2 in world.moveList())
            # the Q(s,a) update rule
            Q[s, a] += alpha * (target - Q[s, a])
        steps[epi] = count
    return steps
def minimax(episodes,initial_state,epsilon, decay, gamma, alpha_pred=1.0, alpha_prey=1.0): # initialization might be too expansive Q_pred = dict() Q_prey = dict() V_pred = dict() V_prey = dict() pi_pred = dict() pi_prey = dict() initValue = 1.0 # initialisation world = World((5,5),initial_state) for state in world.allStates(): V_pred[state] = 1.0 V_prey[state] = 1.0 for action in world.allMoveList(): pi_pred[(state,action)]=1.0/len(world.allMoveList()) for prey_move in world.singleMoveList(): Q_pred[(state, action, prey_move)]=1.0 Q_prey[(state, action, prey_move)]=1.0 for action in world.singleMoveList(): pi_prey[(state,action)]=1.0/len(world.singleMoveList()) # absorbing states terminal_state = tuple([(0,0)] * len(initial_state)) V_pred[terminal_state] = 0.0 V_prey[terminal_state] = 0.0 steps = [0]*episodes rewards = [0]*episodes for epi in range(episodes): # initialize world world = World((5,5),initial_state) # print "Begin Pred", V_pred[world.position] # print "End Prey", V_prey[world.position] # for s in world.singleMoveList(): # print s, "Pred", V_pred[(s,)] # print s, "Prey", V_pred[(s,)] # for a in world.allMoveList(): # for a2 in world.singleMoveList(): # print s, "Q", a, a2, Q_pred[(state,a,a2)] iterations =0 while not world.stopState(): state = world.position # choose action action_pred = minimax_policy(epsilon, pi_pred, state, world.allMoveList()) action_prey = minimax_policy(epsilon, pi_prey, state, world.singleMoveList()) reward = world.move(action_prey,action_pred) iterations +=1 new_state = world.position # update Q # if (state,action_prey) not in Q_prey: # Q_prey[state,action_prey] = initValue # if (state,action_pred) not in Q_pred: # Q_pred[state,action_pred] = initValue Q_pred[(state,action_pred,action_prey)] = (1.0-alpha_pred)*Q_pred[(state,action_pred,action_prey)] + alpha_pred*(reward[1]+ gamma* V_pred[new_state]) Q_prey[(state,action_pred,action_prey)] = (1.0-alpha_prey)*Q_prey[(state,action_pred,action_prey)] + alpha_prey*(reward[0]+ gamma* 
V_prey[new_state]) # update pi # adapted from example: http://abel.ee.ucla.edu/cvxopt/examples/tutorial/lp.html ## PREDATOR update # constraint to minimize w.r.t. prey action minConstr = [[1.0] + [-Q_pred[(state,a_pred,a_prey)] for a_pred in world.allMoveList()] for a_prey in world.singleMoveList()] # constrinat to keep every pi(a) positive posConstr = [] for i in range(1,len(world.allMoveList())+1): new_row = [0.0] * (len(world.allMoveList())+1) new_row[i] = -1.0 posConstr.append(new_row) normGreater = [0.0] + [1.0] * len(world.allMoveList()) normSmaller = [0.0] + [-1.0] * len(world.allMoveList()) A = matrix([normGreater, normSmaller] + minConstr + posConstr).trans() b = matrix([ 1.0, -1.0] + [0.0] * (len(world.singleMoveList()) + len(world.allMoveList())) ) # -1 V and 0 for all pi(s,a) c = matrix([ -1.0 ] + [0.0] * len(world.allMoveList())) sol=solvers.lp(c,A,b) V_pred[state] = sol['x'][0] for a_pred, x in zip(world.allMoveList(),sol['x'][1:]): pi_pred[(state,a_pred)] = x # ## PREY update # constraint to minimize w.r.t. 
prey action minConstr = [[1.0] + [-Q_prey[(state,a_pred,a_prey)] for a_prey in world.singleMoveList()] for a_pred in world.allMoveList()] # # constriant to keep every pi(a) positive posConstr = [] for i in range(1,len(world.singleMoveList())+1): new_row = [0.0] * (len(world.singleMoveList())+1) new_row[i] = -1.0 posConstr.append(new_row) normGreater = [0.0] + [ 1.0] * len(world.singleMoveList()) normSmaller = [0.0] + [-1.0] * len(world.singleMoveList()) A = matrix([normGreater, normSmaller] + minConstr + posConstr).trans() b = matrix([ 1.0, -1.0] + [0.0] * (len(world.allMoveList()) + len(world.singleMoveList())) ) # -1 V and 0 for all pi(s,a) c = matrix([ -1.0 ] + [0.0] * len(world.singleMoveList())) sol=solvers.lp(c,A,b) V_prey[state] = sol['x'][0] for a_prey, x in zip(world.singleMoveList(),sol['x'][1:]): pi_prey[(state,a_prey)] = x alpha_pred *= decay alpha_prey *= decay if epi > 0 and epi % 50 == 0: print "Episode",epi steps[epi] = iterations if reward[1] > 0: rewards[epi] = 1 return steps, rewards
def MCoff(episodes, behaPolicy, matches=None, initValue=15, discount=0.9):
    """Off-policy Monte Carlo control with weighted importance sampling.

    behaPolicy -- dict with keys (state, action) and value P(action | state);
                  the behaviour policy that generates episodes
    matches    -- optional list; per episode, the length of the episode tail
                  on which behaviour and greedy target policy agree is appended
    initValue  -- initial Q value for every (state, action) pair
    discount   -- discount factor gamma

    The target policy is greedy w.r.t. Q.  Returns steps: per episode, the
    number of moves the greedy policy needs to catch the prey when evaluated
    afterwards (capped at 2000).
    """
    if matches is None:  # avoid the shared mutable-default pitfall
        matches = []
    world = World((0, 0), (1, 1))
    movelist = world.moveList()

    def behave(world):
        # sample an action from the behaviour policy in the current state
        return world.pickElementWithProbs(
            [(move, behaPolicy[(world.position, move)]) for move in movelist])

    # per-pair Q value and weighted-importance-sampling accumulators
    Q = {}
    num = {}
    denum = {}
    for state in world.allStates():
        for move in movelist:
            num[state, move] = 0.0
            denum[state, move] = 0.0
            Q[state, move] = float(initValue)
    steps = [0] * episodes
    for epi in range(episodes):
        # --- generate an episode with the behaviour policy ---
        world.setState((-5, -5))
        episode = []
        while True:
            action = behave(world)
            episode.append((world.position, action))
            world.move(action)
            if world.stopState():
                break
            world.performPreyMove()
        T = len(episode)
        # --- find the episode tail on which every behaviour action agrees
        #     with the greedy target policy; record first occurrences ---
        matchingHistory = {}  # (state, action) -> earliest time step in the tail
        last = 0              # first time step of the matching tail
        for i, (state, action) in enumerate(episode[::-1]):
            actionValues = [(Q[state, m], m) for m in movelist]
            bestActions = [actionValues[j][1] for j in maxIndices(actionValues)]
            matchingHistory[(state, action)] = T - i - 1
            if action not in bestActions:
                last = T - i
                break
        matches.append(T - last)
        # --- weighted importance sampling update over the matching tail ---
        for (state, action), t in matchingHistory.items():
            if t >= last - 1:
                # greedy target is deterministic on the tail, so the
                # importance weight is the product of 1/b(a|s)
                w = np.prod([1.0 / behaPolicy[episode[j]] for j in range(t, T)])
                # BUGFIX: accumulate under the pair's own key (the original
                # used the stale loop variable `move`), and discount the
                # terminal +10 by the remaining steps: return = gamma^{T-t}*10
                num[(state, action)] += w * (10.0 * discount ** (T - t))
                denum[(state, action)] += w
                Q[(state, action)] = num[(state, action)] / float(denum[(state, action)])
        # --- evaluate the greedy policy from the start state ---
        world.setState((-5, -5))
        iterations = 0
        while True:
            iterations += 1
            # BUGFIX: act on the current position (the original read a stale
            # `state` left over from the loop above)
            s = world.position
            # NOTE(review): tuple order here ((action, value)) differs from the
            # matching loop above ((value, action)); verify maxIndices handles both
            actionValues = [(m, Q[s, m]) for m in movelist]
            bestAction = random.choice([actionValues[j][0] for j in maxIndices(actionValues)])
            world.move(bestAction)
            if world.stopState() or iterations > 2000:
                break
            world.performPreyMove()
        steps[epi] = iterations
    return steps