def learn(alpha, eps, numTrainingEpisodes):
    returnSum = 0.0
    for episodeNum in range(numTrainingEpisodes):
        G = 0
        S = mountaincar.init()
        R, S = mountaincar.sample(S, 1)
        G += R
        while (S):
            Q = Q1[S, :] + Q2[S, :]
            prob1 = np.random.random()
            if prob1 < eps:
                # explore
                A = np.random.choice([0, 1, 2])  # mountain car has three actions (0, 1, 2)
            else:
                # greedy
                A = Q.argmax()

            R, S_prime = mountaincar.sample(S, A)
            G += R
            S_prime = int(S_prime)

            prob2 = np.random.choice([1, 2])
            if prob2 == 1:
                Q1[S, A] = Q1[S, A] + alpha * (
                    R + GAMMA * Q2[S_prime, (Q1[S_prime]).argmax()] - Q1[S, A])
            else:
                Q2[S, A] = Q2[S, A] + alpha * (
                    R + GAMMA * Q1[S_prime, (Q2[S_prime]).argmax()] - Q2[S, A])

            S = S_prime
        #print("Episode: ", episodeNum, "Return: ", G)
        returnSum = returnSum + G
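The tabular learn above relies on module-level Q1, Q2, and GAMMA and on a mountaincar variant that returns integer (discretized) states. A minimal sketch of that assumed setup; the state-space size is a placeholder, not the author's value:

import numpy as np

NUM_STATES = 1000   # placeholder size of the discretized state space
NUM_ACTIONS = 3     # mountain car: decelerate, coast, accelerate
GAMMA = 1.0         # mountain car episodes are typically undiscounted

Q1 = np.zeros((NUM_STATES, NUM_ACTIONS))
Q2 = np.zeros((NUM_STATES, NUM_ACTIONS))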
Example #2
def updatePlot(i):
	global prototypePlot, steps, S, x, y, s, gotOut

	# simulation steps advanced per animation frame (currently the same whether or not the car has escaped)
	speed = 1

	for i in range(speed):
		res = loop()

		if steps >= 5000 or res:
			S = mountaincar.init()
			print('init mc')
			steps = 0
			acrl.Erase_Traces()
			break

	s = [50]*n +[200] # reset sizes

	for index in get_features(S, False): # change sizes of the close prototypes
		s[index] = 100

	prototypePlot._sizes = s

	x[-1], y[-1] = S[0], S[1] # update the state
	prototypePlot.set_offsets(zip(x,y)) # display the points
	values = clamp(acrl.w + [acrl.value], 0, 1)
	prototypePlot.set_array(values) # display the values

	return prototypePlot,	
Example #3
def learnEpisode(alpha, eps, gamma, theta1, theta2):
        in1, in2 = mountaincar.init()
        currentStates = tilecode(in1, in2, [-1]*numTilings) # tile indices for the initial state
        episodeReturn = 0
        step = 0
        while(True): # continue until we reach terminal state (None)
            action = epsGreedyPolicy(currentStates, eps, theta1, theta2)
            reward, nextStatePosVel = mountaincar.sample((in1, in2), action)
            episodeReturn += reward
            step += 1
            if nextStatePosVel:
                nextIn1, nextIn2 = nextStatePosVel
                nextStates = tilecode(nextIn1, nextIn2, [-1]*numTilings)
                if(np.random.randint(0,2)):  # randint(0, 2) returns 0 or 1: a fair coin flip
                    updateTheta(theta1, theta2, currentStates, nextStates, action, reward, alpha, gamma)
                else:
                    updateTheta(theta2, theta1, currentStates, nextStates, action, reward, alpha, gamma)
                currentStates = nextStates
                in1, in2 = nextIn1, nextIn2
            else: # next state is terminal state
                if(np.random.randint(0,2)):  # randint(0, 2) returns 0 or 1: a fair coin flip
                    updateTheta(theta1, theta2, currentStates, nextStates, action, reward, alpha, gamma)
                else:
                    updateTheta(theta2, theta1, currentStates, nextStates, action, reward, alpha, gamma)
                return episodeReturn, step
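learnEpisode defers action selection to an epsGreedyPolicy helper that is not shown. A plausible sketch, assuming the flat weight layout (one block of numTiles entries per action) used by the other examples on this page; the 9x9x4 block size is a placeholder:

import numpy as np

numTiles = 9 * 9 * 4  # placeholder per-action weight-block size

def epsGreedyPolicy(tileIndices, eps, theta1, theta2, numActions=3):
    # action values: sum the weights of both estimators at the active tiles, one block per action
    q = [sum(theta1[i + a * numTiles] + theta2[i + a * numTiles] for i in tileIndices)
         for a in range(numActions)]
    if np.random.random() < eps:
        return np.random.randint(numActions)  # explore
    return int(np.argmax(q))                  # act greedily on Q1 + Q2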
Example #4
def updatePlot(i):
    global prototypePlot, steps, S, x, y, s, gotOut

    # simulation steps advanced per animation frame (currently the same whether or not the car has escaped)
    speed = 1

    for i in range(speed):
        res = loop()

        if steps >= 5000 or res:
            S = mountaincar.init()
            print('init mc')
            steps = 0
            acrl.Erase_Traces()
            break

    s = [50] * n + [200]  # reset sizes

    for index in get_features(S,
                              False):  # change sizes of the close prototypes
        s[index] = 100

    prototypePlot._sizes = s

    x[-1], y[-1] = S[0], S[1]  # update the state
    prototypePlot.set_offsets(zip(x, y))  # display the points
    values = clamp(acrl.w + [acrl.value], 0, 1)
    prototypePlot.set_array(values)  # display the values

    return prototypePlot,
Example #5
def learn(alpha=0.1 / numTilings, epsilon=0.0, numEpisodes=200):
    theta1 = -0.001 * rand(n)
    theta2 = -0.001 * rand(n)
    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        G = 0.0
        tileIndices = [-1] * numTilings
        pos, vel = mountaincar.init()
        state = (pos, vel)
        step = 0
        while state != None:
            tilecode(pos, vel, tileIndices)
            action = chooseaction(state, theta1, theta2)
            r, nstate = mountaincar.sample(state, action)
            tileIndices = [-1] * numTilings
            if nstate != None:
                if randint(0, 2) == 0:
                    naction = chooseaction(nstate, theta1, theta2)
                    tileIndices = tilecode(state[0], state[1], tileIndices)
                    for i in range(numTilings):
                        theta1[tileIndices[i] +
                               (action * numTiles)] += alpha * (
                                   r + Total(nstate, naction, theta2) -
                                   Total(state, action, theta1))
                else:
                    naction = chooseaction(nstate, theta1, theta2)
                    tileIndices = tilecode(state[0], state[1], tileIndices)
                    for i in range(numTilings):
                        theta2[tileIndices[i] +
                               (action * numTiles)] += alpha * (
                                   r + Total(nstate, naction, theta1) -
                                   Total(state, action, theta2))
            else:
                if randint(0, 2) == 0:
                    tileIndices = tilecode(state[0], state[1], tileIndices)
                    for i in range(numTilings):
                        theta1[tileIndices[i] +
                               (action * numTiles)] += alpha * (
                                   r - Total(state, action, theta1))
                else:
                    tileIndices = tilecode(state[0], state[1], tileIndices)
                    for i in range(numTilings):
                        theta2[tileIndices[i] +
                               (action * numTiles)] += alpha * (
                                   r - Total(state, action, theta2))
            state = nstate
            G += r
            step += 1
        #print("Episode:", episodeNum, "Steps:", step, "Return: ", G)
        avgrlist[episodeNum] += G
        avgslist[episodeNum] += step
        returnSum += G
    #print("Average return:", returnSum / numEpisodes)

    return returnSum, theta1, theta2, step
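This learn relies on chooseaction and Total helpers that are not shown. Total appears to be the tile-coded action value; a hedged reconstruction in terms of the tilecode, numTilings, and numTiles the snippet already assumes:

def Total(state, action, theta):
    # approximate action value: sum of the weights at the active tiles in this action's block
    indices = [-1] * numTilings
    tilecode(state[0], state[1], indices)
    return sum(theta[i + action * numTiles] for i in indices)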
Example #6
def learn():    
    runSum = 0.0
    for run in xrange(numRuns):
        theta = -0.01*rand(n)
        returnSum = 0.0
        for episodeNum in xrange(numEpisodes):
            step = 0
            G = 0        
            traces = zeros(n)
            S=mountaincar.init()
            # Until S is terminal:
            while S!=None:
                # Choose action
                tilecode(S,F)
                if rand() <= Emu:                 # randomly explore
                    a = randint(0, 2)
                else:                             # greedy action choice
                    a = argmax([QValue(F,0,theta),QValue(F,1,theta),QValue(F,2,theta)])
                # Replacing traces on indices where feature vector is 1
                for index in F:
                    traces[index+(a*numTiles)] = 1                     
                # Take action, observe r,Sp
                r,Sp=mountaincar.sample(S,a)
                G += r
                # If terminal action update theta and end episode
                if Sp == None:
                    delta = r - QValue(F,a,theta)
                    theta =  theta + alpha*delta*traces
                    break
                # Choose expected next action
                tilecode(Sp,Fp)
                ap = argmax([QValue(Fp,0,theta),QValue(Fp,1,theta),QValue(Fp,2,theta)])
                # Update theta
                randomAction = (Epi/3)*QValue(Fp,0,theta) + (Epi/3)*QValue(Fp,1,theta)+ (Epi/3)*QValue(Fp,2,theta)
                delta = r + randomAction + (1-Epi)*QValue(Fp,ap,theta) - QValue(F,a,theta)
                theta = theta + alpha*delta*traces
                # Decay every component
                traces = gamma*lmbda*traces
                S=Sp
                step += 1
            returnSum += G        
    
            print "Episode: ", episodeNum, "Steps:", step, "Return: ", G
            episodeReturn[episodeNum] += (G-episodeReturn[episodeNum])/(numRuns+1)
            episodeSteps[episodeNum] += (step-episodeSteps[episodeNum])/(numRuns+1)
        print "Average return:", returnSum/numEpisodes
        runSum += returnSum
    print "Overall performance: Average sum of return per run:", runSum/numRuns
    writeAverages(episodeReturn,episodeSteps)
def evaluate(numEvaluationEpisodes):
    returnSum = 0.0
    for episodeNum in range(numEvaluationEpisodes):
        G = 0
        S = mountaincar.init()
        R, S = mountaincar.sample(S, 1)
        G += R
        while (S):
            Q = Q1[S, :] + Q2[S, :]
            A = Q.argmax()
            R, S = mountaincar.sample(S, A)
            G += R

        returnSum = returnSum + G
    return returnSum / numEvaluationEpisodes
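A possible driver pairing the tabular learn at the top of this listing with the evaluate above; the hyperparameter values here are illustrative only:

if __name__ == "__main__":
    learn(alpha=0.1, eps=0.1, numTrainingEpisodes=200)
    print("Mean greedy return:", evaluate(numEvaluationEpisodes=50))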
Example #8
 def episode(self, discount=1.0, max_steps=1e3):
   """ Run n-step Q(sigma) for one episode """
   self._s = mountaincar.init()
   self._r_sum = 0.0
   self._time = 0 # step counter
   self._T = float('inf')
   self._tau = 0
   action = self.pick_action(self._s)
   self._tr = [(self._s, self._r_sum)] * self._n
   self._delta = [0.0] * self._n
   self._Qt = [self._Q[self._s, action]] * (self._n + 1)
   self._pi = [0.0] * self._n
   self._sigma = [0.0] * self._n
   while (self._tau != (self._T - 1)) and (self._time < max_steps):
     action = self.act(action, discount)
   self._sig *= self._beta
   return self._r_sum
Example #9
def learn(alpha=0.1 / numTilings, epsilon=0.0, numEpisodes=200):
    theta1 = -0.001 * rand(n)
    theta2 = -0.001 * rand(n)
    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        G = 0.0
        #your code goes here (20-30 lines, depending on modularity)
        state = mountaincar.init()
        step = 0

        while state != None:
            tileIndices = [-1] * numTilings
            tilecode(state[0], state[1], tileIndices)  # state[0]: position, state[1]: velocity
            tileIndices = np.array(tileIndices)
            q0 = Qs(theta1, tileIndices) + Qs(theta2, tileIndices)  # action 0
            q1 = Qs(theta1, tileIndices + numTiles) + Qs(theta2, tileIndices + numTiles)  # action 1
            q2 = Qs(theta1, tileIndices + numTiles * 2) + Qs(theta2, tileIndices + numTiles * 2)  # action 2
            Q = np.array([q0, q1, q2])

            # epsilon-greedy action selection on Q1 + Q2
            greedy = np.random.random()
            if greedy >= epsilon:
                action = Q.argmax()
            else:
                action = np.random.randint(0, 3)

            reward, nextS = mountaincar.sample(state, action)
            G = G + reward
            step += 1

            actionIndices = tileIndices + action * numTiles  # active features of (state, action)
            if nextS == None:  # next state is terminal: the bootstrap term is zero
                if np.random.randint(0, 2):
                    delta = reward - Qs(theta1, actionIndices)
                    for i in actionIndices:
                        theta1[i] += alpha * delta
                else:
                    delta = reward - Qs(theta2, actionIndices)
                    for i in actionIndices:
                        theta2[i] += alpha * delta
            else:
                nextTileIndices = [-1] * numTilings
                tilecode(nextS[0], nextS[1], nextTileIndices)
                nextTileIndices = np.array(nextTileIndices)
                if np.random.randint(0, 2):
                    # update theta1: greedy next action under theta1, evaluated with theta2
                    nextAction = np.argmax([Qs(theta1, nextTileIndices + a * numTiles) for a in range(3)])
                    delta = reward + Qs(theta2, nextTileIndices + nextAction * numTiles) - Qs(theta1, actionIndices)
                    for i in actionIndices:
                        theta1[i] += alpha * delta
                else:
                    # update theta2: greedy next action under theta2, evaluated with theta1
                    nextAction = np.argmax([Qs(theta2, nextTileIndices + a * numTiles) for a in range(3)])
                    delta = reward + Qs(theta1, nextTileIndices + nextAction * numTiles) - Qs(theta2, actionIndices)
                    for i in actionIndices:
                        theta2[i] += alpha * delta
            state = nextS

        print("Episode:", episodeNum, "Steps:", step, "Return: ", G)
        returnSum += G
    print("Average return:", returnSum / numEpisodes)
    return returnSum, theta1, theta2
Example #10
def learn(alpha=.1/numTilings, epsilon=0, numEpisodes=1000, numRuns=1):

    returnSum = 0.0
    avgEpisodeReturns = [0]*numEpisodes
    doubleQ = DoubleQ(alpha, epsilon)

    for run in range(numRuns):
        doubleQ.resetQ()
        for episodeNum in range(numEpisodes):
            print("Run: " + str(run) + ", Episode: " + str(episodeNum) + " ....")
            G = 0
            step = 0
            isTerminal = False
            #initialize the mountain car
            stateTuple = mountaincar.init()
            state = tilecode(stateTuple[0], stateTuple[1])

            while (not isTerminal):
                action = doubleQ.policy(state)
                reward, stateTuple = mountaincar.sample(stateTuple, action)
                G += reward
                step += 1
                if stateTuple:
                    nextState = tilecode(stateTuple[0], stateTuple[1])
                else:
                    nextState = None
                    
                doubleQ.learn(state, action, nextState, reward)           

                if not stateTuple:
                    isTerminal = True
                else:
                    state = nextState

            print("Run: ",  run+1, " Episode: ", episodeNum, " Steps:", step, " Return: ", G)
            returnSum = returnSum + G
            avgEpisodeReturns[episodeNum] = avgEpisodeReturns[episodeNum] +  (1/(run+1))*(G - avgEpisodeReturns[episodeNum])

    return avgEpisodeReturns, doubleQ.theta1, doubleQ.theta2
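Example #10 drives a DoubleQ object whose class is not shown. Below is a hypothetical skeleton consistent with how resetQ, policy, and learn are called above; the 9x9x4 tiling size is an assumption, and state is taken to be the list of active tile indices returned by tilecode:

import numpy as np

class DoubleQ:
    def __init__(self, alpha, epsilon, numTiles=9 * 9 * 4, numActions=3):
        self.alpha, self.epsilon = alpha, epsilon
        self.numTiles, self.numActions = numTiles, numActions
        self.resetQ()

    def resetQ(self):
        n = self.numTiles * self.numActions
        self.theta1 = -0.001 * np.random.rand(n)
        self.theta2 = -0.001 * np.random.rand(n)

    def _q(self, theta, state, action):
        # linear value: sum of the weights at the active tiles in this action's block
        return sum(theta[i + action * self.numTiles] for i in state)

    def policy(self, state):
        if np.random.random() < self.epsilon:
            return np.random.randint(self.numActions)
        q = [self._q(self.theta1, state, a) + self._q(self.theta2, state, a)
             for a in range(self.numActions)]
        return int(np.argmax(q))

    def learn(self, state, action, nextState, reward):
        # flip a fair coin to pick which estimator to update (double Q-learning)
        if np.random.randint(2):
            behave, evaluate = self.theta1, self.theta2
        else:
            behave, evaluate = self.theta2, self.theta1
        target = reward
        if nextState is not None:
            nextA = int(np.argmax([self._q(behave, nextState, a)
                                   for a in range(self.numActions)]))
            target += self._q(evaluate, nextState, nextA)
        delta = target - self._q(behave, state, action)
        for i in state:
            behave[i + action * self.numTiles] += self.alpha * delta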
def test_params(_lmbda, _alpha, _epsilon):
	global theta, e
	Epi = Emu = _epsilon
	alpha = _alpha
	lmbda = _lmbda
	runSum = 0.0
	for run in xrange(numRuns):
		e = np.zeros(numTilings*n*3)
		theta = -0.01*np.random.random_sample(numTilings*n*3)
		returnSum = 0.0
		for episodeNum in xrange(numEpisodes):
		    G = 0
		    S = mountaincar.init()
		    step = 0
		    while(S!=None):
		        step+=1
		        A = epsilon_greedy_policy(S)
		        R, S_next = mountaincar.sample(S,A)
		        G+=R
		        #since value of terminal state is 0 by definition
		        #computation for delta is simplified
		        if(S_next==None):
		            delta = R - q(S,A)
		        else:
		            delta = R+Epi*np.average([q(S_next,a) for a in [0,1,2]]) +\
		                (1-Epi)*np.max([q(S_next,a) for a in [0,1,2]]) - q(S,A)
		        e*=gamma*lmbda
		        tilecode(S[0], S[1], F)
		        for index in [i+A*numTilings*n for i in F]:
		            e[index] = 1
		        theta +=alpha*delta*e
		        S=S_next
		        if(step >10000): return -10000000000
		    returnSum = returnSum + G
		runSum += returnSum
	return runSum/numRuns
F = [-1] * numTilings
Q = [0] * 3
numActions = 3
returns = np.zeros([numRuns,numEpisodes])
stepList = np.zeros([numRuns,numEpisodes])
runList = np.zeros(numRuns)

runSum = 0.0
for run in xrange(numRuns):
    theta = -1*ones([numTiles,3]) #*rand(numTiles,3)
    returnSum = 0.0
    for episodeNum in xrange(numEpisodes):
        G = 0
        step = 0
        e = np.zeros([numTiles,3])
        (position, velocity) = mountaincar.init()
        while 1: 
            tilecode(position, velocity, F)
            Q = np.sum(theta[F],axis=0) 

            if np.random.random() > epsilon:
                A = np.argmax(Q)
            else:
                A = np.random.randint(numActions)
     
            R, result = mountaincar.sample((position, velocity), A)
            error = R - Q[A]
            eOld = copy.copy(e)
            e[F,A] = 1
            G += R
            if result == None:
Example #13
Epi = Emu = epsilon = 0
n = numTilings*numTiles*numActions
F = [-1]*numTilings
steps=np.zeros(numEpisodes)
returns=np.zeros(numEpisodes)

runSum = 0.0
for run in xrange(numRuns):
	theta = -0.01*rand(n)
	returnSum = 0.0
	for episodeNum in xrange(numEpisodes):
		G = 0	
#	your code goes here (20-30 lines, depending on modularity)
		step=0
		e=np.zeros(n)
		s=mc.init()
		Q=np.zeros(numActions)
		while s!=None:
			step=step+1
			tilecode(s[0],s[1],F)
			Q=np.zeros(numActions)
			for a in range(3):
				for _ in F:
					Q[a]=Q[a]+theta[_+a*324]
			a=np.argmax(Q)
			r, s1=mc.sample(s,a)
			G+=r
			delta=r-Q[a]
			for i in F:
				e[i+a*324]=1
			if s1==None:
Example #14
def learn(alpha=.1 / numTilings, epsilon=0, numEpisodes=200):
    theta1 = -0.001 * rand(n)
    theta2 = -0.001 * rand(n)
    returnSum = 0.0
    runEpisodeReturns = []

    for episodeNum in range(numEpisodes):
        G = 0
        step = 0
        currentState = mountaincar.init()
        terminate = False
        while not terminate:
            action = argmax([
                qHat(currentState, 0, theta1) + qHat(currentState, 0, theta2),
                qHat(currentState, 1, theta1) + qHat(currentState, 1, theta2),
                qHat(currentState, 2, theta1) + qHat(currentState, 2, theta2)
            ])
            R, nextState = mountaincar.sample(currentState, action)
            if (nextState is None):

                if randint(0, 2) == 0:  # 0.5 probability
                    phi = tilecode(currentState[0], currentState[1])
                    for i in range(numTilings):
                        theta1[phi[i] + (action * numTiles)] += alpha * (
                            R - qHat(currentState, action, theta1))

                else:  # 0.5 probability
                    phi = tilecode(currentState[0], currentState[1])
                    for i in range(numTilings):
                        theta2[phi[i] + (action * numTiles)] += alpha * (
                            R - qHat(currentState, action, theta2))
                terminate = True

            else:

                if randint(0, 2) == 0:  #0.5 probability
                    nextAction = argmax([
                        qHat(nextState, 0, theta1),
                        qHat(nextState, 1, theta1),
                        qHat(nextState, 2, theta1)
                    ])
                    phi = tilecode(currentState[0], currentState[1])
                    for i in range(numTilings):
                        theta1[phi[i] + (action * numTiles)] += alpha * (
                            R + qHat(nextState, nextAction, theta2) -
                            qHat(currentState, action, theta1))
                else:  #0.5 probability
                    nextAction = argmax([
                        qHat(nextState, 0, theta2),
                        qHat(nextState, 1, theta2),
                        qHat(nextState, 2, theta2)
                    ])
                    phi = tilecode(currentState[0], currentState[1])
                    for i in range(numTilings):
                        theta2[phi[i] + (action * numTiles)] += alpha * (
                            R + qHat(nextState, nextAction, theta1) -
                            qHat(currentState, action, theta2))
                currentState = nextState

            #print("Episode: ", episodeNum, "Return: ", G)
            G = G + R
            step += 1
        runEpisodeReturns.append(G)
        # print("Episode: ", episodeNum, "Steps:", step, "Return: ", G)
        returnSum = returnSum + G

    #print("Average return:", returnSum / numEpisodes)
    return returnSum, theta1, theta2, runEpisodeReturns
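qHat is used throughout Example #14 but never defined here. Its call pattern (tilecode returning the active tile indices, one block of numTiles weights per action) suggests a sketch like the following, offered as an assumption rather than the author's code:

def qHat(state, action, theta):
    # approximate action value: sum of the weights at the state's active tiles,
    # offset into the block belonging to this action
    phi = tilecode(state[0], state[1])
    return sum(theta[i + action * numTiles] for i in phi)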
Example #15
def learn():
    runSum = 0.0
    for run in xrange(numRuns):
        theta = -0.01 * rand(n)
        returnSum = 0.0
        for episodeNum in xrange(numEpisodes):
            step = 0
            G = 0
            traces = zeros(n)
            S = mountaincar.init()
            # Until S is terminal:
            while S != None:
                # Choose action
                tilecode(S, F)
                if rand() <= Emu:  # randomly explore
                    a = randint(0, 2)
                else:  # greedy action choice
                    a = argmax([
                        QValue(F, 0, theta),
                        QValue(F, 1, theta),
                        QValue(F, 2, theta)
                    ])
                # Replacing traces on indices where feature vector is 1
                for index in F:
                    traces[index + (a * numTiles)] = 1
                # Take action, observe r,Sp
                r, Sp = mountaincar.sample(S, a)
                G += r
                # If terminal action update theta and end episode
                if Sp == None:
                    delta = r - QValue(F, a, theta)
                    theta = theta + alpha * delta * traces
                    break
                # Choose expected next action
                tilecode(Sp, Fp)
                ap = argmax([
                    QValue(Fp, 0, theta),
                    QValue(Fp, 1, theta),
                    QValue(Fp, 2, theta)
                ])
                # Update theta
                randomAction = (Epi / 3) * QValue(
                    Fp, 0, theta) + (Epi / 3) * QValue(
                        Fp, 1, theta) + (Epi / 3) * QValue(Fp, 2, theta)
                delta = r + randomAction + (1 - Epi) * QValue(
                    Fp, ap, theta) - QValue(F, a, theta)
                theta = theta + alpha * delta * traces
                # Decay every component
                traces = gamma * lmbda * traces
                S = Sp
                step += 1
            returnSum += G

            print "Episode: ", episodeNum, "Steps:", step, "Return: ", G
            episodeReturn[episodeNum] += (G - episodeReturn[episodeNum]) / (
                numRuns + 1)
            episodeSteps[episodeNum] += (step - episodeSteps[episodeNum]) / (
                numRuns + 1)
        print "Average return:", returnSum / numEpisodes
        runSum += returnSum
    print "Overall performance: Average sum of return per run:", runSum / numRuns
    writeAverages(episodeReturn, episodeSteps)

# represent actions decelerate, coast, accelerate as integers
for run in range(numRuns):
    w = -0.01*np.random.rand(n)
    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        G = 0
        step = 0

        # From Figure 9.9 in Sutton RL 2014
        # n-component eligibility trace vector
        e = np.zeros(n)

        # initialize observation
        observation = mountaincar.init()

        # use function approximation to generate next state
        tilecode(observation[0], observation[1], observation[2], state)

        # compute the Q values for the state and every action
        Q = Qs(state)

        terminal = False
        A = chooseAction(Q)
        unknownObs = observation
        
        if flipped:
            R, observation, terminal = mountaincar.sample(observation, A, terminal, False)
            someRandomAmountOfTime = random.randint(minNumExtraSteps,maxNumExtraSteps)
            for i in range(1, someRandomAmountOfTime):
def trueOnlinePolicyGradient():
	# logging.basicConfig(filename='example.log',level=logging.DEBUG)
	for alpha_v in alpha_v_list:
		alpha_v = alpha_v * 1.0 / num_tilings
		for alpha_pi in alpha_pi_list:
			alpha_pi = alpha_pi * 1.0 / num_tilings
			print 'alpha_v: ', alpha_v, ' alpha_pi: ', alpha_pi

			avg_steps_overall = 0.0
			avg_steps_per_run = np.zeros((num_runs, ))
			avg_steps_per_episode = np.zeros((num_episodes, ))

			start_time = time.clock()
			for current_run in range(num_runs):
				logging.debug("Run #:" + str(current_run))
				# print 'Run #:', current_run
				theta = 0.00001 * np.random.randn(mem_size, num_actions)
				w = 0.00001 * np.random.randn(mem_size, )
				# w_old = np.zeros((mem_size, ))
				v_old = 0.0

				steps_per_episode = np.zeros((num_episodes, ))
				avg_steps = 0.0

				for current_episode in range(num_episodes):

					# if (current_episode+1) % 10 == 0:
					# 	plotWeights(theta, w, current_episode)

					G = 0.0
					step = 0

					z_theta = np.zeros((mem_size, num_actions))
					z_theta_old = np.zeros((mem_size, num_actions))
					z_w = np.zeros((mem_size, ))

					(pos, vel) = mountaincar.init()
					phi = np.zeros((mem_size, ))
					tiled_indices = tilecode(pos, vel)
					phi[tiled_indices] = 1
					current_state = (pos, vel)
					(a_star, PG_star) = sampleAction(theta, phi)

					a_prime = 0
					PG_prime = np.zeros((mem_size, num_actions))

					while current_state is not None and step < max_steps:
						reward, next_state = mountaincar.sample(current_state, a_star)

						G += (gamma * reward)
						step += 1

						v_current = np.dot(w.transpose(), phi)
						v_next = 0.0
						phi_prime = np.zeros((mem_size, ))
						if next_state is not None:
							tiled_indices = tilecode(next_state[0], next_state[1])
							phi_prime[tiled_indices] = 1
							v_next = np.dot(w.transpose(), phi_prime)
							(a_prime, PG_prime) = sampleAction(theta, phi_prime)
						delta = reward + (gamma * v_next) - v_current

						# z_w = (gamma * lmbda * z_w) + phi - (alpha_v * gamma * lmbda * np.dot(z_w.transpose(), phi) * phi)
						# w += (alpha_v * (delta + v_current - v_old) * z_w - alpha_v * (v_current - v_old) * phi)

						z_w = (gamma * lmbda * z_w) + phi
						w += (alpha_v * delta * z_w)

						# z_theta = (gamma * lmbda * z_theta) + PG_star
						# theta += ((alpha_pi * z_theta * delta) + ((alpha_pi * z_theta_old) * (v_current - v_old)))

						z_theta = (gamma * lmbda * z_theta) + PG_star
						theta += (alpha_pi * delta * z_theta)

						v_old = v_next
						z_theta_old = np.copy(z_theta)
						phi = np.copy(phi_prime)
						a_star = a_prime
						current_state = next_state
						PG_star = np.copy(PG_prime)

					# print '########### Episode: ', current_episode, ' Return: ', G, ' Steps: ', step, " Run: ", current_run
					steps_per_episode[current_episode] = step
					avg_steps += step
				avg_steps = avg_steps * 1.0 / num_episodes
				avg_steps_overall += avg_steps
				avg_steps_per_run[current_run] = avg_steps

				avg_factor = 1.0 / (current_run + 1)
				for episode_i in range(num_episodes):
					avg_steps_per_episode[episode_i] *= (1 - avg_factor)
					avg_steps_per_episode[episode_i] += (avg_factor * steps_per_episode[episode_i])

			end_time = time.clock()
			elapsed_time = (end_time - start_time) / 60.0
			print 'Elapsed time: ', elapsed_time
			# logging.debug('Elapsed time: ' + str(elapsed_time))
			avg_steps_overall = avg_steps_overall * 1.0 / num_runs
			std_error = 0.0
			for run_i in range(num_runs):
				avg_factor_run = 1.0 / (run_i + 1)
				std_error = ((1 - avg_factor_run) * std_error) + (avg_factor_run * (avg_steps_per_run[run_i] - avg_steps_overall) * (avg_steps_per_run[run_i] - avg_steps_overall))
			std_error = np.sqrt(std_error * 1.0 / num_runs)

			total_steps = avg_steps_overall * num_episodes * num_runs
			print 'Time per step: ', (elapsed_time * 1.0 / total_steps)
			print 'alpha_v: ', alpha_v, ' alpha_pi: ', alpha_pi, ' lmbda: ', lmbda
			print  'average reward: ', -1.0 * avg_steps_overall, ' std. error: ', std_error
			print 'Policy gradient'
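sampleAction above returns both an action and PG, the gradient of log pi(a|s) with respect to theta, which feeds the eligibility trace z_theta. A hedged sketch of a linear-softmax version matching those shapes (theta is mem_size x num_actions, phi a binary feature vector):

import numpy as np

def sampleAction(theta, phi):
    prefs = np.dot(theta.T, phi)               # one preference per action
    prefs -= prefs.max()                       # shift for numerical stability
    pi = np.exp(prefs) / np.sum(np.exp(prefs))
    a = int(np.random.choice(len(pi), p=pi))
    grad = -np.outer(phi, pi)                  # d/dtheta log pi(a|s) = phi * (1[b=a] - pi(b|s))
    grad[:, a] += phi
    return a, grad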
def learn(alpha=.1 / numTilings, epsilon=0, numEpisodes=200):
    theta1 = -0.001 * rand(n)
    theta2 = -0.001 * rand(n)
    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        G = 0
        S = mountaincar.init()
        step = 0
        while (S):
            indexList = [-1] * numTilings
            tilecode(S[0], S[1], indexList)
            indexList = np.array(indexList)
            q0 = qVal(theta1, indexList) + qVal(theta2, indexList)
            q1 = qVal(theta1, indexList + numTiles) + qVal(
                theta2, indexList + numTiles)
            q2 = qVal(theta1, indexList + 2 * numTiles) + qVal(
                theta2, indexList + 2 * numTiles)
            Q = np.array([q0, q1, q2])

            prob1 = np.random.random()
            if prob1 < epsilon:
                # explore
                A = np.random.choice([0, 1, 2])
            else:
                # greedy
                A = Q.argmax()

            R, S_prime = mountaincar.sample(S, A)
            G += R

            prob2 = np.random.choice([1, 2])
            if prob2 == 1:
                theta_n = theta1
                theta_prime = theta2
            else:
                theta_n = theta2
                theta_prime = theta1
            indexList = [x + A * numTiles for x in indexList]
            qval_theta_n = qVal(theta_n, indexList)

            if not S_prime:
                for index in indexList:
                    theta_n[index] = theta_n[index] + alpha * (R -
                                                               qval_theta_n)
                break

            indexList_prime = [-1] * numTilings
            tilecode(S_prime[0], S_prime[1], indexList_prime)
            indexList_prime = np.array(indexList_prime)

            q0_n = qVal(theta_n, indexList_prime)
            q1_n = qVal(theta_n, indexList_prime + numTiles)
            q2_n = qVal(theta_n, indexList_prime + 2 * numTiles)
            A_prime = np.array([q0_n, q1_n, q2_n]).argmax()
            q_prime_max = qVal(theta_prime,
                               A_prime * numTiles + indexList_prime)

            for index in indexList:
                theta_n[index] = theta_n[index] + alpha * (R + q_prime_max -
                                                           qval_theta_n)

            S = S_prime
            step += 1

        # print("Episode: ", episodeNum, "Steps:", step, "Return: ", G)
        returnSum = returnSum + G
    # print("Average return:", returnSum / numEpisodes)
    return returnSum, theta1, theta2
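qVal above is always called with tile indices that already include the action offset, so a one-line reconstruction suffices (an assumption, not the author's code):

import numpy as np

def qVal(theta, indices):
    # sum of the weights at the given (already action-offset) tile indices
    return float(np.sum(theta[np.asarray(indices)]))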
Example #19
def learn(alpha=.1 / numTilings, epsilon=0, numEpisodes=200):
    theta1 = -0.001 * rand(n)
    theta2 = -0.001 * rand(n)
    #Q=zeros(3)

    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        G = 0
        step = 0
        S = mountaincar.init()
        tileindec = tilecode(S[0], S[1], [-1] * numTilings)
        #        Q=Qs(tileindec,theta1)
        #        act=argmax(Q)

        #derivate=zeros(n)

        while S != None:
            step += 1
            #derivate=zeros(n)
            #tileindec=tilecode(S[0],S[1],[-1] * numTilings)
            #Q=Qs(tileindec,theta1)
            if random() < epsilon:
                act = randint(0, 3)
            else:
                act = argmax(Qs(tileindec, theta1 + theta2))

            R, Stemp = mountaincar.sample(S, act)

            G += R
            if Stemp == None:
                # terminal next state: the bootstrap term is zero
                pro = randint(0, 2)
                if pro == 1:
                    q1 = Qs(tileindec, theta1)
                    update = alpha * (R - q1[act])
                    for i in tileindec:
                        theta1[i + act * 324] += update
                else:
                    q2 = Qs(tileindec, theta2)
                    update = alpha * (R - q2[act])
                    for i in tileindec:
                        theta2[i + act * 324] += update
                break
            else:
                tileindec_tem = tilecode(Stemp[0], Stemp[1], [-1] * numTilings)
                pro = randint(0, 2)
                if pro == 1:
                    # update theta1: pick the next action greedily under theta1,
                    # evaluate it with theta2, and subtract the current estimate
                    q1_next = Qs(tileindec_tem, theta1)
                    q2_next = Qs(tileindec_tem, theta2)
                    q1_now = Qs(tileindec, theta1)
                    update = alpha * (R + q2_next[argmax(q1_next)] - q1_now[act])
                    for i in tileindec:
                        theta1[i + act * 324] += update
                else:
                    # update theta2: pick the next action greedily under theta2,
                    # evaluate it with theta1, and subtract the current estimate
                    q1_next = Qs(tileindec_tem, theta1)
                    q2_next = Qs(tileindec_tem, theta2)
                    q2_now = Qs(tileindec, theta2)
                    update = alpha * (R + q1_next[argmax(q2_next)] - q2_now[act])
                    for i in tileindec:
                        theta2[i + act * 324] += update
                S = Stemp
                tileindec = tileindec_tem


#            for i in tileindec:
#                derivate[i+act*324]=1
#
#            if Stemp==None:
#                #print(Stemp)
#                for i in range(n):
#                    theta1[i]=theta1[i]+alpha*(R-Q[act])*derivate[i]
#                break;
#            else:
#
#                tileindec_tem=tilecode(Stemp[0],Stemp[1],[-1] * numTilings)
#                Q_tem=Qs(tileindec_tem,theta1)
#                #print(Q_tem)
#                act_tem=argmax(Q_tem)
#
#                for i in range(n):
#                    theta1[i]=theta1[i]+alpha*(R+gamma*(Q_tem[act_tem])-Q[act])*derivate[i]
#                S=Stemp
#                #print(S)

#        ...
#        your code goes here (20-30 lines, depending on modularity)
#        ...
        print("Episode: ", episodeNum, "Steps:", step, "Return: ", G)
        returnSum = returnSum + G
    print("Average return:", returnSum / numEpisodes)
    return returnSum, theta1, theta2

def Qs(F, theta):
    # action values: sum the weights at the active tiles in each action's block of 324 entries
    Q = zeros(3)
    for a in range(3):
        for i in F:
            Q[a] += theta[i + a * 324]
    return Q


runSum = 0.0
for run in xrange(numRuns):
    theta = -0.01 * rand(n)
    returnSum = 0.0
    for episodeNum in xrange(numEpisodes):
        G = 0
        #your code goes here (20-30 lines, depending on modularity)
        step = 0.0
        s = mountaincar.init()
        trace = zeros(n)
        Q = zeros(3)
        while s is not None:
            step += 1
            tilecode(s[0], s[1], F)
            Q = Policy(F, 3, theta)
            if rand() <= epsilon:
                action = randint(0, 2)
            else:
                action = argmax(Q)
            r, sp = mountaincar.sample(s, action)
            delta = r - Q[action]
            G += r
            for i in F:
                trace[i + action * numTiles] = 1
Example #21
def Qs(F):
    # action values: sum the weights at the active tiles in each action's block of 324 entries
    Q = numpy.zeros(3)
    for a in range(3):
        for i in F:
            Q[a] += theta[i + a * 324]
    return Q


runSum = 0.0
for run in xrange(numRuns):
    theta = -0.01 * numpy.random.rand(n)
    returnSum = 0.0
    for episodeNum in xrange(numEpisodes):
        G = 0
        #your code goes here (20-30 lines, depending on modularity)
        steps = 0
        e = numpy.zeros(n)
        s = mc.init()
        Q = numpy.zeros(numActions)
        while s != None:
            #print Q
            steps += 1
            tilecode(s[0], s[1], F)
            Q = Qs(F)
            a = numpy.argmax(Q)
            r, s1 = mc.sample(s, a)
            G += r
            delta = r - Q[a]
            for i in F:
                e[i + a * 324] = 1
            if s1 == None:
                for i in range(n):
                    theta[i] += alpha * delta * e[i]
Example #22
    return q
   
    
returnAvg = zeros(200)
numSteps = zeros(200)    
    
runSum = []
for run in range(numRuns):
    w = -0.01*rand(n)
    returnSum = 0.0
    
    for episodeNum in range(numEpisodes):
        zerovec = zeros(n)
        G = 0
        A = 0
        S = mountaincar.init()
        F = actionTileCode(F,S,A)
        zerovec[F] = 1
        episodeLen = 0
        while(S is not None):
            episodeLen = episodeLen + 1
            RSA = mountaincar.sample(S,A)
            R = RSA[0]
            S = RSA[1]
            G = G + R
            delta = R - sum(w[F])
            q = zeros(3)
            
            if(S is not None):
                for a in range(3):
                    F = actionTileCode(F,S,a)
returns = np.zeros(shape=(numRuns, numEpisodes))   # record the return of every episode of every run


# Main loop over runs and episodes
for run in range(numRuns):
    
    #Initializing the weight vec
    w = -0.01*rand(n)
    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        G = 0
        "..."
        "your code goes here (20-30 lines, depending on modularity)"
        S = mountaincar.init()                         #Initialize state
        e = np.zeros(n)                                #Initialize eligibility vector
        steps = 0

        while (True):
            Q = [0, 0, 0]                              # action values for the current state
            A = 0
            tilecode(S[0], S[1], F)                    # get the tile indices for (position, velocity)
            for j in range(3):
                for i in F:
                    Q[j] = Q[j] + w[i + (j*9*9*4)]     # each action's weights occupy a block of 9*9*4 entries

            if (random.uniform(0,1) < epsilon):        # epsilon-greedy action selection
                A = random.choice(actions)
            else:
                A = Q.index(max(Q))
#numRuns = 50
#numEpisodes = 200
#averageArray = [(0,0)]*numEpisodes ## tuple ordered (return, steps)
## =======================

for run in xrange(numRuns):
    theta = -0.01*rand(n) 
    returnSum = 0.0
    #stepSum = 0
    print "Run: ", run
    for episodeNum in xrange(numEpisodes):
        eTrace = [0]*n
        G = 0
        delta = 0

        state = mountaincar.init()
        step = 0
        while state != None:
            step += 1

            tiles = tilecode(state[0], state[1],[-1]*numTilings)
            explore = (random.random() < epsilon)

            if explore:
                action = random.randint(0,2)
            else:
                action = getBestAction(tiles, theta)
            reward, newState = mountaincar.sample(state, action)
            G += reward
Example #25
def learn(alpha=.1 / numTilings, epsilon=0, numEpisodes=200):
    theta1 = -0.001 * rand(n)
    theta2 = -0.001 * rand(n)
    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        G = 0.0
        S = mountaincar.init()  # S[0] is the position, S[1] is the velocity
        #start = True
        step = 0
        while True:
            #print('$'*80)
            #print('new S: ', S)
            q1 = [0] * 3  # one action value per action, under theta1
            q2 = [0] * 3  # and under theta2
            phi = [0] * n  # full feature vector phi; in practice only the active tile indices are needed
            tileIndices = [-1] * numTilings
            tilecode(S[0], S[1], tileIndices)
            #print('tileIndices: ', tileIndices)

            # compute q1 and q2 for every action: theta.T * phi, where phi is 1 at the active tiles
            for possibleAction in range(0, 3):
                for index in tileIndices:
                    q1[possibleAction] = q1[possibleAction] + theta1[
                        possibleAction * numTiles + index]
                    q2[possibleAction] = q2[possibleAction] + theta2[
                        possibleAction * numTiles + index]

            # choose the action epsilon-greedily on Q1 + Q2
            num = np.random.random()
            if (num >= epsilon):
                action = argmax([a + b for a, b in zip(q1, q2)])  # greedy action
            else:
                action = np.random.randint(0, 3)  # exploratory action

            # actually generate the features, based on the action
            indices = [action * numTiles + index for index in tileIndices
                       ]  # indicates which position in phi is 1

            # sample the next S, reward
            reward, nextS = mountaincar.sample(S, action)
            #print('nextS:', nextS)
            #print('reward: ',reward)
            G = G + reward
            step += 1
            #print('G:', G)

            if nextS == None:
                # terminal S
                if np.random.randint(0, 2):
                    for i in indices:
                        theta1[i] = theta1[i] + alpha * (reward - q1[action])
                        #G = G+reward
                        #step+=1
                else:
                    for i in indices:
                        theta2[i] = theta2[i] + alpha * (reward - q2[action])
                        #G = G+reward
                        #step+=1
                break
            else:
                # not terminal S
                # need to compute phi for the next S
                nextQ1 = [0] * 3
                nextQ2 = [0] * 3
                #nextPhi = [0]*n
                nextTileIndices = [-1] * numTilings
                tilecode(nextS[0], nextS[1], nextTileIndices)
                #print('nextTileIndices: ', nextTileIndices)

                nextQ1 = Qs(nextTileIndices, theta1)
                nextQ2 = Qs(nextTileIndices, theta2)

                if np.random.randint(0, 2):  # with 0.5 probability
                    nextAction = argmax(nextQ1)
                    for i in indices:
                        theta1[i] = theta1[i] + alpha * (
                            reward + nextQ2[nextAction] - q1[action])

                else:  # with 0.5 probability
                    nextAction = argmax(nextQ2)
                    for i in indices:
                        theta2[i] = theta2[i] + alpha * (
                            reward + nextQ1[nextAction] - q2[action])
                    #print(theta2)

            S = nextS
        steps[episodeNum] = steps[episodeNum] + step
        returns[episodeNum] = returns[episodeNum] + G
        #print("Episode:", episodeNum, "Steps:", step, "Return: ", G)
        returnSum += G
    #print("Average return:", returnSum / numEpisodes)
    return returnSum, theta1, theta2
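Example #25 reads next-state action values through a Qs(tileIndices, theta) helper that is not shown; the inline loop earlier in the same function suggests this reconstruction (again an assumption, with numTiles taken from the snippet's module):

def Qs(tileIndices, theta):
    # one approximate value per action: sum the weights at the active tiles
    # inside that action's block of numTiles entries
    return [sum(theta[a * numTiles + i] for i in tileIndices) for a in range(3)]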