import util as ut

# Script: build a rectangular-grid MDP, run the two-player value iteration on
# it, then start assembling a cvxpy constraint set to cross-check the result.
#
# NOTE(review): `np`, `pI` and `cvx` are used below but are not imported in the
# visible part of this script -- confirm they are imported elsewhere
# (presumably numpy, a policy/value-iteration module, and cvxpy).

# need to generate MDP
# P : S x SA, column stochastic
N = 3  # columns
M = 5  # rows
S = N * M
A = 4
p = 0.7

# Partition the states: every third state (0, 3, 6, ...) goes to SMin, the
# remaining states go to SMax.
SMin = [i for i in range(S) if i % 3 == 0]
SMax = [i for i in range(S) if i % 3 != 0]

P = ut.rectangleMDP(M, N, p)
C = np.random.rand(N * M, A) * 100.

#-------------------------- value iteration -----------------------------------#
avgCost = pI.game_VI(P, C, SMin, SMax)

#------------ checking with cvx -----------------------------------#
y = cvx.Variable((S, A))
constraints = []
ones = np.ones(A)
for i in range(S):
    # Right-hand side: sum over all (state, action) pairs of the probability of
    # moving into state i, weighted by y[s, a].
    inflow = sum(
        [sum([P[i, s * A + a] * y[s, a] for s in range(S)]) for a in range(A)]
    )
    # NOTE(review): `ones * y[i, :]` is kept exactly as written; whether `*`
    # means a matrix product here depends on the cvxpy version -- confirm.
    constraints.append(ones * (y[i, :]) == inflow)
# Non-negativity of y, added once for the whole variable.
constraints.append(y >= 0)
"""
Created on Sat Nov 2 09:48:50 2019

@author: sarahli
"""
import util as ut
import numpy as np
import matplotlib.pyplot as plt
import dynamicProg as dp
import cvxpy as cvx

# Start from a clean slate: close any figures left over from a previous run.
plt.close('all')

# Grid dimensions, number of states and actions, and the discount factor.
N = 3
M = 3
S = N * M
A = 4
gamma = 0.5
P = ut.rectangleMDP(N, M, 0.7)

# Cost model:
#   player x: C = C1 + C2.dot(y)
#   player y: C = C1 + C2.dot(x)
C1 = np.random.rand(S, A)
C2 = 0.3 * np.random.rand(S, A)

# Horizon length and number of sampled runs.
T = 100
Samples = 10
timeLine = np.arange(0, T)

# Per-state value trajectories, indexed as (state, time step, sample).
Vx = np.zeros((S, T, Samples))
Vy_varyingGamma = np.zeros((S, T, Samples))
import matplotlib.pyplot as plt
import numpy as np
import dynamicProg as dP

# NOTE(review): `ut` is used below but is not imported in the visible part of
# this script -- confirm `import util as ut` appears earlier in the file.

plt.close('all')

N = 2
M = 2
S = N * M
A = 4
gamma = 0.4
alpha = 1.0  # step size of algorithm
eps = 0.2  # for the epsilon greedy algorithm

# State labels 0.0, 1.0, ..., S-1 (floats), used as the sampling population.
stateVec = np.linspace(0, S, S, endpoint=False)
P = ut.rectangleMDP(M, N, 0.7)
C = np.random.rand(S, A)
#print (C);
# generate random list

# SARSA implementation
T = 100000
Q = np.zeros((S, A, T))  # one (S, A) table slice per time step
s = np.random.randint(0, S)  # random initial state
curA = np.random.randint(0, A)  # random initial action
for t in range(T - 1):
    # Step size shrinks as 1 / (t + 1), overriding the initial alpha above.
    alpha = 1. / (t + 1)
    # transition: column of P for the current (state, action) pair is used as
    # the distribution over next states.
    transition = P[:, s * A + curA]
    nextS = int(np.random.choice(stateVec, 1, p=transition)[0])
# -*- coding: utf-8 -*-
"""
Created on Sat Jan 4 17:17:25 2020

@author: craba
"""
import util as ut
import numpy as np
import dynamicProg as dp

# Build a row x col grid MDP with a random cost per (state, action) pair, then
# print the value functions returned by value iteration and policy iteration.
row = 5
col = 3
A = 4
P = ut.rectangleMDP(row, col, p=0.6)
C = np.random.rand(row * col, A)
gamma = 0.7

print("----------------Value iteration ---------------")
v_VI = dp.discounted_valueIteration(P, C, True, gamma)
print("value function = ", v_VI)

print("----------------Policy iteration ---------------")
pi_PI, v_PI = dp.policyIteration(P, C, gamma)
print("value function = ", v_PI)