def opt(e_init, e_target, hrv_hist, hrv_en):
    """
    Build and solve the CPLEX model for one day of epochs and return it.

    For every epoch after the first, the battery level and its histogram are
    advanced with next_battery_level(), the new battery level is constrained
    to be non-negative and the summed time(b[i, :]) is constrained to be at
    most D. The objective maximized is objective_function() applied to the
    last epoch's battery level and histogram.

    Relies on module-level names: epochs_per_day, nodes, mod_levels, bin_num,
    battery_cap, D, energy, time, next_battery_level, objective_function.

    Args:
      e_init: initial battery level; stored as en_rv[0]
        (assumed to be an array of shape (nodes, bin_num) -- TODO confirm)
      e_target: not used by this function
      hrv_hist: per-epoch array passed to next_battery_level as hrv_hist[i, :, :]
      hrv_en: per-epoch array; energy(b[i, :]) is subtracted from hrv_en[i, :, :]

    Returns:
      the solved CPlexModel
    """
    # verbosity is how much log is reported back from CPlex. 3 is the most verbose
    verbosity = 3
    m = CPlexModel(verbosity)

    b = m.new((epochs_per_day, nodes, mod_levels), vtype=int, lb=0, ub=1, name='b')
    # NOTE(review): l is registered in the model but never referenced below
    l = m.new((epochs_per_day, nodes, bin_num), vtype=float, lb=-1, ub=battery_cap, name='l')

    # the numpy function is linspace; the former "linespace" raised AttributeError
    # NOTE(review): fixed_prob is not used further down -- confirm it is still needed
    fixed_prob = np.linspace(0, 1, num=bin_num, endpoint=True, dtype=float)

    # initial histogram: all the mass in the first bin
    e_init_hist = np.zeros(e_init.shape, dtype=float)
    e_init_hist[:, 0] = 1

    hist_rv = np.zeros((epochs_per_day, nodes, bin_num), dtype=float)
    hist_rv[0] = e_init_hist

    # prepare the energy vector here
    en_rv = np.zeros((epochs_per_day, nodes, bin_num), dtype=float)
    en_rv[0] = e_init

    for i in range(1, epochs_per_day):
        en_rv[i], hist_rv[i] = next_battery_level(
            en_rv[i - 1],
            hist_rv[i - 1],
            hrv_en[i, :, :] - (np.vectorize(energy))(b[i, :]),
            hrv_hist[i, :, :])

        m.constrain(en_rv[i] >= 0)
        m.constrain(sum(np.vectorize(time)(b[i, :])) <= D)

    m.maximize(objective_function(en_rv[-1], hist_rv[-1]))

    return m
def decomposePiLP(S, A, T, s0, terminal, rawX, x, gamma=1):
    r"""
    DEPRECATED.

    Decompose the occupancy x as sigma * rawX + y and maximize sigma.

    Per the original note, this tries to decouple a policy into the optimal
    policy (following no constraints) and another policy \pi'. \pi' may be a
    dominating policy. Described in Eq. 2 on Aug.29, 2017.

    Args:
      S, A: sequences of states and actions
      T: transition function, called as T(s, a, s')
      s0: initial state
      terminal: predicate called as terminal(s)
      rawX, x: mappings indexed by (state, action)
      gamma: discount factor

    Returns:
      (value returned by maximize(sigma), {(s, a): solved value of y})
    """
    model = CPlexModel()
    if not config.VERBOSE:
        model.setVerbosity(0)

    # index ranges over states and actions
    numStates = len(S)
    numActions = len(A)
    stateIdx = range(numStates)
    actionIdx = range(numActions)

    y = model.new((numStates, numActions), lb=0, name='y')
    sigma = model.new(lb=0, ub=1, name='sigma')

    # x and rawX are keyed by S x A elements, y by numeric indices
    for s in stateIdx:
        for a in actionIdx:
            model.constrain(sigma * rawX[S[s], A[a]] + y[s, a] == x[S[s], A[a]])

    # y has to satisfy flow conservation with (1 - sigma) mass at s0,
    # with no flow coming back from the terminal states
    for sp in stateIdx:
        if terminal(S[sp]):
            # sp is terminal, so the (not terminal(S[sp])) factor is false here
            flow = sum(
                y[s, a] * ((s == sp)
                           - gamma * T(S[s], A[a], S[sp]) * (not terminal(S[sp])))
                for s in stateIdx for a in actionIdx)
        else:
            # transitions leaving terminal states are masked out
            flow = sum(
                y[s, a] * ((s == sp)
                           - gamma * T(S[s], A[a], S[sp]) * (not terminal(S[s])))
                for s in stateIdx for a in actionIdx)
        model.constrain(flow == (1 - sigma) * (S[sp] == s0))

    objValue = model.maximize(sigma)

    # sigma together with the solved y, re-keyed by (state, action)
    yValues = model[y]
    residual = {}
    for s in stateIdx:
        for a in actionIdx:
            residual[S[s], A[a]] = yValues[s, a]
    return objValue, residual
def decomposePiLP(S, A, T, s0, terminal, rawX, x, gamma=1):
    r"""
    Decompose the occupancy x as sigma * rawX + y and maximize sigma.

    Per the original note, this tries to decouple a policy into the optimal
    policy (following no constraints) and another policy \pi'. \pi' may be a
    dominating policy. Described in Eq. 2 on Aug.29, 2017.

    Args:
      S, A: sequences of states and actions
      T: transition function, called as T(s, a, s')
      s0: initial state
      terminal: predicate called as terminal(s)
      rawX, x: mappings indexed by (state, action)
      gamma: discount factor

    Returns:
      (value returned by maximize(sigma), {(s, a): solved value of y})
    """
    lp = CPlexModel()
    if not config.VERBOSE:
        lp.setVerbosity(0)

    # index ranges over states and actions
    nS = len(S)
    nA = len(A)
    sRange = range(nS)
    aRange = range(nA)

    y = lp.new((nS, nA), lb=0, name='y')
    sigma = lp.new(lb=0, ub=1, name='sigma')

    # x and rawX are keyed by S x A elements, y by numeric indices
    for s in sRange:
        for a in aRange:
            lp.constrain(sigma * rawX[S[s], A[a]] + y[s, a] == x[S[s], A[a]])

    # y has to satisfy flow conservation with (1 - sigma) mass at s0,
    # with no flow coming back from the terminal states
    for sp in sRange:
        if terminal(S[sp]):
            # sp is terminal, so the (not terminal(S[sp])) factor is false here
            netFlow = sum(
                y[s, a] * ((s == sp)
                           - gamma * T(S[s], A[a], S[sp]) * (not terminal(S[sp])))
                for s in sRange for a in aRange)
        else:
            # transitions leaving terminal states are masked out
            netFlow = sum(
                y[s, a] * ((s == sp)
                           - gamma * T(S[s], A[a], S[sp]) * (not terminal(S[s])))
                for s in sRange for a in aRange)
        lp.constrain(netFlow == (1 - sigma) * (S[sp] == s0))

    sigmaValue = lp.maximize(sigma)

    # sigma together with the solved y, re-keyed by (state, action)
    solvedY = lp[y]
    remainder = {}
    for s in sRange:
        for a in aRange:
            remainder[S[s], A[a]] = solvedY[s, a]
    return sigmaValue, remainder
def lpDualCPLEX(mdp, zeroConstraints=(), positiveConstraints=(), positiveConstraintsOcc=1):
    r"""
    DEPRECATED since we moved to gurobi. but leave the function here for sanity check

    Solve the dual problem of lp, maybe with some constraints.

    Note that this is a lower level function that does not consider feature
    extraction. r should be a reward function, not a reward parameter.

    Args:
      mdp: object providing S, A, T, r, gamma and alpha; T is called as
        T(s, a, s'), r as r(s, a) and alpha as alpha(s)
      zeroConstraints: (s, a) pairs whose summed occupancy is forced to 0
      positiveConstraints: (s, a) pairs whose summed occupancy must be at
        least positiveConstraintsOcc
      positiveConstraintsOcc: lower bound used with positiveConstraints

    Returns:
      {'feasible': False} when the solver raises CPlexException, otherwise
      {'feasible': True, 'obj': objective value, 'pi': {(s, a): occupancy}}
    """
    S = mdp.S
    A = mdp.A
    T = mdp.T
    r = mdp.r
    gamma = mdp.gamma
    alpha = mdp.alpha

    m = CPlexModel()
    if not config.VERBOSE:
        m.setVerbosity(0)

    # useful constants
    Sr = range(len(S))
    Ar = range(len(A))

    x = m.new((len(S), len(A)), lb=0, name='x')

    # make sure x is a valid occupancy
    for sp in Sr:
        # sum_{s, a} x(s, a) (1[s = s'] - \gamma * T(s, a, s')) = \alpha(s')
        m.constrain(
            sum(x[s, a] * ((s == sp) - gamma * T(S[s], A[a], S[sp]))
                for s in Sr for a in Ar) == alpha(S[sp]))

    # == constraints
    if len(zeroConstraints) > 0:
        m.constrain(
            sum(x[S.index(s), A.index(a)] for s, a in zeroConstraints) == 0)

    # >= constraints
    if len(positiveConstraints) > 0:
        m.constrain(
            sum(x[S.index(s), A.index(a)] for s, a in positiveConstraints)
            >= positiveConstraintsOcc)

    # obj
    try:
        obj = m.maximize(sum([x[s, a] * r(S[s], A[a]) for s in Sr for a in Ar]))
    except CPlexException as err:
        # single pre-formatted argument: prints the same text on Python 2 and 3
        print('Exception %s' % (err,))
        # only the feasibility flag is returned; callers must check it
        # before reading 'obj' or 'pi'
        return {'feasible': False}

    # read the solution once instead of once per (s, a) entry
    xValues = m[x]
    return {
        'feasible': True,
        'obj': obj,
        'pi': {(S[s], A[a]): xValues[s, a] for s in Sr for a in Ar}
    }
def lwaLP(graph_object):
    """
    Solve the coverage LP for graph_object and derive the strategy sets.

    A coverage variable in [0, 1] is created per vertex, the total coverage is
    bounded by graph_object.R, and U is maximized subject to
    U <= -diag(capabilities) * (1 - cv).

    Args:
      graph_object: object providing capabilities, n (number of vertices)
        and R (bound on the summed coverage)

    Returns:
      (start, attacker_strategy) where start is the result of
      findStrategySet(coverage, R) and attacker_strategy is the list of
      vertex indices whose solved coverage is strictly positive
    """
    capabilities = graph_object.capabilities
    no_vertices = graph_object.n

    m = CPlexModel()

    cv = m.new(no_vertices, vtype=float, ub=1, lb=0)
    U = m.new(vtype=float)

    diag = np.diag(capabilities)
    # NOTE(review): relies on pycpx treating diag * (1 - cv) as a
    # matrix-vector product -- confirm
    m.constrain(U <= -diag * (1 - cv))
    m.constrain(sum(cv) <= graph_object.R)

    m.maximize(U)

    # read the solution once; the old loop passed this array to xrange(),
    # which needs an integer count, so it failed before collecting anything
    coverage = m[cv]
    attacker_strategy = [i for i in range(no_vertices) if coverage[i] > 0]

    start = findStrategySet(coverage, graph_object.R)

    return start, attacker_strategy
def domPiMilp(S, A, r, T, s0, terminal, domPis, consIdx, gamma=1):
    """
    Finding dominating policies by representing constraints as possible
    negative rewards. Described in the report on aug.19, 2017.

    Args:
      S, A: sequences of states and actions; states are indexable by the
        entries of consIdx
      r: reward function, called as r(s, a)
      T: transition function, called as T(s, a, s')
      s0: initial state
      terminal: predicate called as terminal(s)
      domPis: occupancy measures indexed by (state, action)
      consIdx: indices of the state features compared against s0
      gamma: discount factor

    Returns:
      (value returned by maximize(t), {(s, a): solved value of x})
    """
    rmax = 10000
    M = 0.001
    numCons = len(consIdx)

    model = CPlexModel()
    if not config.VERBOSE:
        model.setVerbosity(0)

    numStates = len(S)
    numActions = len(A)
    stateIdx = range(numStates)
    actionIdx = range(numActions)

    # decision variables
    x = model.new((numStates, numActions), lb=0, name='x')
    z = model.new(numCons, vtype=bool, name='z')
    t = model.new(name='t')

    # flow conservation, with no flow coming back from the terminal states
    for sp in stateIdx:
        if terminal(S[sp]):
            # sp is terminal, so the (not terminal(S[sp])) factor is false here
            flow = sum(
                x[s, a] * ((s == sp)
                           - gamma * T(S[s], A[a], S[sp]) * (not terminal(S[sp])))
                for s in stateIdx for a in actionIdx)
        else:
            flow = sum(
                x[s, a] * ((s == sp)
                           - gamma * T(S[s], A[a], S[sp]) * (not terminal(S[s])))
                for s in stateIdx for a in actionIdx)
        model.constrain(flow == (S[sp] == s0))

    # t is the lower bound of the difference between x and each dominating policy
    # note: expressions are kept inline; the original author doubted that
    # constraint expressions can call other functions
    for domPi in domPis:
        # domPi is indexed by elements in S x A, not numbered indices
        xValue = sum(x[s, a] * r(S[s], A[a])
                     for s in stateIdx for a in actionIdx)
        domValue = sum(
            domPi[S[s], A[a]]
            * (r(S[s], A[a])
               + sum(- rmax * (S[s][consIdx[i]] != s0[consIdx[i]]) * z[i]
                     for i in range(numCons)))
            for s in stateIdx for a in actionIdx)
        model.constrain(xValue - domValue >= t)

    # z[i] = 1 forces zero occupancy wherever feature consIdx[i] differs from s0
    for s in stateIdx:
        for i in range(numCons):
            if S[s][consIdx[i]] == s0[consIdx[i]]:
                continue
            for a in actionIdx:
                model.constrain(z[i] + M * x[s, a] <= 1)

    # obj
    objValue = model.maximize(t)

    print(model[z])

    xValues = model[x]
    occupancy = {}
    for s in stateIdx:
        for a in actionIdx:
            occupancy[S[s], A[a]] = xValues[s, a]
    return objValue, occupancy
def milp(S, A, R, T, s0, psi, maxV):
    r"""
    Solve the MILP problem in greedy construction of policy query

    Args:
      S: state set
      A: action set
      R: reward candidate set; each R[i] is called as R[i](s, a)
      T: transition function, called as T(s, a, s')
      s0: init state
      psi: prior belief on rewards
      maxV: maxV[i] = max_{\pi \in q} V_{r_i}^\pi

    Returns:
      util.Counter mapping (s, a) to the solved occupancy x[s, a]
    """
    m = CPlexModel()
    if not config.VERBOSE:
        m.setVerbosity(0)

    # useful constants
    rLen = len(R)
    M = 10000  # a large number
    Sr = range(len(S))
    Ar = range(len(A))

    # decision variables
    # FIXME i removed upper bound of x. it shoundn't have such bound without transient-state assumption, right?
    x = m.new((len(S), len(A)), lb=0, name='x')
    z = m.new(rLen, vtype=bool, name='z')
    y = m.new(rLen, name='y')

    # constraints on y:
    # y[i] <= V_{r_i}^x - maxV[i] when z[i] = 1 (big-M relaxed otherwise),
    # and y[i] <= 0 when z[i] = 0
    m.constrain([y[i] <= sum([x[s, a] * R[i](S[s], A[a]) for s in Sr for a in Ar])
                 - maxV[i] + (1 - z[i]) * M
                 for i in range(rLen)])
    m.constrain([y[i] <= z[i] * M for i in range(rLen)])

    # constraints on x (valid occupancy)
    for sp in Sr:
        if S[sp] == s0:
            m.constrain(sum([x[sp, ap] for ap in Ar]) == 1)
        else:
            m.constrain(sum([x[sp, ap] for ap in Ar])
                        == sum([x[s, a] * T(S[s], A[a], S[sp]) for s in Sr for a in Ar]))

    # obj
    obj = m.maximize(sum([psi[i] * y[i] for i in range(rLen)]))

    # read the solution once instead of once per (s, a) entry
    xValues = m[x]

    if config.VERBOSE:
        # single pre-formatted argument: prints the same text on Python 2 and 3
        print('obj %s' % (obj,))
        print('x %s' % (xValues,))
        print('y %s' % (m[y],))
        print('z %s' % (m[z],))

    # build occupancy as S x A -> x[.,.]
    # z[i] == 1 then this policy is better than maxV on the i-th reward candidate
    res = util.Counter()
    for s in Sr:
        for a in Ar:
            res[S[s], A[a]] = xValues[s, a]
    return res
def findUndominatedReward(mdpH, mdpR, newPi, humanPi, localDifferentPis, domPis):
    r"""
    Implementation of the linear programming problem (Eq.2) in report 12.5

    newPi is \hat{\pi} in the linear programming problem in the report.
    The robot tries to see if there exists a reward function where newPi is
    better than the best policy in domPis.

    Args:
      mdpH: human mdp; provides the state list S and the human action list A
      mdpR: robot mdp; provides the robot action list A
      newPi: occupancy indexed by (state, robot action)
      humanPi: occupancy indexed by (state, human action)
      localDifferentPis: indexed by (state, human action); each entry is an
        occupancy indexed by (state, human action)
      domPis: occupancies indexed by (state, robot action)

    Returns:
      (obj, rFunc): the objective value and a reward function rFunc(s, a)
      (which is only useful when the obj value is > 0)
    """
    m = CPlexModel()
    if not config.VERBOSE:
        m.setVerbosity(0)

    S = mdpH.S
    robotA = mdpR.A
    humanA = mdpH.A

    # index of states and actions
    Sr = range(len(S))
    robotAr = range(len(robotA))
    humanAr = range(len(humanA))

    r = m.new(len(S), lb=0, ub=1, name='r')
    # when the optimal value is attained, z = \max_{domPi \in domPis} V^{domPi}_r
    z = m.new(name='z')

    for domPi in domPis:
        m.constrain(z >= sum([domPi[S[s], robotA[a]] * r[s]
                              for s in Sr for a in robotAr]))

    # make sure r is consistent with humanPi
    for s in S:
        for a in humanA:
            # humanPi is better than a locally different policy which takes action a in state s
            m.constrain(
                sum(sum((humanPi[S[sp], humanA[ap]]
                         - localDifferentPis[s, a][S[sp], humanA[ap]])
                        for ap in humanAr) * r[sp]
                    for sp in Sr) >= 0)

    # maxi_r { V^{newPi}_r - \max_{domPi \in domPis} V^{domPi}_r }
    cplexObj = m.maximize(
        sum(newPi[S[s], robotA[a]] * r[s] for s in Sr for a in robotAr) - z)

    # read the solution once; it is reused by obj, rFunc and the report below
    rValues = m[r]
    zValue = m[z]

    obj = sum([newPi[S[s], robotA[a]] * rValues[s]
               for s in Sr for a in robotAr]) - zValue

    # the reward function has the same values for same states, but need to
    # convert back to the S x A space
    def rFunc(s, a):
        # look the state up in S: Sr only holds indices, so Sr.index(s)
        # failed for every state that is not its own index
        return rValues[S.index(s)]

    # single pre-formatted argument: prints the same text on Python 2 and 3
    print('cplexobj %s' % (cplexObj,))
    print('obj %s' % (obj,))
    print('newPi')
    printPi(newPi)
    print('z %s r %s' % (zValue, rValues))

    return obj, rFunc
def findUndominatedReward(mdpH, mdpR, newPi, humanPi, localDifferentPis, domPis):
    r"""
    Implementation of the linear programming problem (Eq.2) in report 12.5

    newPi is \hat{\pi} in the linear programming problem in the report.
    The robot tries to see if there exists a reward function where newPi is
    better than the best policy in domPis.

    Args:
      mdpH: human mdp; provides the state list S and the human action list A
      mdpR: robot mdp; provides the robot action list A
      newPi: occupancy indexed by (state, robot action)
      humanPi: occupancy indexed by (state, human action)
      localDifferentPis: indexed by (state, human action); each entry is an
        occupancy indexed by (state, human action)
      domPis: occupancies indexed by (state, robot action)

    Returns:
      (obj, rFunc): the objective value and a reward function rFunc(s, a)
      (which is only useful when the obj value is > 0)
    """
    m = CPlexModel()
    if not config.VERBOSE:
        m.setVerbosity(0)

    S = mdpH.S
    robotA = mdpR.A
    humanA = mdpH.A

    # index of states and actions
    Sr = range(len(S))
    robotAr = range(len(robotA))
    humanAr = range(len(humanA))

    r = m.new(len(S), lb=0, ub=1, name='r')
    # when the optimal value is attained, z = \max_{domPi \in domPis} V^{domPi}_r
    z = m.new(name='z')

    for domPi in domPis:
        m.constrain(z >= sum([domPi[S[s], robotA[a]] * r[s]
                              for s in Sr for a in robotAr]))

    # make sure r is consistent with humanPi
    for s in S:
        for a in humanA:
            # humanPi is better than a locally different policy which takes action a in state s
            m.constrain(
                sum(sum((humanPi[S[sp], humanA[ap]]
                         - localDifferentPis[s, a][S[sp], humanA[ap]])
                        for ap in humanAr) * r[sp]
                    for sp in Sr) >= 0)

    # maxi_r { V^{newPi}_r - \max_{domPi \in domPis} V^{domPi}_r }
    cplexObj = m.maximize(
        sum(newPi[S[s], robotA[a]] * r[s] for s in Sr for a in robotAr) - z)

    # read the solution once; it is reused by obj, rFunc and the report below
    rValues = m[r]
    zValue = m[z]

    obj = sum([newPi[S[s], robotA[a]] * rValues[s]
               for s in Sr for a in robotAr]) - zValue

    # the reward function has the same values for same states, but need to
    # convert back to the S x A space
    def rFunc(s, a):
        # look the state up in S: Sr only holds indices, so Sr.index(s)
        # failed for every state that is not its own index
        return rValues[S.index(s)]

    # single pre-formatted argument: prints the same text on Python 2 and 3
    print('cplexobj %s' % (cplexObj,))
    print('obj %s' % (obj,))
    print('newPi')
    printPi(newPi)
    print('z %s r %s' % (zValue, rValues))

    return obj, rFunc
def lpDualCPLEX(mdp, zeroConstraints=(), positiveConstraints=(), positiveConstraintsOcc=1):
    r"""
    Solve the dual problem of lp, maybe with some constraints.

    Note that this is a lower level function that does not consider feature
    extraction. r should be a reward function, not a reward parameter.

    Args:
      mdp: object providing S, A, T, r, gamma and alpha; T is called as
        T(s, a, s'), r as r(s, a) and alpha as alpha(s)
      zeroConstraints: (s, a) pairs whose summed occupancy is forced to 0
      positiveConstraints: (s, a) pairs whose summed occupancy must be at
        least positiveConstraintsOcc
      positiveConstraintsOcc: lower bound used with positiveConstraints

    Returns:
      {'feasible': False} when the solver raises CPlexException, otherwise
      {'feasible': True, 'obj': objective value, 'pi': {(s, a): occupancy}}
    """
    # the defaults are immutable tuples: a list default is shared across calls
    S = mdp.S
    A = mdp.A
    T = mdp.T
    r = mdp.r
    gamma = mdp.gamma
    alpha = mdp.alpha

    m = CPlexModel()
    if not config.VERBOSE:
        m.setVerbosity(0)

    # useful constants
    Sr = range(len(S))
    Ar = range(len(A))

    x = m.new((len(S), len(A)), lb=0, name='x')

    # make sure x is a valid occupancy
    for sp in Sr:
        # sum_{s, a} x(s, a) (1[s = s'] - \gamma * T(s, a, s')) = \alpha(s')
        m.constrain(
            sum(x[s, a] * ((s == sp) - gamma * T(S[s], A[a], S[sp]))
                for s in Sr for a in Ar) == alpha(S[sp]))

    # == constraints
    if len(zeroConstraints) > 0:
        m.constrain(
            sum(x[S.index(s), A.index(a)] for s, a in zeroConstraints) == 0)

    # >= constraints
    if len(positiveConstraints) > 0:
        m.constrain(
            sum(x[S.index(s), A.index(a)] for s, a in positiveConstraints)
            >= positiveConstraintsOcc)

    # obj
    try:
        obj = m.maximize(sum([x[s, a] * r(S[s], A[a]) for s in Sr for a in Ar]))
    except CPlexException as err:
        # single pre-formatted argument: prints the same text on Python 2 and 3
        print('Exception %s' % (err,))
        # only the feasibility flag is returned; callers must check it
        # before reading 'obj' or 'pi'
        return {'feasible': False}

    # read the solution once instead of once per (s, a) entry
    xValues = m[x]
    return {
        'feasible': True,
        'obj': obj,
        'pi': {(S[s], A[a]): xValues[s, a] for s in Sr for a in Ar}
    }