def actionablestatesfrom(state):
    """Return the distinct state indices reachable from *state*.

    For every action priced at or above the wholesale price of the
    state's timeslot, project the next dissatisfaction value, round it
    to the nearest hundred, clip it to the top of the dissatisfaction
    grid, and map the result back to a flat state index.
    """
    global maxdisseen
    t = tfs(state)
    wholesale = modelt4.wholeprice(t)
    carried = modelt4.cocoef * disstates[dfs(state)]
    projected = np.round(
        [carried + modelt4.phi(t, customer, a) for a in actions if a >= wholesale],
        -2,
    )
    # Diagnostic: remember the largest projected value ever observed.
    maxdisseen = max(maxdisseen, max(projected))
    # Clip to the last (largest) modelled dissatisfaction level.
    clipped = [min(v, disstates[-1]) for v in projected]
    successor_levels = getdisstate(clipped)
    return list({t + i * (ntimeslots + 1) for i in successor_levels})
def takeaction(state, n, greedy=True):
    """Epsilon-greedy action selection for *state*.

    With probability ``min(epsilon, 1000/iterations)`` (exploration,
    only when *greedy* is true) draw random action indices until one
    whose price is at or above the wholesale price of the state's
    timeslot is found; otherwise exploit by returning the argmax column
    of the Q-matrix row for *state*.

    Parameters
    ----------
    state : int
        Row index into the Q-matrix.
    n : unused
        Kept only for interface compatibility with existing callers.
    greedy : bool
        When False, always exploit (never explore).

    Returns
    -------
    int
        The chosen action index.
    """
    if greedy and np.random.random() <= min(epsilon, 1000 / iterations):
        # Hoisted: the wholesale price depends only on the timeslot,
        # not on the sampled action.
        floor = modelt4.wholeprice(tfs(state))
        # Rejection-sample until the drawn action's price is feasible.
        # (The original bound `randomaction` only inside the loop body
        # and referenced it afterwards, so it would raise
        # UnboundLocalError if the price floor were <= 0.)
        while True:
            randomaction = np.random.randint(nactions)
            if actions[randomaction] >= floor:
                return randomaction
    return np.argmax(qmatrix[state, :])
# initialization timeslot = 1 iterations = 0 qmatrix = np.zeros([nstates, nactions]) # one extra row qprev = -100 * np.ones([nstates, nactions]) # one extra row delta = 0.1 convergence = [] qconvergence = [] rewardgraph = [] initstate = getstate(1, initdis) #qmatrix = np.full([ntimeslots+1,nactions], -np.inf) # one extra row for i in range(nstates): for j in range(nactions): if tfs(i) <= ntimeslots: if actions[j] < modelt4.wholeprice(tfs(i)): qprev[i, j] = -1000 qmatrix[i, j] = -100 def currentpolicy(): bpolicy = list() policystate = initstate action = np.argmax(qmatrix[policystate, :]) price = actions[action] bpolicy.append(price) #policystate = np.argmax(qmatrix[:,:], axis=1) for t in range(2, 25): policystate = nextstate(policystate, price) action = np.argmax(qmatrix[policystate, :]) price = actions[action]