Example #1
0
def actionablestatesfrom(state):
    """Return the distinct successor-state indices reachable from `state`
    via any action priced at or above the wholesale price for its slot.

    Side effect: updates the module-level `maxdisseen` tracker with the
    largest (pre-clipping) rounded dissatisfaction value encountered.
    """
    global maxdisseen
    slot = tfs(state)
    carryover = modelt4.cocoef * disstates[dfs(state)]
    floor_price = modelt4.wholeprice(slot)  # invariant over actions
    # Candidate next dissatisfaction values, rounded to the nearest hundred.
    candidates = np.round(
        [carryover + modelt4.phi(slot, customer, a)
         for a in actions if a >= floor_price],
        -2,
    )
    maxdisseen = max(maxdisseen, max(candidates))
    # Clip each candidate to the top of the discretized dissatisfaction grid.
    clipped = [min(v, disstates[-1]) for v in candidates]
    # Map dissatisfaction indices to flat state ids; dedupe via a set.
    successor_ids = {slot + i * (ntimeslots + 1) for i in getdisstate(clipped)}
    return list(successor_ids)
Example #2
0
def takeaction(state, n, greedy=True):
    """Select an action index for `state` using epsilon-greedy exploration.

    With probability min(epsilon, 1000/iterations) a random action whose
    price is at least the wholesale price for the state's time slot is
    returned; otherwise the greedy argmax over the Q-matrix row is used.

    Note: `n` is kept for interface compatibility but is unused here.
    """
    if greedy and np.random.random() <= min(epsilon, 1000 / iterations):
        # Exploration: rejection-sample an action that covers wholesale cost.
        floor_price = modelt4.wholeprice(tfs(state))  # loop-invariant, hoisted
        randomaction = np.random.randint(nactions)
        # Sample-first loop: guarantees `randomaction` is always bound.
        # (The original checked `price = 0` before sampling, so when the
        # wholesale price was <= 0 it returned an unbound name -> NameError.)
        while actions[randomaction] < floor_price:
            randomaction = np.random.randint(nactions)
        return randomaction
    else:
        # Exploitation: best known action for this state.
        return np.argmax(qmatrix[state, :])
Example #3
0
# initialization
timeslot = 1
iterations = 0
qmatrix = np.zeros([nstates, nactions])  # one extra row
qprev = -100 * np.ones([nstates, nactions])  # one extra row
delta = 0.1
convergence = []
qconvergence = []
rewardgraph = []
initstate = getstate(1, initdis)
# Penalize every (state, action) pair whose price is below the wholesale
# price so the learner never selects it; the terminal (extra) time slot
# rows are left untouched.
for i in range(nstates):
    t = tfs(i)
    if t > ntimeslots:
        continue  # terminal slot: no penalty applied
    floor_price = modelt4.wholeprice(t)  # invariant over actions, hoisted
    for j in range(nactions):
        if actions[j] < floor_price:
            qprev[i, j] = -1000
            qmatrix[i, j] = -100


def currentpolicy():
    bpolicy = list()
    policystate = initstate
    action = np.argmax(qmatrix[policystate, :])
    price = actions[action]
    bpolicy.append(price)
    #policystate = np.argmax(qmatrix[:,:], axis=1)
    for t in range(2, 25):
        policystate = nextstate(policystate, price)
        action = np.argmax(qmatrix[policystate, :])
        price = actions[action]