Example #1
def FeichterPolicy(mdp, start_state=0, epsilon=1, randomseed=None, delta=0.1):
    global c
    if (randomseed is not None):
        np.random.seed(randomseed)
    # orig_stdout = sys.stdout
    # f = open('Fiechter-m01.txt', 'w')
    # sys.stdout = f

    ##### Initialisation
    print(mdp.Vmax, 6 / epsilon, mdp.discountFactor)
    H = int((math.log(mdp.Vmax) + math.log(6.0 / epsilon)) /
            (1 - mdp.discountFactor))

    print("Chosen value of H is : ", H)
    N_h_s_a = np.zeros((H, mdp.numStates, mdp.numActions))
    N_h_s_a_s_prime = np.zeros(
        (H, mdp.numStates, mdp.numActions, mdp.numStates), dtype=int)
    rewards_s_a_sprime = np.zeros(
        (mdp.numStates, mdp.numActions, mdp.numStates))
    R_s_a = np.zeros((mdp.numStates, mdp.numActions))
    P_h_s_a_s_prime = np.zeros(
        (H, mdp.numStates, mdp.numActions, mdp.numStates))
    P_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_lower_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    policy_h_s = np.zeros((H, mdp.numStates), dtype=int)
    d_h_policy_s = np.zeros((H + 1, mdp.numStates))
    dmax = 12 * mdp.Vmax / (epsilon * (1 - mdp.discountFactor))
    converge_iterations = 10000
    epsilon_convergence = 1e-4

    Qlower = np.zeros((mdp.numStates, mdp.numActions))
    QlowerMBAE = np.zeros((mdp.numStates, mdp.numActions))
    QupperMBAE = mdp.Vmax * np.ones((mdp.numStates, mdp.numActions))
    Qupper = mdp.Vmax * np.random.random([mdp.numStates, mdp.numActions])
    QstarMBAE = (mdp.Vmax / 2) * np.ones((mdp.numStates, mdp.numActions))
    Qstar = (mdp.Vmax / 2) * np.ones((mdp.numStates, mdp.numActions))
    VupperMBAE = mdp.Vmax * np.ones((mdp.numStates))
    Vlower = np.zeros((mdp.numStates))
    VlowerMBAE = np.zeros((mdp.numStates))
    Vstar = (mdp.Vmax / 2) * np.ones((mdp.numStates))
    Vupper = mdp.Vmax * np.random.random([mdp.numStates])
    sampled_frequency_s_a = np.zeros((mdp.numStates, mdp.numActions))
    N_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    it = 0
    samples = 0
    initial_iterations = 1 * mdp.numStates * mdp.numActions

    ### Initial sampling for all state action pairs
    while it < initial_iterations:
        for state in range(mdp.numStates):
            for act in range(mdp.numActions):
                it += 1
                s_prime, r = mdp.simulate(state, act)
                rewards_s_a_sprime[state][act][s_prime] += r
                R_s_a[state][act] = (r + R_s_a[state][act] *
                                     sampled_frequency_s_a[state][act]) / (
                                         sampled_frequency_s_a[state][act] + 1)
                sampled_frequency_s_a[state][act] += 1
                N_s_a_sprime[state][act][s_prime] += 1

    #### For starting the while loop below
    iteration = 1

    if (verbose == 0):
        outp = open(mdp.filename + '-fiechter' + str(randomseed) + '.txt',
                    'w')
    # sys.stdout = open(mdp.filename+'-fiechter.txt', 'w+')
    ff = open(mdp.filename + '-fiechter-samples.txt', 'w+')

    #### Exploration
    # while d_h_policy_s[0][start_state]>2/(1-mdp.discountFactor) or iteration==1:
    acList = bestTwoActions(mdp, start_state, Qlower, Qupper, Qstar)
    coll = Qupper[start_state][acList[1]] - Qlower[start_state][
        acList[0]] - epsilon * (1 - mdp.discountFactor) / 2
    while coll > 0 or iteration < 50:
        # print d_h_policy_s[0][start_state], " > ", 2/(1-mdp.discountFactor)
        # print policy_h_s[0]
        h = 0
        current_state = start_state
        while h < H:
            current_action = policy_h_s[h][current_state]
            # print "------>",current_state, current_action
            s_prime, r = mdp.simulate(current_state, current_action)
            N_h_s_a[h][current_state][current_action] += 1
            rewards_s_a_sprime[current_state][current_action][s_prime] += r
            R_s_a[current_state][current_action] = (
                r + R_s_a[current_state][current_action] *
                sampled_frequency_s_a[current_state][current_action]) / (
                    sampled_frequency_s_a[current_state][current_action] + 1)
            N_h_s_a_s_prime[h][current_state][current_action][s_prime] += 1
            N_s_a_sprime[current_state][current_action][s_prime] += 1
            sampled_frequency_s_a[current_state][current_action] += 1
            for s2 in range(mdp.numStates):
                P_h_s_a_s_prime[h][current_state][current_action][
                    s2] = N_h_s_a_s_prime[h][current_state][current_action][
                        s2] / N_h_s_a[h][current_state][current_action]
            h += 1
            current_state = s_prime
            samples += 1
            if (samples % 100 == 0):
                acList = bestTwoActions(mdp, start_state, QlowerMBAE,
                                        QupperMBAE, QstarMBAE)
                if (verbose == 0):
                    outp.write(str(samples))
                    outp.write('\t')
                    outp.write(
                        str(QupperMBAE[start_state][acList[1]] -
                            QlowerMBAE[start_state][acList[0]])
                    )  #-epsilon*(1-mdp.discountFactor)/2
                    outp.write('\n')
                else:
                    print(Qupper[start_state], Qlower[start_state])
                    # print d_h_policy_s[0][start_state]-2/(1-mdp.discountFactor)
                    # print samples, (QupperMBAE[start_state][acList[1]]-QlowerMBAE[start_state][acList[0]])-epsilon*(1-mdp.discountFactor)/2
                np.savetxt(ff, sampled_frequency_s_a, delimiter=',')
                ff.write('\n')
                # print samples, d_h_policy_s[0][start_state]-2/(1-mdp.discountFactor)

        # Compute new policy dynamic program
        e_s_a = np.zeros((mdp.numStates, mdp.numActions))
        for h in range(H - 1, -1, -1):
            for state in range(mdp.numStates):
                current_max = -float("inf")
                argmax_action = -1
                for act in range(mdp.numActions):
                    if (N_h_s_a[h][state][act] == 0):
                        e_s_a[state][act] = dmax
                    else:
                        sqterm = (2 * math.log(
                            4 * H * mdp.numStates * mdp.numActions) -
                                  2 * math.log(delta)) / N_h_s_a[h][state][act]
                        summation = np.sum(
                            (N_h_s_a_s_prime[h][state][act] /
                             N_h_s_a[h][state][act]) * d_h_policy_s[h + 1])
                        secondterm = mdp.discountFactor * summation
                        e_s_a[state][act] = min(
                            dmax, 6 * mdp.Vmax * (math.sqrt(sqterm)) /
                            (epsilon * (1 - delta)) + secondterm)

                policy_h_s[h][state] = np.argmax(e_s_a[state])
                d_h_policy_s[h][state] = np.amax(e_s_a[state])

        # Compute MBAE QupperMBAE and QlowerMBAE bounds
        for internal in range(converge_iterations):
            oldQlowerMBAE = np.copy(QlowerMBAE[start_state])
            for state in range(mdp.numStates):
                for act in range(mdp.numActions):
                    # Calculations for QupperMBAE and QlowerMBAE
                    firstterm = np.sum(rewards_s_a_sprime[state][act]
                                       ) / sampled_frequency_s_a[state][act]
                    secondterm = mdp.discountFactor * np.sum(
                        VupperMBAE * (N_s_a_sprime[state][act] /
                                      sampled_frequency_s_a[state][act]))
                    #secondterm = mdp.discountFactor*sum(VupperMBAE[ss]*N_s_a_sprime[state][act][ss]/sampled_frequency_s_a[state][act] for ss in range(mdp.numStates))
                    lower_secondterm = mdp.discountFactor * np.sum(
                        VlowerMBAE * (N_s_a_sprime[state][act] /
                                      sampled_frequency_s_a[state][act]))
                    star_secondterm = mdp.discountFactor * np.sum(
                        Vstar * (N_s_a_sprime[state][act] /
                                 sampled_frequency_s_a[state][act]))
                    #lower_secondterm = mdp.discountFactor*sum(VlowerMBAE[ss]*N_s_a_sprime[state][act][ss]/sampled_frequency_s_a[state][act] for ss in range(mdp.numStates))
                    thirdterm = mdp.Vmax * math.sqrt(
                        (math.log(c * (samples**2) * mdp.numStates *
                                  mdp.numActions) - math.log(delta)) /
                        sampled_frequency_s_a[state][act])
                    #QupperMBAE[state][act] = (float)(sum(rewards_s_a_sprime[state][act][ss] for ss in range(mdp.numStates))/sampled_frequency_s_a[state][act]) + secondterm + thirdterm
                    QupperMBAE[state][act] = firstterm + secondterm + thirdterm
                    QlowerMBAE[state][
                        act] = firstterm + lower_secondterm - thirdterm
                    QstarMBAE[state][act] = firstterm + star_secondterm
                    # Calculation for Vstar
                    # t = (float)N_s_a_sprime[state][act][stateprime]/sampled_frequency_s_a[state][act]
                    # val = t*(rewards_s_a[state][act][stateprime]+mdp.discountFactor*Vstar[stateprime])
                VupperMBAE[state] = np.amax(QupperMBAE[state])
                VlowerMBAE[state] = np.amax(QlowerMBAE[state])
                Vstar[state] = np.amax(QstarMBAE[state])
            if (np.linalg.norm(oldQlowerMBAE - QlowerMBAE[start_state]) <=
                    epsilon_convergence):
                break

        for i in range(mdp.numStates):
            for j in range(mdp.numActions):
                if (sampled_frequency_s_a[i][j] > 0):
                    P_tilda[i][j] = UpperP(i, j, delta, N_s_a_sprime[i][j],
                                           mdp.numStates, Vupper, False)
                    P_lower_tilda[i][j] = LowerP(i, j, delta,
                                                 N_s_a_sprime[i][j],
                                                 mdp.numStates, Vlower, False)

        Qupper, Vupper = iteratedConvergence(Qupper, R_s_a, P_tilda,
                                             mdp.discountFactor, epsilon,
                                             converge_iterations,
                                             epsilon_convergence)
        Qlower, Vlower = iteratedConvergence(Qlower, R_s_a, P_lower_tilda,
                                             mdp.discountFactor, epsilon,
                                             converge_iterations,
                                             epsilon_convergence)
        # Qstar, _ = iteratedConvergence(Qstar,R_s_a,P_,mdp.discountFactor, epsilon, converge_iterations, epsilon_convergence)

        iteration += 1
        acList = bestTwoActions(mdp, start_state, QlowerMBAE, QupperMBAE,
                                QstarMBAE)
        coll = QupperMBAE[start_state][acList[1]] - QlowerMBAE[start_state][
            acList[0]] - epsilon * (1 - mdp.discountFactor) / 2
    # sys.stdout = orig_stdout
    # f.close()
    print(iteration)
    a = open('final' + mdp.filename + '-fiechter.txt', 'a+')
    a.write(str(iteration) + '\n')
    a.close()
    return getBestPolicy(mdp, rewards_s_a_sprime, P_h_s_a_s_prime[0])
    # return policy_h_s[0]
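
All of the examples in this file call a few helpers (iteratedConvergence, bestTwoActions, UpperP, LowerP, getBestPolicy, CalculateDelDelV, evaluatePolicy) and module-level globals (c, verbose, MAX_ITERATION_LIMIT, converge_iterations, epsilon_convergence) that are defined elsewhere in the module. As a rough reference, here is a minimal sketch of what iteratedConvergence is assumed to do, namely plain Q-value iteration on the supplied reward and transition model; the actual implementation in the source may differ.

import numpy as np

# Hypothetical sketch: Q-value iteration on a fixed model (R_s_a, P),
# stopped after converge_iterations sweeps or once the largest Q-value
# change drops below epsilon_convergence. The epsilon argument is kept
# only for signature compatibility with the calls above.
def iteratedConvergence(Q, R_s_a, P, discountFactor, epsilon,
                        converge_iterations, epsilon_convergence):
    Q = np.copy(Q)
    V = np.max(Q, axis=1)
    for _ in range(converge_iterations):
        # Bellman backup: Q(s,a) = R(s,a) + gamma * sum_s' P(s,a,s') * V(s')
        Qnew = R_s_a + discountFactor * np.einsum('sat,t->sa', P, V)
        done = np.max(np.abs(Qnew - Q)) <= epsilon_convergence
        Q = Qnew
        V = np.max(Q, axis=1)
        if done:
            break
    return Q, V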
Example #2
def LUCBEpisodic(mdp,
                 start_state=0,
                 epsilon=4,
                 randomseed=None,
                 delta=0.1,
                 fileprint=1):
    if (randomseed is not None):
        np.random.seed(randomseed)
    global MAX_ITERATION_LIMIT, c
    iteration = 0
    it = 0
    H = int((math.log(mdp.Vmax) + math.log(6.0 / epsilon)) /
            (1 - mdp.discountFactor))
    initial_iterations = 1 * mdp.numStates * mdp.numActions
    rewards_s_a_sprime = np.zeros(
        (mdp.numStates, mdp.numActions, mdp.numStates))
    R_s_a = np.zeros((mdp.numStates, mdp.numActions))
    sampled_frequency_s_a = np.zeros((mdp.numStates, mdp.numActions))
    N_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_lower_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    VlowerMBAE = np.zeros((mdp.numStates))
    Vlower = np.zeros((mdp.numStates))
    Vstar = (mdp.Vmax / 2) * np.ones((mdp.numStates))
    Vupper = mdp.Vmax * np.random.random([mdp.numStates])
    Qlower = np.zeros((mdp.numStates, mdp.numActions))
    VupperMBAE = mdp.Vmax * np.ones((mdp.numStates))
    QlowerMBAE = np.zeros((mdp.numStates, mdp.numActions))
    Qstar = (mdp.Vmax / 2) * np.ones((mdp.numStates, mdp.numActions))
    QupperMBAE = mdp.Vmax * np.ones((mdp.numStates, mdp.numActions))
    Qupper = mdp.Vmax * np.random.random([mdp.numStates, mdp.numActions])
    final_policy = (-1) * np.ones((mdp.numStates), dtype=int)
    states_to_sample = range(mdp.numStates)
    colliding_values = np.zeros((mdp.numStates))
    is_converged = 0
    print "Vmax", mdp.Vmax
    print "Epsilon is ", epsilon

    ### Initial sampling for all state action pairs
    while it < initial_iterations:
        for state in range(mdp.numStates):
            for act in range(mdp.numActions):
                it += 1
                s_prime, r = mdp.simulate(state, act)
                rewards_s_a_sprime[state][act][s_prime] += r
                R_s_a[state][act] = (r + R_s_a[state][act] *
                                     sampled_frequency_s_a[state][act]) / (
                                         sampled_frequency_s_a[state][act] + 1)
                sampled_frequency_s_a[state][act] += 1
                N_s_a_sprime[state][act][s_prime] += 1
                for s2 in range(mdp.numStates):
                    P[state][act][s2] = (float)(
                        N_s_a_sprime[state][act]
                        [s2]) / sampled_frequency_s_a[state][act]

    ### Calculating V, Q estimates thus far MBAE
    for internal in range(converge_iterations):
        oldQlowerMBAE = np.copy(QlowerMBAE[start_state])
        for state in range(mdp.numStates):
            for act in range(mdp.numActions):
                # Calculations for QupperMBAE and QlowerMBAE
                firstterm = np.sum(rewards_s_a_sprime[state]
                                   [act]) / sampled_frequency_s_a[state][act]
                secondterm = mdp.discountFactor * np.sum(
                    VupperMBAE * (N_s_a_sprime[state][act] /
                                  sampled_frequency_s_a[state][act]))
                lower_secondterm = mdp.discountFactor * np.sum(
                    VlowerMBAE * (N_s_a_sprime[state][act] /
                                  sampled_frequency_s_a[state][act]))
                star_secondterm = mdp.discountFactor * np.sum(
                    Vstar * (N_s_a_sprime[state][act] /
                             sampled_frequency_s_a[state][act]))
                thirdterm = mdp.Vmax * math.sqrt(
                    (math.log(c * mdp.numStates * mdp.numActions) -
                     math.log(delta)) / sampled_frequency_s_a[state][act])
                QupperMBAE[state][act] = firstterm + secondterm + thirdterm
                QlowerMBAE[state][
                    act] = firstterm + lower_secondterm - thirdterm
                Qstar[state][act] = firstterm + star_secondterm
            VupperMBAE[state] = np.amax(QupperMBAE[state])
            VlowerMBAE[state] = np.amax(QlowerMBAE[state])
            Vstar[state] = np.amax(Qstar[state])

        if (np.linalg.norm(oldQlowerMBAE - QlowerMBAE[start_state]) <=
                epsilon_convergence):
            print "Stopping with ", internal, "initial internal iterations"
            break

    if internal == converge_iterations - 1:
        print("Used all iterations")

    print("Initial estimate of QupperMBAE found! Now sampling")

    Qupper = np.copy(QupperMBAE)
    Qlower = np.copy(QlowerMBAE)

    if (verbose == 0):
        outp = open(mdp.filename + '-lucbeps' + str(randomseed) + '.txt', 'w')
    ff = open(mdp.filename + '-lucbeps-samples.txt', 'w+')

    h = 0
    state1 = start_state

    iteration += initial_iterations

    while iteration < MAX_ITERATION_LIMIT:
        max_collision_state = [
            sorted(states_to_sample,
                   key=lambda x: colliding_values[x],
                   reverse=True)[0]
        ]
        if (h % H == 0):
            state1 = start_state
            h = 0
        else:
            state1 = nextstate
        actionsList = bestTwoActions(mdp, state1, QlowerMBAE, QupperMBAE,
                                     Qstar)
        a = np.random.choice(actionsList)
        iteration += 1

        for t in range(1):
            s_prime, r = mdp.simulate(state1, a)
            nextstate = s_prime
            rewards_s_a_sprime[state1][a][s_prime] += r
            R_s_a[state1][a] = (r + R_s_a[state1][a] *
                                sampled_frequency_s_a[state1][a]) / (
                                    sampled_frequency_s_a[state1][a] + 1)
            sampled_frequency_s_a[state1][a] += 1
            N_s_a_sprime[state1][a][s_prime] += 1
            if (verbose == 1):
                pass
                # print "s, a, sprime"
                # print state1, a, s_prime
            for s2 in range(mdp.numStates):
                P[state1][a][s2] = float(
                    N_s_a_sprime[state1][a][s2]) / sampled_frequency_s_a[state1][a]

        ## Calculating Q and V values
        for i in range(mdp.numStates):
            for j in range(mdp.numActions):
                if (sampled_frequency_s_a[i][j] > 0):
                    P_tilda[i][j] = UpperP(i, j, delta, N_s_a_sprime[i][j],
                                           mdp.numStates, Vupper, False)
                    P_lower_tilda[i][j] = LowerP(i, j, delta,
                                                 N_s_a_sprime[i][j],
                                                 mdp.numStates, Vlower, False)

        if (verbose == 1):
            pass

        Qupper, Vupper = iteratedConvergence(Qupper, R_s_a, P_tilda,
                                             mdp.discountFactor, epsilon,
                                             converge_iterations,
                                             epsilon_convergence)
        Qlower, Vlower = iteratedConvergence(Qlower, R_s_a, P_lower_tilda,
                                             mdp.discountFactor, epsilon,
                                             converge_iterations,
                                             epsilon_convergence)

        if (verbose == 1):
            # print "Calculated Q values are :"
            print(QupperMBAE[start_state], Qstar[start_state],
                  QlowerMBAE[start_state])

        # Calculations for QupperMBAE and QlowerMBAE
        #### This involved a two for-loop and iterating convergence
        for internal in range(converge_iterations):
            oldQlowerMBAE = np.copy(QlowerMBAE[start_state])
            for state in range(mdp.numStates):
                for act in range(mdp.numActions):
                    # Calculations for QupperMBAE and QlowerMBAE
                    firstterm = np.sum(rewards_s_a_sprime[state][act]
                                       ) / sampled_frequency_s_a[state][act]
                    secondterm = mdp.discountFactor * np.sum(
                        VupperMBAE * (N_s_a_sprime[state][act] /
                                      sampled_frequency_s_a[state][act]))
                    lower_secondterm = mdp.discountFactor * np.sum(
                        VlowerMBAE * (N_s_a_sprime[state][act] /
                                      sampled_frequency_s_a[state][act]))
                    star_secondterm = mdp.discountFactor * np.sum(
                        Vstar * (N_s_a_sprime[state][act] /
                                 sampled_frequency_s_a[state][act]))
                    thirdterm = mdp.Vmax * math.sqrt(
                        (math.log(c * (iteration**2) * mdp.numStates *
                                  mdp.numActions) - math.log(delta)) /
                        sampled_frequency_s_a[state][act])
                    QupperMBAE[state][act] = firstterm + secondterm + thirdterm
                    QlowerMBAE[state][
                        act] = firstterm + lower_secondterm - thirdterm
                    Qstar[state][act] = firstterm + star_secondterm
                VupperMBAE[state] = np.amax(QupperMBAE[state])
                VlowerMBAE[state] = np.amax(QlowerMBAE[state])
                Vstar[state] = np.amax(Qstar[state])
            if (np.linalg.norm(oldQlowerMBAE - QlowerMBAE[start_state]) <=
                    epsilon_convergence):
                break

        count = 0
        if (iteration % 100 == 0):
            acList = bestTwoActions(mdp, start_state, Qlower, Qupper, Qstar)
            # print "Qupper, Qstar, Qlower"
            # print Qupper[start_state], Qstar[start_state], Qlower[start_state]
            if (verbose == 0):
                outp.write(str(iteration))
                outp.write('\t')
                outp.write(
                    str(QupperMBAE[start_state][acList[1]] -
                        QlowerMBAE[start_state][acList[0]])
                )  #-epsilon*(1-mdp.discountFactor)/2
                outp.write('\n')
            else:
                print(iteration, QupperMBAE[start_state][acList[1]] -
                      QlowerMBAE[start_state][acList[0]])
            np.savetxt(ff, sampled_frequency_s_a, delimiter=',')
            ff.write('\n')

        ##### Updating the list of colliding states
        if (iteration > 50):
            states_to_sample = []
            for st in range(mdp.numStates):
                acList = bestTwoActions(mdp, st, QlowerMBAE, QupperMBAE, Qstar)
                ##### Changing stopping condition to epsilon*(1-gamma)/2
                colliding_values[st] = QupperMBAE[st][acList[1]] - QlowerMBAE[
                    st][acList[0]] - epsilon * (1 - mdp.discountFactor) / 2
                if (colliding_values[st] > 0):
                    ### this state is still colliding, add to sample states
                    states_to_sample.append(st)
        else:
            # for st in range(mdp.numStates):
            # 	acList = bestTwoActions(mdp, st, Qlower, Qupper, Qstar)
            # 	colliding_values[st] = Qupper[st][acList[1]]-Qlower[st][acList[0]]-epsilon*(1-mdp.discountFactor)/2
            colliding_values = np.arange(mdp.numStates, dtype=float)
            states_to_sample = list(range(mdp.numStates))

        #### Check epsilon condition for only starting state
        if (not (start_state in states_to_sample) and iteration > 50):
            # if(count==mdp.numStates):
            acList = bestTwoActions(mdp, start_state, QlowerMBAE, QupperMBAE,
                                    Qstar)
            print "Difference is ", Qupper[st][acList[1]] - Qlower[st][
                acList[0]]
            print "Setting final_policy of ", start_state, " to", acList[0]
            final_policy[start_state] = acList[0]
            print "Iterations taken : ", iteration
            for i in range(mdp.numStates):
                if (final_policy[i] == -1):
                    final_policy[i] = bestTwoActions(mdp, i, QlowerMBAE,
                                                     QupperMBAE, Qstar)[0]
            print "Returning policy : ", final_policy

            if (iteration != 51):
                a = open('final' + mdp.filename + '-lucbeps.txt', 'a+')
                a.write(str(iteration) + '\n')
                a.close()
            return final_policy

        h += 1

    outp.close()
    ff.close()

    for i in range(mdp.numStates):
        if (final_policy[i] == -1):
            final_policy[i] = bestTwoActions(mdp, i, QlowerMBAE, QupperMBAE,
                                             Qstar)[0]
    return final_policy
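
Example #2 (like the other routines) leans on bestTwoActions, which is assumed to return, LUCB-style, the empirically leading action together with the competing action whose upper bound is largest. The sketch below is one plausible reading of that interface, not the source's exact code.

import numpy as np

# Hypothetical sketch of bestTwoActions(mdp, state, Qlower, Qupper, Qstar):
# acList[0] is the action with the highest point estimate Qstar[state][a],
# acList[1] is the remaining action with the highest upper bound
# Qupper[state][a]. Assumes at least two actions.
def bestTwoActions(mdp, state, Qlower, Qupper, Qstar):
    leader = int(np.argmax(Qstar[state]))
    contenders = np.copy(Qupper[state])
    contenders[leader] = -np.inf  # exclude the leader itself
    challenger = int(np.argmax(contenders))
    return [leader, challenger]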
Example #3
def mbie(mdp, start_state=0, epsilon=4, randomseed=None, delta=0.1):

	global c
	if(randomseed is not None):
		np.random.seed(randomseed)
	initial_iterations = 1*mdp.numStates*mdp.numActions
	### Estimate the horizon based on Fiechter
	H = int((math.log(mdp.Vmax) + math.log(6.0/epsilon))/(1-mdp.discountFactor))
	it=0
	samples = 0

	### Calculating m based on the parameters

	first_term = mdp.numStates/(epsilon**2*(1-mdp.discountFactor)**4)
	second_term = math.log(mdp.numStates*mdp.numActions/(epsilon*(1-mdp.discountFactor)*delta))/(epsilon**2*(1-mdp.discountFactor)**4)
	m = c*(first_term+second_term)
	print "Chosen value of m is :",H, m
	N_s_a = np.zeros((mdp.numStates,mdp.numActions))
	N_s_a_sprime = np.zeros((mdp.numStates,mdp.numActions,mdp.numStates))
	P_s_a_sprime = np.zeros((mdp.numStates,mdp.numActions,mdp.numStates))
	P_tilda = np.zeros((mdp.numStates,mdp.numActions,mdp.numStates))
	P_lower_tilda = np.zeros((mdp.numStates,mdp.numActions,mdp.numStates))
	R_s_a = np.zeros((mdp.numStates,mdp.numActions))
	Qupper = mdp.Vmax*np.random.random([mdp.numStates,mdp.numActions])
	QupperMBAE = mdp.Vmax*np.ones((mdp.numStates,mdp.numActions))
	Qlower = np.zeros((mdp.numStates,mdp.numActions))
	QlowerMBAE = np.zeros((mdp.numStates,mdp.numActions))
	Qstar = (mdp.Vmax/2)*np.ones((mdp.numStates,mdp.numActions))
	Vupper = mdp.Vmax*np.random.random([mdp.numStates])
	VupperMBAE = mdp.Vmax*np.ones((mdp.numStates))
	Vlower = np.zeros((mdp.numStates))
	VlowerMBAE = np.zeros((mdp.numStates))
	Vstar = (mdp.Vmax/2)*np.ones((mdp.numStates))
	best_policy = (-1)*np.ones((mdp.numStates), dtype=int)

	### Initial sampling for all state action pairs
	while it < initial_iterations:
		for state in range(mdp.numStates):
			for act in range(mdp.numActions):
				it+=1
				ss, rr = mdp.simulate(state, act)
				R_s_a[state][act] = rr
				N_s_a[state][act] += 1
				N_s_a_sprime[state][act][ss] += 1
				# P_s_a_sprime = np.copy(N_s_a_sprime)
				for s2 in range(mdp.numStates):
					P_s_a_sprime[state][act][s2] = (float)(N_s_a_sprime[state][act][s2])/N_s_a[state][act]

	samples += initial_iterations
	print(P_s_a_sprime)
	print("Completed initial iterations")
	Qupper, Vupper = iteratedConvergence(Qupper,R_s_a,P_s_a_sprime,mdp.discountFactor, epsilon, converge_iterations, epsilon_convergence)
	print Qupper, "Qupper"
	# print Qupper, Vupper
	current_state = start_state
	### Repeat forever

	if(verbose==0):
		outp = open(mdp.filename+'-mbie' + str(randomseed) +'.txt', 'w')
	# sys.stdout = open(mdp.filename+'-mbie.txt', 'w+')
	ff = open(mdp.filename+'-mbie-samples.txt', 'w+')

	while samples<MAX_ITERATION_LIMIT:
		current_state = start_state
		h=1
		# print Qupper[start_state], Qstar[start_state], Qlower[start_state]
		while h<=H:
			if(samples%100==0):
				acList = bestTwoActions(mdp, start_state, QlowerMBAE, QupperMBAE, Qstar)
				if(verbose==0):
					outp.write(str(samples))
					outp.write('\t')
					outp.write(str(QupperMBAE[start_state][acList[1]]-QlowerMBAE[start_state][acList[0]]))#-epsilon*(1-mdp.discountFactor)/2 
					outp.write('\n')
					print(samples, (QupperMBAE[start_state][acList[1]]-QlowerMBAE[start_state][acList[0]]))
				else:
					print(samples, (QupperMBAE[start_state][acList[1]], QlowerMBAE[start_state][acList[0]]))
					pass
				np.savetxt(ff, N_s_a, delimiter=',')
				ff.write('\n')
			for i in range(mdp.numStates):
				# print "For state ", i, " doing UpperP"
				for j in range(mdp.numActions):
					P_tilda[i][j] = UpperP(i,j,delta,N_s_a_sprime[i][j],mdp.numStates,Vupper,False)
					P_lower_tilda[i][j] = LowerP(i,j,delta,N_s_a_sprime[i][j],mdp.numStates,Vlower,False)
			# print "Starting iterating"
			# print Qupper
			# return 2
			Qupper, Vupper = iteratedConvergence(Qupper,R_s_a,P_tilda,mdp.discountFactor,epsilon, converge_iterations, epsilon_convergence)
			Qlower, Vlower = iteratedConvergence(Qlower,R_s_a,P_lower_tilda,mdp.discountFactor, epsilon, converge_iterations, epsilon_convergence)
			current_action = np.argmax(QupperMBAE[current_state])
			# print Qupper[start_state], Qlower[start_state]
			best_policy[current_state] = current_action
			if(N_s_a[current_state][current_action]<m):
				for t in range(1):
					ss,rr = mdp.simulate(current_state, current_action)
					R_s_a[current_state][current_action] = (rr + R_s_a[current_state][current_action]*N_s_a[current_state][current_action])/(N_s_a[current_state][current_action]+1)
					N_s_a[current_state][current_action] += 1
					N_s_a_sprime[current_state][current_action][ss] += 1
					samples += 1
				for s2 in range(mdp.numStates):
					# print current_state, current_action, s2, N_s_a_sprime[current_state][current_action][s2], N_s_a[current_state][current_action]
					P_s_a_sprime[current_state][current_action][s2] = (float)(N_s_a_sprime[current_state][current_action][s2])/N_s_a[current_state][current_action]
				current_state = ss

			else:
				print "TRUEEEE"
				print N_s_a[current_state]
				# print P_s_a_sprime[current_state][current_action]
				# print np.sum(P_s_a_sprime[current_state][current_action])
				# print N_s_a[current_state][current_action]
				current_state = np.random.choice(np.arange(mdp.numStates), p=P_s_a_sprime[current_state][current_action]/np.sum(P_s_a_sprime[current_state][current_action]))

		
			h += 1
		
		# Compute MBAE Qupper and Qlower bounds
		for internal in range(converge_iterations):
			oldQlower = np.copy(QlowerMBAE[start_state])
			for state in range(mdp.numStates):
				for act in range(mdp.numActions):
					# Calculations for Qupper and Qlower mBAE
					firstterm = R_s_a[state][act]
					secondterm = mdp.discountFactor*np.sum(VupperMBAE*(N_s_a_sprime[state][act]/N_s_a[state][act]))
					#secondterm = mdp.discountFactor*sum(Vupper[ss]*N_s_a_sprime[state][act][ss]/N_s_a[state][act] for ss in range(mdp.numStates))  
					lower_secondterm = mdp.discountFactor*np.sum(VlowerMBAE*(N_s_a_sprime[state][act]/N_s_a[state][act]))
					star_secondterm = mdp.discountFactor*np.sum(Vstar*(N_s_a_sprime[state][act]/N_s_a[state][act]))
					#lower_secondterm = mdp.discountFactor*sum(Vlower[ss]*N_s_a_sprime[state][act][ss]/N_s_a[state][act] for ss in range(mdp.numStates))  
					thirdterm = mdp.Vmax*math.sqrt((math.log(c*(samples**2)*mdp.numStates*mdp.numActions)-math.log(delta))/N_s_a[state][act])
					#Qupper[state][act] = (float)(sum(rewards_s_a_sprime[state][act][ss] for ss in range(mdp.numStates))/N_s_a[state][act]) + secondterm + thirdterm
					QupperMBAE[state][act] = firstterm + secondterm + thirdterm
					# print firstterm, secondterm, thirdterm
					QlowerMBAE[state][act] = firstterm + lower_secondterm - thirdterm
					Qstar[state][act] = firstterm + star_secondterm
					# Calculation for Vstar
					# t = (float)N_s_a_sprime[state][act][stateprime]/N_s_a[state][act]
					# val = t*(rewards_s_a[state][act][stateprime]+mdp.discountFactor*Vstar[stateprime])
				VupperMBAE[state] = np.amax(QupperMBAE[state])
				VlowerMBAE[state] = np.amax(QlowerMBAE[state])
				Vstar[state] = np.amax(Qstar[state])
			if(np.linalg.norm(oldQlower-QlowerMBAE[start_state])<=epsilon_convergence):
				# print "Stopping with ", internal, "iterations"
				break		


	return best_policy
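
The MBIE routine above, like Examples #1, #2 and #4, builds optimistic and pessimistic transition models through UpperP and LowerP. One common realization (the L1 confidence sets of MBIE, Strehl and Littman) moves probability mass toward the most (or least) promising successor state subject to an L1 budget; the sketch below illustrates that idea under assumed signatures and is not necessarily what the source implements.

import math
import numpy as np

# L1 deviation radius for an empirical distribution over numStates outcomes;
# log(2**numStates - 2) is approximated by numStates*log(2) for readability.
def _l1_radius(N, numStates, delta):
    return math.sqrt(2.0 * (numStates * math.log(2.0) - math.log(delta)) / N)

# Hypothetical sketch of UpperP: optimistic transition vector inside the
# L1 ball around the empirical distribution N_sprime / N. The trailing
# boolean appears in the source's calls and is ignored here.
def UpperP(s, a, delta, N_sprime, numStates, V, unused_flag=False):
    N = float(np.sum(N_sprime))
    P = np.asarray(N_sprime, dtype=float) / N
    best = int(np.argmax(V))
    add = min(_l1_radius(N, numStates, delta) / 2.0, 1.0 - P[best])
    P[best] += add
    # Pay for the added mass by draining the least valuable successors first.
    for s2 in np.argsort(V):
        if add <= 0:
            break
        if s2 == best:
            continue
        drained = min(P[s2], add)
        P[s2] -= drained
        add -= drained
    return P

# Pessimistic counterpart: shift mass toward the lowest-value successor.
def LowerP(s, a, delta, N_sprime, numStates, V, unused_flag=False):
    return UpperP(s, a, delta, N_sprime, numStates,
                  -np.asarray(V, dtype=float), unused_flag)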
Example #4
def ddvouu(mdp, start_state=0, epsilon=4, randomseed=None, delta=0.1):

    if (randomseed is not None):
        np.random.seed(randomseed)

    initial_iterations = 1 * mdp.numStates * mdp.numActions
    ### Estimate the horizon based on Fiechter

    c = 1
    it = 0
    samples = 0

    first_term = mdp.numStates / (epsilon**2 * (1 - mdp.discountFactor)**4)
    second_term = math.log(
        mdp.numStates * mdp.numActions /
        (epsilon *
         (1 - mdp.discountFactor) * delta)) / (epsilon**2 *
                                               (1 - mdp.discountFactor)**4)
    m = c * (first_term + second_term)
    delta = delta / (mdp.numStates * mdp.numActions * m)
    print("Chosen value of m is :", m)
    N_s_a = np.zeros((mdp.numStates, mdp.numActions), dtype=int)
    N_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates),
                            dtype=int)
    P_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_lower_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    R_s_a = np.zeros((mdp.numStates, mdp.numActions))
    Qupper = mdp.Vmax * np.ones((mdp.numStates, mdp.numActions))
    QupperMBAE = mdp.Vmax * np.ones((mdp.numStates, mdp.numActions))
    Qlower = np.zeros((mdp.numStates, mdp.numActions))
    Qstar = (mdp.Vmax / 2) * np.ones((mdp.numStates, mdp.numActions))
    QlowerMBAE = np.zeros((mdp.numStates, mdp.numActions))
    Vupper = mdp.Vmax * np.ones((mdp.numStates))
    VupperMBAE = mdp.Vmax * np.ones((mdp.numStates))
    Vlower = np.zeros((mdp.numStates))
    VlowerMBAE = np.zeros((mdp.numStates))
    Vstar = (mdp.Vmax / 2) * np.ones((mdp.numStates))
    best_policy = (-1) * np.ones((mdp.numStates), dtype=int)
    deltadeltaV = np.zeros((mdp.numStates, mdp.numActions))
    discovered_states = set([start_state, 1, 2, 3, 4])

    ## Initial sampling for all state action pairs
    ### Is this needed?
    while it < initial_iterations:
        for state in range(mdp.numStates):
            for act in range(mdp.numActions):
                it += 1

                ss, rr = mdp.simulate(state, act)
                print("Sampling ", state, act, rr, ss)
                R_s_a[state][act] = (rr + R_s_a[state][act] * N_s_a[state][act]
                                     ) / (N_s_a[state][act] + 1)
                N_s_a[state][act] += 1
                N_s_a_sprime[state][act][ss] += 1
                # P_s_a_sprime = np.copy(N_s_a_sprime)
                for s2 in range(mdp.numStates):
                    P_s_a_sprime[state][act][s2] = (float)(
                        N_s_a_sprime[state][act][s2]) / N_s_a[state][act]
    samples += initial_iterations

    print(P_s_a_sprime)
    print("Completed initial iterations")

    if (verbose == 0):
        outp = open(mdp.filename + '-ddv' + str(randomseed) + '.txt', 'w')
    # sys.stdout = open(mdp.filename+'-ddv.txt', 'w+')
    ff = open(mdp.filename + '-ddv-samples.txt', 'w+')

    # print Qupper, Vupper
    current_state = start_state
    ### Repeat forever
    while samples < MAX_ITERATION_LIMIT:
        # print Qupper[start_state], Qlower[start_state]
        for i in range(mdp.numStates):
            for j in range(mdp.numActions):
                if (N_s_a[i][j] > 0):
                    P_tilda[i][j] = UpperP(i, j, delta, N_s_a_sprime[i][j],
                                           mdp.numStates, Vupper, False)
                    P_lower_tilda[i][j] = LowerP(i, j, delta,
                                                 N_s_a_sprime[i][j],
                                                 mdp.numStates, Vlower, False)

        ##Calculate Q values
        Qupper, Vupper = iteratedConvergence(Qupper, R_s_a, P_tilda,
                                             mdp.discountFactor, epsilon,
                                             converge_iterations,
                                             epsilon_convergence)
        Qlower, Vlower = iteratedConvergence(Qlower, R_s_a, P_lower_tilda,
                                             mdp.discountFactor, epsilon,
                                             converge_iterations,
                                             epsilon_convergence)

        current_state = start_state

        ### Terminating condition
        if (use_mbae):
            acList = bestTwoActions(mdp, start_state, QlowerMBAE, QupperMBAE,
                                    Qstar)
            coll = QupperMBAE[start_state][
                acList[1]] - QlowerMBAE[start_state][
                    acList[0]] - epsilon * (1 - mdp.discountFactor) / 2
        else:
            acList = bestTwoActions(mdp, start_state, Qlower, Qupper, Qstar)
            coll = Qupper[start_state][acList[1]] - Qlower[start_state][
                acList[0]] - epsilon * (1 - mdp.discountFactor) / 2
        # if(Vupper[start_state]-Vlower[start_state]<=epsilon and samples>50):
        if (coll < 0 and samples > 50):
            a = open('final' + mdp.filename + '-ddv.txt', 'a+')
            a.write(str(samples) + '\n')
            a.close()
            print(Qupper[start_state], Vupper[start_state],
                  Vlower[start_state])
            policy_lower = np.argmax(Qlower, axis=1)
            print("Iteration number ", samples)
            print("Returning policy because of epsilon-convergence")
            print(policy_lower)
            print(np.argmax(QupperMBAE, axis=1))
            print(np.argmax(Qupper, axis=1))
            print(np.argmax(QlowerMBAE, axis=1))
            print(np.argmax(Qstar, axis=1))
            return policy_lower

        ## Calculate deldelV for all states
        if (use_mbae):
            for st in list(discovered_states):
                for ac in range(mdp.numActions):
                    #### Compute del del V
                    deltadeltaV[st][ac] = CalculateDelDelV(
                        st, ac, mdp, N_s_a_sprime, QupperMBAE, QlowerMBAE,
                        VupperMBAE, VlowerMBAE, start_state, P_s_a_sprime,
                        P_tilda, P_lower_tilda, R_s_a, epsilon, delta,
                        converge_iterations, epsilon_convergence)
        else:
            for st in list(discovered_states):
                for ac in range(mdp.numActions):
                    #### Compute del del V
                    deltadeltaV[st][ac] = CalculateDelDelV(
                        st, ac, mdp, N_s_a_sprime, Qupper, Qlower, Vupper,
                        Vlower, start_state, P_s_a_sprime, P_tilda,
                        P_lower_tilda, R_s_a, epsilon, delta,
                        converge_iterations, epsilon_convergence)

        #### Simulate greedily wrt deldelV
        # print np.unravel_index(deltadeltaV.argmax(), deltadeltaV.shape)
        current_state, current_action = np.unravel_index(
            deltadeltaV.argmax(), deltadeltaV.shape)
        #time.sleep(0.1)
        print(deltadeltaV)
        ss, rr = mdp.simulate(current_state, current_action)
        samples += 1
        print("Sampling ", current_state, current_action, rr, ss)

        #### Add received state to the set of discovered states
        #discovered_states.add(ss)
        print(discovered_states)
        ### Update believed model
        R_s_a[current_state][current_action] = (
            rr + R_s_a[current_state][current_action] *
            N_s_a[current_state][current_action]) / (
                N_s_a[current_state][current_action] + 1)
        N_s_a[current_state][current_action] += 1
        N_s_a_sprime[current_state][current_action][ss] += 1

        for s2 in range(mdp.numStates):
            # print current_state, current_action, s2, N_s_a_sprime[current_state][current_action][s2], N_s_a[current_state][current_action]
            P_s_a_sprime[current_state][current_action][s2] = (float)(
                N_s_a_sprime[current_state][current_action]
                [s2]) / N_s_a[current_state][current_action]

        if (samples % 100 == 0):
            if (use_mbae):
                acList = bestTwoActions(mdp, start_state, QlowerMBAE,
                                        QupperMBAE, Qstar)
            else:
                acList = bestTwoActions(mdp, start_state, Qlower, Qupper,
                                        Qstar)
            if (verbose == 0):
                outp.write(str(samples))
                outp.write('\t')
                if (plot_vstar):
                    outp.write(str(Vstar[start_state]))
                else:
                    if (use_mbae):
                        outp.write(
                            str(QupperMBAE[start_state][acList[1]] -
                                QlowerMBAE[start_state][acList[0]]))
                    else:
                        outp.write(
                            str(Qupper[start_state][acList[1]] -
                                Qlower[start_state][acList[0]]))
                outp.write('\n')
                if (use_mbae):
                    print(samples, (QupperMBAE[start_state][acList[1]] -
                                    QlowerMBAE[start_state][acList[0]]))
                else:
                    print(samples, (Qupper[start_state][acList[1]] -
                                    Qlower[start_state][acList[0]]))
            else:
                print(samples, (QupperMBAE[start_state][acList[1]] -
                                QlowerMBAE[start_state][acList[0]]))
            np.savetxt(ff, N_s_a, delimiter=',')
            ff.write('\n')

        ### Calculating MBAE bounds
        for internal in range(converge_iterations):
            oldQlower = np.copy(QlowerMBAE[start_state])
            for state in range(mdp.numStates):
                for act in range(mdp.numActions):
                    # Calculations for Qupper and Qlower
                    firstterm = R_s_a[state][act]
                    secondterm = mdp.discountFactor * np.sum(
                        VupperMBAE *
                        (N_s_a_sprime[state][act] / N_s_a[state][act]))
                    #secondterm = mdp.discountFactor*sum(Vupper[ss]*N_s_a_sprime[state][act][ss]/N_s_a[state][act] for ss in range(mdp.numStates))
                    lower_secondterm = mdp.discountFactor * np.sum(
                        VlowerMBAE *
                        (N_s_a_sprime[state][act] / N_s_a[state][act]))
                    star_secondterm = mdp.discountFactor * np.sum(
                        Vstar * (N_s_a_sprime[state][act] / N_s_a[state][act]))
                    #lower_secondterm = mdp.discountFactor*sum(Vlower[ss]*N_s_a_sprime[state][act][ss]/N_s_a[state][act] for ss in range(mdp.numStates))
                    thirdterm = mdp.Vmax * math.sqrt(
                        (math.log(c * (samples**2) * mdp.numStates *
                                  mdp.numActions) - math.log(delta)) /
                        N_s_a[state][act])
                    #Qupper[state][act] = (float)(sum(rewards_s_a_sprime[state][act][ss] for ss in range(mdp.numStates))/N_s_a[state][act]) + secondterm + thirdterm
                    QupperMBAE[state][act] = firstterm + secondterm + thirdterm
                    QlowerMBAE[state][
                        act] = firstterm + lower_secondterm - thirdterm
                    Qstar[state][act] = firstterm + star_secondterm
                    # Calculation for Vstar
                    # t = (float)N_s_a_sprime[state][act][stateprime]/N_s_a[state][act]
                    # val = t*(rewards_s_a[state][act][stateprime]+mdp.discountFactor*Vstar[stateprime])
                VupperMBAE[state] = np.amax(QupperMBAE[state])
                VlowerMBAE[state] = np.amax(QlowerMBAE[state])
                Vstar[state] = np.amax(Qstar[state])
            if (np.linalg.norm(oldQlower - QlowerMBAE[start_state]) <=
                    epsilon_convergence):
                # print "Stopping with ", internal, "iterations"
                break

        # if(samples==initial_iterations+2):
        # 	Qupper = np.copy(QupperMBAE)
        # 	Qlower = np.copy(QlowerMBAE)

    return best_policy
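
Example #4 scores every known state-action pair with CalculateDelDelV before sampling greedily. A plausible reading of that helper, using the sketches above, is: pretend one more sample of (st, ac), rebuild the confidence-set model for that pair, and report how much the upper-lower gap at the start state would shrink. The version below is only a sketch consistent with the argument list used above, not the source's definition.

import numpy as np

# Hypothetical sketch of CalculateDelDelV: expected reduction in the
# start-state uncertainty (Vupper - Vlower) from one extra sample of (st, ac).
# Relies on UpperP, LowerP and iteratedConvergence as sketched earlier.
def CalculateDelDelV(st, ac, mdp, N_s_a_sprime, Qupper, Qlower, Vupper, Vlower,
                     start_state, P_s_a_sprime, P_tilda, P_lower_tilda, R_s_a,
                     epsilon, delta, converge_iterations, epsilon_convergence):
    gap_now = Vupper[start_state] - Vlower[start_state]
    # Add a hypothetical sample landing on the currently most likely successor.
    counts = np.copy(N_s_a_sprime)
    counts[st][ac][int(np.argmax(P_s_a_sprime[st][ac]))] += 1
    P_up = np.copy(P_tilda)
    P_lo = np.copy(P_lower_tilda)
    P_up[st][ac] = UpperP(st, ac, delta, counts[st][ac], mdp.numStates, Vupper)
    P_lo[st][ac] = LowerP(st, ac, delta, counts[st][ac], mdp.numStates, Vlower)
    _, Vu = iteratedConvergence(np.copy(Qupper), R_s_a, P_up, mdp.discountFactor,
                                epsilon, converge_iterations, epsilon_convergence)
    _, Vl = iteratedConvergence(np.copy(Qlower), R_s_a, P_lo, mdp.discountFactor,
                                epsilon, converge_iterations, epsilon_convergence)
    return gap_now - (Vu[start_state] - Vl[start_state])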
Example #5
def RoundRobin(mdp, start_state=0, epsilon=4, randomseed=None, delta=0.1):
    global MAX_ITERATION_LIMIT, c
    if (randomseed is not None):
        np.random.seed(randomseed)
    iteration = 0
    it = 0
    initial_iterations = 1 * mdp.numStates * mdp.numActions
    rewards_s_a_sprime = np.zeros(
        (mdp.numStates, mdp.numActions, mdp.numStates))
    R_s_a = np.zeros((mdp.numStates, mdp.numActions))
    sampled_frequency_s_a = np.zeros((mdp.numStates, mdp.numActions))
    N_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_lower_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    VlowerMBAE = np.zeros((mdp.numStates))
    Vlower = np.zeros((mdp.numStates))
    Vstar = (mdp.Vmax / 2) * np.ones((mdp.numStates))
    VupperMBAE = mdp.Vmax * np.ones((mdp.numStates))
    Vupper = mdp.Vmax * np.random.random([mdp.numStates])
    QlowerMBAE = np.zeros((mdp.numStates, mdp.numActions))
    Qlower = np.zeros((mdp.numStates, mdp.numActions))
    Qstar = (mdp.Vmax / 2) * np.ones((mdp.numStates, mdp.numActions))
    QupperMBAE = mdp.Vmax * np.ones((mdp.numStates, mdp.numActions))
    Qupper = mdp.Vmax * np.random.random([mdp.numStates, mdp.numActions])
    final_policy = (-1) * np.ones((mdp.numStates), dtype=int)
    states_to_sample = range(mdp.numStates)
    colliding_values = np.zeros((mdp.numStates))
    is_converged = 0

    ### Initial sampling for all state action pairs
    while it < initial_iterations:
        for state in range(mdp.numStates):
            for act in range(mdp.numActions):
                it += 1
                s_prime, r = mdp.simulate(state, act)
                rewards_s_a_sprime[state][act][s_prime] += r
                R_s_a[state][act] = (r + R_s_a[state][act] *
                                     sampled_frequency_s_a[state][act]) / (
                                         sampled_frequency_s_a[state][act] + 1)
                sampled_frequency_s_a[state][act] += 1
                N_s_a_sprime[state][act][s_prime] += 1
                for s2 in range(mdp.numStates):
                    P[state][act][s2] = float(
                        N_s_a_sprime[state][act][s2]
                    ) / sampled_frequency_s_a[state][act]

    ### Calculating V, Q estimates thus far
    for state in range(mdp.numStates):
        for act in range(mdp.numActions):
            # Calculations for QupperMBAE and QlowerMBAE
            firstterm = np.sum(rewards_s_a_sprime[state]
                               [act]) / sampled_frequency_s_a[state][act]
            secondterm = mdp.discountFactor * np.sum(
                VupperMBAE *
                (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
            #secondterm = mdp.discountFactor*sum(VupperMBAE[ss]*N_s_a_sprime[state][act][ss]/sampled_frequency_s_a[state][act] for ss in range(mdp.numStates))
            lower_secondterm = mdp.discountFactor * np.sum(
                VlowerMBAE *
                (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
            #lower_secondterm = mdp.discountFactor*sum(VlowerMBAE[ss]*N_s_a_sprime[state][act][ss]/sampled_frequency_s_a[state][act] for ss in range(mdp.numStates))
            thirdterm = mdp.Vmax * math.sqrt(
                (math.log(c * mdp.numStates * mdp.numActions) -
                 math.log(delta)) / sampled_frequency_s_a[state][act])
            #QupperMBAE[state][act] = (float)(sum(rewards_s_a_sprime[state][act][ss] for ss in range(mdp.numStates))/sampled_frequency_s_a[state][act]) + secondterm + thirdterm
            QupperMBAE[state][act] = firstterm + secondterm + thirdterm
            QlowerMBAE[state][act] = firstterm + lower_secondterm - thirdterm

        VupperMBAE[state] = np.amax(QupperMBAE[state])
        VlowerMBAE[state] = np.amax(QlowerMBAE[state])

    Qupper = np.copy(QupperMBAE)
    Qlower = np.copy(QlowerMBAE)

    if (verbose == 0):
        outp = open(mdp.filename + '-rr' + str(randomseed) + '.txt', 'w')
    ff = open(mdp.filename + '-rr-samples.txt', 'w+')

    while iteration < MAX_ITERATION_LIMIT:
        # print "Sampling state ", max_collision_state[0]
        # print colliding_values
        for state1 in range(mdp.numStates):
            # print "Sampling ", state1, "for this round"
            for act1 in range(mdp.numActions):
                iteration += 1
                sampled_frequency_s_a[state1][act1] += 1

                # Simulate the MDP with this state, action and update counts
                #### Trying 10 continuous simulations
                for t in range(1):
                    s_prime, r = mdp.simulate(state1, act1)
                    rewards_s_a_sprime[state1][act1][s_prime] += r
                    R_s_a[state1][act1] = (
                        r + R_s_a[state1][act1] *
                        sampled_frequency_s_a[state1][act1]
                    ) / (sampled_frequency_s_a[state1][act1] + 1)
                    N_s_a_sprime[state1][act1][s_prime] += 1
                    for s2 in range(mdp.numStates):
                        P[state1][act1][s2] = float(
                            N_s_a_sprime[state1][act1][s2]
                        ) / sampled_frequency_s_a[state1][act1]

                ## Calculating Q and V values
                for i in range(mdp.numStates):
                    for j in range(mdp.numActions):
                        if (sampled_frequency_s_a[i][j] > 0):
                            P_tilda[i][j] = UpperP(i, j, delta,
                                                   N_s_a_sprime[i][j],
                                                   mdp.numStates, Vupper,
                                                   False)
                            P_lower_tilda[i][j] = LowerP(
                                i, j, delta, N_s_a_sprime[i][j], mdp.numStates,
                                Vlower, False)

                Qupper, Vupper = iteratedConvergence(Qupper, R_s_a, P_tilda,
                                                     mdp.discountFactor,
                                                     epsilon,
                                                     converge_iterations,
                                                     epsilon_convergence)
                Qlower, Vlower = iteratedConvergence(
                    Qlower, R_s_a, P_lower_tilda, mdp.discountFactor, epsilon,
                    converge_iterations, epsilon_convergence)

                # Calculations for QupperMBAE and QlowerMBAE
                #### This involved a two for-loop and iterating convergence
                for state in range(mdp.numStates):
                    for act in range(mdp.numActions):
                        # Calculations for QupperMBAE and QlowerMBAE
                        firstterm = np.sum(
                            rewards_s_a_sprime[state]
                            [act]) / sampled_frequency_s_a[state][act]
                        secondterm = mdp.discountFactor * np.sum(
                            VupperMBAE * (N_s_a_sprime[state][act] /
                                          sampled_frequency_s_a[state][act]))
                        #secondterm = mdp.discountFactor*sum(VupperMBAE[ss]*N_s_a_sprime[state][act][ss]/sampled_frequency_s_a[state][act] for ss in range(mdp.numStates))
                        lower_secondterm = mdp.discountFactor * np.sum(
                            VlowerMBAE * (N_s_a_sprime[state][act] /
                                          sampled_frequency_s_a[state][act]))
                        star_secondterm = mdp.discountFactor * np.sum(
                            Vstar * (N_s_a_sprime[state][act] /
                                     sampled_frequency_s_a[state][act]))
                        #lower_secondterm = mdp.discountFactor*sum(VlowerMBAE[ss]*N_s_a_sprime[state][act][ss]/sampled_frequency_s_a[state][act] for ss in range(mdp.numStates))
                        thirdterm = mdp.Vmax * math.sqrt(
                            (math.log(c * (iteration**2) * mdp.numStates *
                                      mdp.numActions) - math.log(delta)) /
                            sampled_frequency_s_a[state][act])
                        #QupperMBAE[state][act] = (float)(sum(rewards_s_a_sprime[state][act][ss] for ss in range(mdp.numStates))/sampled_frequency_s_a[state][act]) + secondterm + thirdterm
                        QupperMBAE[state][
                            act] = firstterm + secondterm + thirdterm
                        QlowerMBAE[state][
                            act] = firstterm + lower_secondterm - thirdterm
                        Qstar[state][act] = firstterm + star_secondterm
                        # Calculation for Vstar
                        # t = (float)N_s_a_sprime[state][act][stateprime]/sampled_frequency_s_a[state][act]
                        # val = t*(rewards_s_a[state][act][stateprime]+mdp.discountFactor*Vstar[stateprime])
                    VupperMBAE[state] = np.amax(QupperMBAE[state])
                    VlowerMBAE[state] = np.amax(QlowerMBAE[state])
                    Vstar[state] = np.amax(Qstar[state])

        count = 0
        # print iteration, (QupperMBAE[start_state][acList[1]]-QlowerMBAE[start_state][acList[0]])/epsilon, sampled_frequency_s_a
        if (iteration % 100 == 0):
            for i in range(mdp.numStates):
                if (final_policy[i] == -1):
                    final_policy[i] = bestTwoActions(mdp, i, QlowerMBAE,
                                                     QupperMBAE, Qstar)[0]
            acList = bestTwoActions(mdp, start_state, Qlower, Qupper, Qstar)
            if (verbose == 0):
                outp.write(str(iteration))
                outp.write('\t')
                outp.write(
                    str(QupperMBAE[start_state][acList[1]] -
                        QlowerMBAE[start_state][acList[0]])
                )  #-epsilon*(1-mdp.discountFactor)/2
                # print(str(QupperMBAE[start_state][acList[1]]-QlowerMBAE[start_state][acList[0]]))
                # print(iteration, QupperMBAE[start_state])
                # outp.write(str(evaluatePolicy(mdp, final_policy, start_state)))
                print(str(evaluatePolicy(mdp, final_policy, start_state)))
                outp.write('\n')
            else:
                print(iteration, (QupperMBAE[start_state][acList[1]] -
                                  QlowerMBAE[start_state][acList[0]]))
                # print iteration, (Qupper[start_state][acList[1]]-Qlower[start_state][acList[0]])

            np.savetxt(ff, sampled_frequency_s_a, delimiter=',')
            ff.write('\n')
            # print iteration

        #### Check epsilon condition for only starting state
        acList = bestTwoActions(mdp, start_state, Qlower, Qupper, Qstar)
        if (QupperMBAE[start_state][acList[1]] -
                QlowerMBAE[start_state][acList[0]] < epsilon *
            (1 - mdp.discountFactor) / 2 and iteration > 50):
            print(
                QupperMBAE[start_state][acList[1]] -
                QlowerMBAE[start_state][acList[0]], "<",
                epsilon * (1 - mdp.discountFactor) / 2)
            # if(count==mdp.numStates):
            acList = bestTwoActions(mdp, start_state, QlowerMBAE, QupperMBAE,
                                    Qstar)
            a = open('final' + mdp.filename + '-rr.txt', 'a+')
            a.write(str(iteration) + '\n')
            a.close()
            print("Setting final_policy of ", start_state, " to", acList[0])
            final_policy[start_state] = acList[0]
            print("Iterations taken : ", iteration)
            print("Returning the policy :", final_policy)
            for i in range(mdp.numStates):
                if (final_policy[i] == -1):
                    final_policy[i] = bestTwoActions(mdp, i, QlowerMBAE,
                                                     QupperMBAE, Qstar)[0]
            return final_policy

    for i in range(mdp.numStates):
        if (final_policy[i] == -1):
            final_policy[i] = bestTwoActions(mdp, i, QlowerMBAE, QupperMBAE,
                                             Qstar)[0]
    return final_policy
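
Every routine in this file expects an mdp object exposing numStates, numActions, discountFactor, Vmax, filename and a simulate(state, action) -> (next_state, reward) method, plus module-level settings such as c, verbose, MAX_ITERATION_LIMIT, use_mbae, plot_vstar, converge_iterations and epsilon_convergence. A toy harness along those lines (all names here are illustrative, not taken from the source) might look like:

import numpy as np

# Illustrative toy MDP with the interface the routines above assume:
# random transition matrix, deterministic rewards in [0, 1].
class ToyMDP(object):
    def __init__(self, numStates=3, numActions=2, discountFactor=0.9, seed=0):
        rng = np.random.RandomState(seed)
        self.numStates = numStates
        self.numActions = numActions
        self.discountFactor = discountFactor
        self.Vmax = 1.0 / (1.0 - discountFactor)  # rewards bounded by 1
        self.filename = 'toy'
        self._P = rng.dirichlet(np.ones(numStates), size=(numStates, numActions))
        self._R = rng.random_sample((numStates, numActions))

    def simulate(self, state, action):
        s_prime = np.random.choice(self.numStates, p=self._P[state][action])
        return s_prime, self._R[state][action]

# Module-level settings the routines read (values chosen for illustration).
c = 2.0
verbose = 1
MAX_ITERATION_LIMIT = 100000
converge_iterations = 1000
epsilon_convergence = 1e-4
use_mbae = False
plot_vstar = False

# Example invocation, assuming the functions above are in scope:
# policy = RoundRobin(ToyMDP(), start_state=0, epsilon=1.0, randomseed=0)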
def LUCBBound(mdp, start_state=0, epsilon=4, delta=0.1, fileprint=1):
    global MAX_ITERATION_LIMIT, c
    iteration = 0
    it = 0
    H = int((math.log(mdp.Vmax) + math.log(6.0 / epsilon)) /
            (1 - mdp.discountFactor))
    initial_iterations = 1 * mdp.numStates * mdp.numActions
    rewards_s_a_sprime = np.zeros(
        (mdp.numStates, mdp.numActions, mdp.numStates))
    R_s_a = np.zeros((mdp.numStates, mdp.numActions))
    sampled_frequency_s_a = np.zeros((mdp.numStates, mdp.numActions))
    N_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_lower_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    VlowerMBAE = np.zeros((mdp.numStates))
    Vlower = np.zeros((mdp.numStates))
    Vstar = (mdp.Vmax / 2) * np.ones((mdp.numStates))
    Vupper = mdp.Vmax * np.ones((mdp.numStates))
    Qlower = np.zeros((mdp.numStates, mdp.numActions))
    VupperMBAE = mdp.Vmax * np.ones((mdp.numStates))
    QlowerMBAE = np.zeros((mdp.numStates, mdp.numActions))
    Qstar = (mdp.Vmax / 2) * np.ones((mdp.numStates, mdp.numActions))
    QupperMBAE = mdp.Vmax * np.ones((mdp.numStates, mdp.numActions))
    Qupper = mdp.Vmax * np.ones((mdp.numStates, mdp.numActions))
    final_policy = (-1) * np.ones((mdp.numStates), dtype=int)
    states_to_sample = range(mdp.numStates)
    colliding_values = np.zeros((mdp.numStates))
    converge_iterations = 10000
    epsilon_convergence = 1e-4
    is_converged = 0
    print "Vmax", mdp.Vmax

    ### Initial sampling for all state action pairs
    while it < initial_iterations:
        for state in range(mdp.numStates):
            for act in range(mdp.numActions):
                it += 1
                s_prime, r = mdp.simulate(state, act)
                rewards_s_a_sprime[state][act][s_prime] += r
                R_s_a[state][act] = (r + R_s_a[state][act] *
                                     sampled_frequency_s_a[state][act]) / (
                                         sampled_frequency_s_a[state][act] + 1)
                sampled_frequency_s_a[state][act] += 1
                N_s_a_sprime[state][act][s_prime] += 1

    ### Calculate the MBAE V, Q estimates from the samples collected so far
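    # For each (s, a) the bounds are:
    #   empirical mean reward
    #   + gamma * expectation of VupperMBAE / VlowerMBAE under the empirical
    #     transition frequencies
    #   +/- Vmax * sqrt((log(c*|S||A|) - log(delta)) / n(s, a)),
    # i.e. an optimism/pessimism bonus that shrinks as the count n(s, a) grows.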
    for internal in range(converge_iterations):
        oldQlowerMBAE = np.copy(QlowerMBAE[start_state])
        for state in range(mdp.numStates):
            for act in range(mdp.numActions):
                # Calculations for QupperMBAE and QlowerMBAE
                firstterm = np.sum(rewards_s_a_sprime[state]
                                   [act]) / sampled_frequency_s_a[state][act]
                secondterm = mdp.discountFactor * np.sum(
                    VupperMBAE * (N_s_a_sprime[state][act] /
                                  sampled_frequency_s_a[state][act]))
                #secondterm = mdp.discountFactor*sum(VupperMBAE[ss]*N_s_a_sprime[state][act][ss]/sampled_frequency_s_a[state][act] for ss in range(mdp.numStates))
                lower_secondterm = mdp.discountFactor * np.sum(
                    VlowerMBAE * (N_s_a_sprime[state][act] /
                                  sampled_frequency_s_a[state][act]))
                #lower_secondterm = mdp.discountFactor*sum(VlowerMBAE[ss]*N_s_a_sprime[state][act][ss]/sampled_frequency_s_a[state][act] for ss in range(mdp.numStates))
                thirdterm = mdp.Vmax * math.sqrt(
                    (math.log(c * mdp.numStates * mdp.numActions) -
                     math.log(delta)) / sampled_frequency_s_a[state][act])
                #QupperMBAE[state][act] = (float)(sum(rewards_s_a_sprime[state][act][ss] for ss in range(mdp.numStates))/sampled_frequency_s_a[state][act]) + secondterm + thirdterm
                QupperMBAE[state][act] = firstterm + secondterm + thirdterm
                QlowerMBAE[state][
                    act] = firstterm + lower_secondterm - thirdterm

                # Calculation for Vstar
                # t = (float)N_s_a_sprime[state][act][stateprime]/sampled_frequency_s_a[state][act]
                # val = t*(rewards_s_a[state][act][stateprime]+mdp.discountFactor*Vstar[stateprime])

                # if(state==start_state and abs(VupperMBAE[state]-QupperMBAEmax)<epsilon_convergence):
                # 	VupperMBAE[state] = QupperMBAEmax
                # 	print "Stopping with ", internal, "initial internal iterations"
                # 	is_converged = 1
                # 	break
            VupperMBAE[state] = np.amax(QupperMBAE[state])
            VlowerMBAE[state] = np.amax(QlowerMBAE[state])

        if (np.linalg.norm(oldQlowerMBAE - QlowerMBAE[start_state]) <=
                epsilon_convergence):
            print "Stopping with ", internal, "initial internal iterations"
            break

    if internal == converge_iterations - 1:
        print("Used all iterations")

    print "Initial estimate of QupperMBAE found! Now sampling"

    sys.stdout = open(mdp.filename + '-lucbbound.txt', 'w+')
    ff = open(mdp.filename + '-lucbbound-samples.txt', 'w+')

    h = 0
    state1 = start_state

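    # Main sampling loop: every H steps the trajectory restarts at
    # start_state, otherwise it follows the simulator's transitions. At the
    # current state we pick uniformly at random between the two candidate
    # actions returned by bestTwoActions (the current best and its closest
    # challenger), then refresh all bound estimates.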
    while iteration < MAX_ITERATION_LIMIT:
        max_collision_state = [
            sorted(states_to_sample,
                   key=lambda x: colliding_values[x],
                   reverse=True)[0]
        ]
        # print "Sampling state ", max_collision_state[0]
        # print colliding_values
        # print "Sampling ", state1, "for this round"
        if (h % H == 0):
            state1 = start_state
            h = 0
        else:
            state1 = nextstate
        actionsList = bestTwoActions(mdp, state1, Qlower, Qupper, Qstar)
        a = np.random.choice(actionsList)
        iteration += 1
        sampled_frequency_s_a[state1][a] += 1
        for t in range(1):
            s_prime, r = mdp.simulate(state1, a)
            nextstate = s_prime
            rewards_s_a_sprime[state1][a][s_prime] += r
            # sampled_frequency_s_a[state1][a] was already incremented above,
            # so the previous sample count is (n - 1) in the running-mean update.
            R_s_a[state1][a] = (
                r + R_s_a[state1][a] *
                (sampled_frequency_s_a[state1][a] - 1)
            ) / sampled_frequency_s_a[state1][a]
            N_s_a_sprime[state1][a][s_prime] += 1

        ## Calculating Q and V values
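        # Rebuild optimistic / pessimistic transition models for every (s, a)
        # sampled at least once (UpperP / LowerP), then run value iteration on
        # each model (iteratedConvergence) to refresh the Qupper/Vupper and
        # Qlower/Vlower interval estimates.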
        for i in range(mdp.numStates):
            for j in range(mdp.numActions):
                if (sampled_frequency_s_a[i][j] > 0):
                    P_tilda[i][j] = UpperP(i, j, delta, N_s_a_sprime[i][j],
                                           mdp.numStates, Vupper, False)
                    P_lower_tilda[i][j] = LowerP(i, j, delta,
                                                 N_s_a_sprime[i][j],
                                                 mdp.numStates, Vupper, False)

        Qupper, Vupper = iteratedConvergence(Qupper, R_s_a, P_tilda,
                                             mdp.discountFactor, epsilon,
                                             converge_iterations,
                                             epsilon_convergence)
        Qlower, Vlower = iteratedConvergence(Qlower, R_s_a, P_lower_tilda,
                                             mdp.discountFactor, epsilon,
                                             converge_iterations,
                                             epsilon_convergence)

        # Calculations for QupperMBAE and QlowerMBAE
        #### This involved a two for-loop and iterating convergence
        for internal in range(converge_iterations):
            oldQlowerMBAE = np.copy(QlowerMBAE[start_state])
            for state in range(mdp.numStates):
                for act in range(mdp.numActions):
                    # Calculations for QupperMBAE and QlowerMBAE
                    firstterm = np.sum(rewards_s_a_sprime[state][act]
                                       ) / sampled_frequency_s_a[state][act]
                    secondterm = mdp.discountFactor * np.sum(
                        VupperMBAE * (N_s_a_sprime[state][act] /
                                      sampled_frequency_s_a[state][act]))
                    #secondterm = mdp.discountFactor*sum(VupperMBAE[ss]*N_s_a_sprime[state][act][ss]/sampled_frequency_s_a[state][act] for ss in range(mdp.numStates))
                    lower_secondterm = mdp.discountFactor * np.sum(
                        VlowerMBAE * (N_s_a_sprime[state][act] /
                                      sampled_frequency_s_a[state][act]))
                    star_secondterm = mdp.discountFactor * np.sum(
                        Vstar * (N_s_a_sprime[state][act] /
                                 sampled_frequency_s_a[state][act]))
                    #lower_secondterm = mdp.discountFactor*sum(VlowerMBAE[ss]*N_s_a_sprime[state][act][ss]/sampled_frequency_s_a[state][act] for ss in range(mdp.numStates))
                    thirdterm = mdp.Vmax * math.sqrt(
                        (math.log(c * (iteration**2) * mdp.numStates *
                                  mdp.numActions) - math.log(delta)) /
                        sampled_frequency_s_a[state][act])
                    #QupperMBAE[state][act] = (float)(sum(rewards_s_a_sprime[state][act][ss] for ss in range(mdp.numStates))/sampled_frequency_s_a[state][act]) + secondterm + thirdterm
                    QupperMBAE[state][act] = firstterm + secondterm + thirdterm
                    QlowerMBAE[state][
                        act] = firstterm + lower_secondterm - thirdterm
                    Qstar[state][act] = firstterm + star_secondterm
                    # Calculation for Vstar
                    # t = (float)N_s_a_sprime[state][act][stateprime]/sampled_frequency_s_a[state][act]
                    # val = t*(rewards_s_a[state][act][stateprime]+mdp.discountFactor*Vstar[stateprime])
                VupperMBAE[state] = np.amax(QupperMBAE[state])
                VlowerMBAE[state] = np.amax(QlowerMBAE[state])
                Vstar[state] = np.amax(Qstar[state])
            if (np.linalg.norm(oldQlowerMBAE - QlowerMBAE[start_state]) <=
                    epsilon_convergence):
                # print "Stopping with ", internal, "iterations"
                break

        count = 0
        if (iteration % 10000 == 0):
            acList = bestTwoActions(mdp, start_state, Qlower, Qupper, Qstar)
            print(iteration, (QupperMBAE[start_state][acList[1]] -
                              QlowerMBAE[start_state][acList[0]]) / epsilon)
            np.savetxt(ff, sampled_frequency_s_a, delimiter=',')
            ff.write('\n')
            # print QupperMBAE
            # print iteration

        #### Check epsilon condition for all the states
        # for st in range(mdp.numStates):
        # 	acList = bestTwoActions(mdp, st, Qstar, QupperMBAE, Qstar)
        # 	# print "Comparing ",QupperMBAE[st][acList[1]], QlowerMBAE[st][acList[0]]

        # 	if(QupperMBAE[st][acList[1]]-QlowerMBAE[st][acList[0]]<=epsilon):
        # 		# print "Setting action ", acList[0], "for state ", st
        # 		final_policy[st]=acList[0]
        # 		count+=1

        ##### Updating the list of colliding states
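        # A state is "colliding" while the interval between the challenger's
        # upper bound and the best action's lower bound still exceeds
        # epsilon*(1 - gamma)/2; only such states remain candidates for sampling.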
        states_to_sample = []
        for st in range(mdp.numStates):
            acList = bestTwoActions(mdp, st, Qlower, Qupper, Qstar)
            # colliding_values[st] = QupperMBAE[st][acList[1]]-QlowerMBAE[st][acList[0]]-epsilon
            ##### Changing stopping condition to epsilon*(1-gamma)/2
            colliding_values[st] = Qupper[st][acList[1]] - Qlower[st][
                acList[0]] - epsilon * (1 - mdp.discountFactor) / 2
            # print colliding_values[st]
            if (colliding_values[st] > 0):
                ### this state is still colliding, add to sample states
                states_to_sample.append(st)

        #### Check epsilon condition for only starting state
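        # Stop as soon as the interval at start_state has closed: fix its
        # greedy action and fill every still-undecided state with its current
        # best action before returning.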
        if (not (start_state in states_to_sample)):
            # if(count==mdp.numStates):
            acList = bestTwoActions(mdp, start_state, Qlower, Qupper, Qstar)
            print "Setting final_policy of ", start_state, " to", acList[0]
            final_policy[start_state] = acList[0]
            print "Iterations taken : ", iteration
            print "Returning the policy :", final_policy
            for i in range(mdp.numStates):
                if (final_policy[i] == -1):
                    final_policy[i] = bestTwoActions(mdp, i, Qlower, Qupper,
                                                     Qstar)[0]
            return final_policy

        h += 1

    for i in range(mdp.numStates):
        if (final_policy[i] == -1):
            final_policy[i] = bestTwoActions(mdp, i, Qlower, Qupper, Qstar)[0]
    return final_policy
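
# Example usage (a minimal sketch; `MDP` and its constructor are assumptions --
# the functions above only rely on numStates, numActions, Vmax, discountFactor,
# filename and simulate(state, action)):
# mdp = MDP('riverswim.txt')
# policy = LUCBBound(mdp, start_state=0, epsilon=4, delta=0.1)
# print(policy)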