Example #1
def valueiteration(v, reward, a):
    #    a = [[None, None, None], [None, None, None],  [None, None, None]]
    # initializing a dummy last value matrix to enter the error loop
    #    vlast = [[1, 0, 0], [0, 0, 0], [0, 0, 0]]
    #    ipdb.set_trace()
    vlast = generateDummy(v)
    print(error(v, vlast))
    while error(v, vlast) >= 10**(-5):
        vlast = deepcopy(v)  # copy the current v into vlast
        m = np.shape(v)  # size of value matrix
        for i in range(0, m[0]):  # number of rows
            for j in range(0, m[1]):  # number of columns
                for k in range(0, 4):  # number of actions
                    if k == 0 and i > 0:  # up movement, all rows except the first
                        temp0 = reward[i][j][k] + gama * v[i - 1][j]
                    elif k == 0 and i == 0:  # up movement is invalid in the first row
                        temp0 = None
                    if k == 1 and i < m[0] - 1:  # down movement, all rows except the last
                        temp1 = reward[i][j][k] + gama * v[i + 1][j]
                    elif k == 1 and i == m[0] - 1:  # down movement is invalid in the last row
                        temp1 = None
                    if k == 2 and j > 0:  # left movement, all columns except the first
                        temp2 = reward[i][j][k] + gama * v[i][j - 1]
                    elif k == 2 and j == 0:  # left movement is invalid in the first column
                        temp2 = None
                    if k == 3 and j < m[1] - 1:  # right movement, all columns except the last
                        temp3 = reward[i][j][k] + gama * v[i][j + 1]
                    elif k == 3 and j == m[1] - 1:  # right movement is invalid in the last column
                        temp3 = None
                # take the max over all valid (non-None) actions
                valid = [t for t in (temp0, temp1, temp2, temp3) if t is not None]
                v[i][j] = max(valid) if valid else 0
                # a stores which action yields that maximum value
                if v[i][j] == temp0:
                    a[i][j] = 0
                if v[i][j] == temp1:
                    a[i][j] = 1
                if v[i][j] == temp2:
                    a[i][j] = 2
                if v[i][j] == temp3:
                    a[i][j] = 3


#    ipdb.set_trace()
    return v, a
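The function above relies on two helpers that are not part of this listing: error, which measures how much the value matrix changed between two sweeps, and generateDummy, which returns a same-shaped matrix guaranteed to differ from v so the first error check passes. gama (the discount factor) is likewise assumed to be a module-level constant. A minimal sketch of what the helpers might look like (assumptions, not the original implementations):

import numpy as np

def error(v, vlast):
    # assumed helper: largest absolute change between two value sweeps
    return np.max(np.abs(np.asarray(v, dtype=float) - np.asarray(vlast, dtype=float)))

def generateDummy(v):
    # assumed helper: same-shaped matrix offset by 1, so error(v, dummy) >= 1
    return (np.asarray(v, dtype=float) + 1.0).tolist()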
Example #2
def qLearning(n, p, p1, encoder, ENClast):
    import qinitial
    v = initvalact.initvalact(n)
    Q = qinitial.qinitial(n)
    #Q = [[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]] 
    a = v[1]
    # a = [[None, None, None], [None, None, None],  [None, None, None]]  # initializing action matrix
    # size = shape of the Q matrix: [number of rows, number of columns, number of actions per state]
    size = np.shape(Q)  # storing size of Q-matrix
    n = size[0]
    Qlast = generateDummy(Q)  # dummy of the same size as Q, so we enter the while loop
    iteration = 0  # initializing the iteration
    reward = generate_rewardmatrix.generate_rewardmatrix(n)
    while qError(Q, Qlast) > 1.5 or Q == Qlast:  # loop while the change in Q is above the threshold or Q == Qlast
        # the Q == Qlast condition keeps the loop alive in the starting phase: when rewards are still zero,
        # the error reads 0 and we would otherwise fall out of the loop too early
        iteration += 1  # increasing the iteration count
        Qlast = deepcopy(Q)  # copying Q to Qlast
        # state = select a state randomly each time, depending on the Q size
        state = random.randint(1, size[0] * size[1])
        # decode the state number into row and column
        # state / number-of-columns gives the row:
        # for a 3x4 grid (rows x columns), states 1 to 4 lie in row 1 and states 5 to 8 in row 2,
        # so for row 1 (states 1 to 4) state/4 gives 0 < temp <= 1
        # and for row 2 (states 5 to 8) state/4 gives 1 < temp <= 2
        temp = state / (size[1] * 1.0)  # defining a temporary variable
        if ((temp).is_integer()):
            raw = int(temp) - 1
        else:
            raw = int(temp)
        # state % number-of-columns gives the column:
        # states 1, 5, 9 give state % 4 = 1, i.e. column index 1 - 1 = 0
        # states 2, 6, 10 give state % 4 = 2, i.e. column index 2 - 1 = 1
        temp = state % size[1]
        col = temp - 1
        if col < 0:
            col = size[1] - 1
        gotopos.gotopos(raw, col, p, p1, n)  # go to the randomly selected state
        time.sleep(0.3)
        for i in range(0, 20):
            # action selection depending on the selected state
            if raw == 0 and col == 0:
                action = random.choice([1, 3])
            elif raw == 0 and (col == -1 or col == size[1]-1):
                action = random.choice([1, 2])
            elif raw == 0:
                action = random.choice([1, 2, 3])

            elif raw == size[0]-1 and col == 0:
                action = random.choice([0, 3])
            elif raw == size[0]-1 and (col == -1 or col == size[1]-1):
                action = random.choice([0, 2])
            elif raw == size[0]-1:
                action = random.choice([0, 2, 3])

            elif col == 0:
                action = random.choice([0, 1, 3])
            elif (col == -1 or col == size[1]-1):
                action = random.choice([0, 1, 2])

            else:
                action = random.randint(0, 3)  # cells where all four actions are possible

            # defining nextstate according to the chosen action
            if action == 0:  # Up movement
                nextstate = Q[raw-1][col]
                rawtemp = raw - 1  # raw of nextstep
                coltemp = col  # col of nextstep
            elif action == 1:  # Down movement
                nextstate = Q[raw+1][col]
                rawtemp = raw + 1  # raw of nextstep
                coltemp = col  # col of nextstep
            elif action == 2:  # Left movement
                nextstate = Q[raw][col-1]
                rawtemp = raw  # raw of nextstep
                coltemp = col - 1  # col of nextstep
            else:  # Right movement
                # ipdb.set_trace()
                nextstate = Q[raw][col+1]
                rawtemp = raw  # raw of nextstep
                coltemp = col + 1  # col of nextstep
            # ipdb.set_trace()
            # try executing the Q-iteration formula with no errors..
            '''
            _____ADD HERE____
            ACTION_PERFORMANCE FUNCTION
            UPDATE_REWARD FUNCTION
            '''
            ENClast = encoder.getData()
            act.playAction(action, raw, col, size[0], p, p1)
            time.sleep(0.1)
            if action == 0 or action == 1:
                ENClast = encoder.getData()
            ENC = encoder.getData()
            reward[raw][col][action] = ENC - ENClast
            # update_reward.update_reward(reward, raw, col, action, diff)

            try:
                Q[raw][col][action] = reward[raw][col][action] + gama * max(nextstate)
                # print("Q", Q)
            # catch a TypeError (i.e. datatype mismatch) in the update above
            except TypeError as e:
                print("TypeError")
            raw = rawtemp
            col = coltemp
        print "qerror is", qError(Q,Qlast)
        print "reward is", reward
    # extract the greedy action for each cell from the calculated Q matrix
    for r in range(0, size[0]):
        for c in range(0, size[1]):
            # ipdb.set_trace()
            a[r][c] = Q[r][c].index(max(Q[r][c]))
    # ipdb.set_trace()
    # the function returns the Q matrix, the action matrix and the number of iterations
    return Q, a, iteration
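Every qLearning variant below decodes a 1-based state number into a (row, column) pair with the temp / is_integer logic shown above. A compact equivalent using divmod, given here only to illustrate what that block computes (the helper name is hypothetical):

def state_to_cell(state, num_cols):
    # state is 1-based; returns a 0-based (row, col), matching the decoding logic above
    row, col = divmod(state - 1, num_cols)
    return row, col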
Example #3
def qLearning(n, reward1):
    v = initvalact.initvalact(n)
    Q = qinitial.qinitial(n)
    kMatrix = qinitial.qinitial(n)
    bita = 0.5  # learning-rate decay exponent
    # Q = [[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]
    a = v[1]
    # a = [[None, None, None], [None, None, None],  [None, None, None]]  # initializing action matrix
    # size = shape of the Q matrix: [number of rows, number of columns, number of actions per state]
    size = np.shape(Q)  # storing size of Q-matrix
    n = size[0]
    Qlast = generateDummy(Q)  # dummy of the same size as Q, so we enter the while loop
    iteration = 0  # initializing the iteration
    reward = generate_rewardmatrix.generate_rewardmatrix(n)
    # loop while the change in Q is above the threshold, while Q == Qlast, or for at least 12 iterations
    while qError(Q, Qlast) > 1.5 or Q == Qlast or iteration <= 12:
        # the Q == Qlast condition keeps the loop alive in the starting phase: when rewards are still zero,
        # the error reads 0 and we would otherwise fall out of the loop too early
        iteration += 1  # increasing the iteration count
        Qlast = deepcopy(Q)  # copying Q to Qlast
        # state = select a state randomly each time, depending on the Q size
        state = random.randint(1, size[0] * size[1])
        # decode the state number into row and column
        # state / number-of-columns gives the row:
        # for a 3x4 grid (rows x columns), states 1 to 4 lie in row 1 and states 5 to 8 in row 2,
        # so for row 1 (states 1 to 4) state/4 gives 0 < temp <= 1
        # and for row 2 (states 5 to 8) state/4 gives 1 < temp <= 2
        temp = state / (size[1] * 1.0)  # defining a temporary variable
        if ((temp).is_integer()):
            raw = int(temp) - 1
        else:
            raw = int(temp)
        # state % number-of-columns gives the column:
        # states 1, 5, 9 give state % 4 = 1, i.e. column index 1 - 1 = 0
        # states 2, 6, 10 give state % 4 = 2, i.e. column index 2 - 1 = 1
        temp = state % size[1]
        col = temp - 1
        if col < 0:
            col = size[1] - 1
        # number of greedy (rather than random) action selections per episode; grows with the iteration count
        NumOfSelAct = 100 * (1 - 1 / (np.exp(0.05 * (iteration - 2))))
        NumOfSelAct = round(NumOfSelAct * 25 / 100)
        print(NumOfSelAct)

        for i in range(0, 20):
            # action selection depending on the selected state
            if i < NumOfSelAct:
                # greedy selection among the actions that are valid in this cell
                possibleActions = action_select(raw, col, n)
                tempList = []
                for j in possibleActions:
                    tempList.append(Q[raw][col][j])
                action = possibleActions[tempList.index(max(tempList))]
            else:
                possibleActions = action_select(raw, col, n)
                action = random.choice(possibleActions)

            # defining nextstate according to the chosen action
            if action == 0:  # Up movement
                nextstate = Q[raw - 1][col]
                rawtemp = raw - 1  # raw of nextstep
                coltemp = col  # col of nextstep
            elif action == 1:  # Down movement
                nextstate = Q[raw + 1][col]
                rawtemp = raw + 1  # raw of nextstep
                coltemp = col  # col of nextstep
            elif action == 2:  # Left movement
                nextstate = Q[raw][col - 1]
                rawtemp = raw  # raw of nextstep
                coltemp = col - 1  # col of nextstep
            else:  # Right movement
                # ipdb.set_trace()
                nextstate = Q[raw][col + 1]
                rawtemp = raw  # raw of nextstep
                coltemp = col + 1  # col of nextstep
            # ipdb.set_trace()
            # try executing the Q-iteration formula with no errors..
            '''
            _____ADD HERE____
            ACTION_PERFORMANCE FUNCTION
            UPDATE_REWARD FUNCTION
            '''
            reward[raw][col][action] = reward1[raw][col][action]
            kMatrix[raw][col][action] = kMatrix[raw][col][action] + 1
            try:
                alpha = 1 / ((kMatrix[raw][col][action])**bita)
                Q[raw][col][action] = (1-alpha) * Q[raw][col][action] + \
                    alpha * (reward[raw][col][action] + gama * (max(nextstate)))
            # catch a TypeError (i.e. datatype mismatch) in the update above
            except TypeError as e:
                print("TypeError")
                # ipdb.set_trace()
            raw = rawtemp
            col = coltemp

    # extract the greedy action for each cell from the calculated Q matrix
    for r in range(0, size[0]):
        for c in range(0, size[1]):
            # ipdb.set_trace()
            possibleActions = action_select(r, c, n)
            tempList = []
            for i in possibleActions:
                tempList.append(Q[r][c][i])
            a[r][c] = possibleActions[tempList.index(max(tempList))]
    # ipdb.set_trace()
    # the function returns the Q matrix, the action matrix and the number of iterations
    print(kMatrix)
    print(NumOfSelAct)
    return Q, a, iteration
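The listings assume an action_select helper that returns the actions which stay inside the grid at a given cell (0 = up, 1 = down, 2 = left, 3 = right). It is not shown in this section; a plausible sketch for a square n x n grid, as the single size argument suggests, consistent with the boundary checks used above:

def action_select(row, col, n):
    # assumed helper: valid actions at cell (row, col) of an n x n grid
    actions = []
    if row > 0:
        actions.append(0)   # up
    if row < n - 1:
        actions.append(1)   # down
    if col > 0:
        actions.append(2)   # left
    if col < n - 1:
        actions.append(3)   # right
    return actions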
Example #4
def qLearning(n):
    # simulated reward table: one dict per cell, keyed by action (0=up, 1=down, 2=left, 3=right);
    # None marks moves that would leave the grid
    simulatedRewards = [
        [{0: None, 1: 0, 2: None, 3: 0},   # state 1
         {0: None, 1: 0, 2: 0, 3: 0},      # state 2
         {0: None, 1: 0, 2: 0, 3: None}],  # state 3
        [{0: 0, 1: 0, 2: None, 3: 0},      # state 4
         {0: 0, 1: 0, 2: 0, 3: 0},         # state 5
         {0: 0, 1: 0, 2: 0, 3: None}],     # state 6
        [{0: 0, 1: None, 2: None, 3: -1},  # state 7
         {0: 0, 1: None, 2: 1, 3: -1},     # state 8
         {0: 0, 1: None, 2: 1, 3: None}],  # state 9
    ]
    v = initvalact.initvalact(n)
    Q = qinitial.qinitial(n)
    '''Q = [[[0, 1, 0, 0], [0, 1, 0, 0], [0, 1, 0, 0]],  # State1,State2, Stete3
         [[0, 1, 0, 0], [0, 1, 0, 0], [0, 1, 0, 0]],  # State4, State5, State6
         [[1, 0, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0]]]  # State7, State8, State9'''
    kMatrix = qinitial.qinitial(n)
    Tr = qinitial.qinitial(n)
    a = v[1]
    size = np.shape(Q)  # storing size of Q-matrix
    n = size[0]
    Qlast = generateDummy(Q)  # dummy of the same size as Q, so we enter the while loop
    iteration = 0  # initializing the iteration
    reward = generate_rewardmatrix.generate_rewardmatrix(n)
    state = random.randint(1, size[0] * size[1])
    # loop while the change in Q is above the threshold, while Q == Qlast, or for at least 100 iterations
    while qError(Q, Qlast) > 1.5 or Q == Qlast or iteration <= 100:
        iteration += 1  # increasing the iteration count
        Qlast = deepcopy(Q)  # copying Q to Qlast
        # on the first iteration, decode the randomly selected state number into row and column
        # state / number-of-columns gives the row:
        # for a 3x4 grid (rows x columns), states 1 to 4 lie in row 1 and states 5 to 8 in row 2,
        # so for row 1 (states 1 to 4) state/4 gives 0 < temp <= 1
        # and for row 2 (states 5 to 8) state/4 gives 1 < temp <= 2
        if iteration == 1:
            temp = state / (size[1] * 1.0)  # defining a temporary variable
            if ((temp).is_integer()):
                raw = int(temp) - 1
            else:
                raw = int(temp)
            # state % number-of-columns gives the column:
            # states 1, 5, 9 give state % 4 = 1, i.e. column index 1 - 1 = 0
            # states 2, 6, 10 give state % 4 = 2, i.e. column index 2 - 1 = 1
            temp = state % size[1]
            col = temp - 1
            if col < 0:
                col = size[1] - 1
# ***        gotopos.gotopos(raw, col, p, p1, n)  # to go to state that is selected randomly
# ***        time.sleep(0.3)
        possibleActions = action_select(raw, col, n)
        probablity = epsilon_greedy_policy(Q[raw][col], possibleActions,
                                           epsilon)
        actionIndex = np.random.choice(len(probablity), p=probablity)
        action = possibleActions[actionIndex]

        #    for i in range(0, 20):
        # action selection depending on the selected state
        '''
        -------------------------------------------------------------
        ***************REPLACED BY EPSILON GREEDY POLICY*************
        -------------------------------------------------------------
        if i < NumOfSelAct:
            possibleActions = action_select(raw, col, n)
            tempList = []
            for i in possibleActions:
                tempList.append(Q[raw][col][i])
            action = possibleActions[tempList.index(max(tempList))]
        else:
            possibleActions = action_select(raw, col, n)
            action = random.choice(possibleActions)
        -------------------------------------------------------------
        ***************REPLACED BY EPSILON GREEDY POLICY*************
        -------------------------------------------------------------
        '''

        # defining nextstate according to the chosen action
        if action == 0:  # Up movement
            nextstate = Q[raw - 1][col]
            rawtemp = raw - 1  # raw of nextstep
            coltemp = col  # col of nextstep
        elif action == 1:  # Down movement
            nextstate = Q[raw + 1][col]
            rawtemp = raw + 1  # raw of nextstep
            coltemp = col  # col of nextstep
        elif action == 2:  # Left movement
            nextstate = Q[raw][col - 1]
            rawtemp = raw  # raw of nextstep
            coltemp = col - 1  # col of nextstep
        else:  # Right movement
            # ipdb.set_trace()
            nextstate = Q[raw][col + 1]
            rawtemp = raw  # raw of nextstep
            coltemp = col + 1  # col of nextstep
        # ipdb.set_trace()
        # try executing the Q-iteration formula with no errors..
        '''
        _____ADD HERE____
        ACTION_PERFORMANCE FUNCTION
        UPDATE_REWARD FUNCTION
        ENClast= encoder.getData()
        act.playAction(action, raw, col, size[0], p, p1)
        time.sleep(0.1)
#         if action == 0 or action == 1:
#            ENClast= encoder.getData()
        ENC= encoder.getData()
        reward[raw][col][action]= ENC - ENClast
        # update_reward.update_reward(reward, raw, col, action, diff)
        '''
        reward[raw][col][action] = simulatedRewards[raw][col][action]
        kMatrix[raw][col][action] = kMatrix[raw][col][action] + 1
        try:
            alpha = 1 / ((kMatrix[raw][col][action])**bita)
            # TD errors: eComplement for the taken action, e for the greedy backup
            eComplement = reward[raw][col][action] + gama * max(nextstate) - Q[raw][col][action]
            e = reward[raw][col][action] + gama * max(nextstate) - max(Q[raw][col])
            # decay the eligibility traces and apply the trace-weighted update to every state-action pair
            for r in range(size[0]):
                for c in range(size[1]):
                    for actn in action_select(r, c, n):
                        Tr[r][c][actn] = gama * lamda * Tr[r][c][actn]
                        Q[r][c][actn] = Q[r][c][actn] + alpha * Tr[r][c][actn] * e
            Q[raw][col][action] = Q[raw][col][action] + alpha * eComplement
            Tr[raw][col][action] += 1
            # print("Q", Q)
        # catch a TypeError (i.e. datatype mismatch) in the update above
        except TypeError as e:
            print("TypeError")
        print(possibleActions)
        print(probablity)
        print("raw =", raw, "col =", col, "action =", action)
        raw = rawtemp
        col = coltemp
        print("qerror is", qError(Q, Qlast))
        print("reward is", reward)
        print("iteration =", iteration)
        # time.sleep(0.1)
    # extract the greedy action for each cell from the calculated Q matrix
    print(Tr)
    print(Q)
    for r in range(0, size[0]):
        for c in range(0, size[1]):
            possibleActions = action_select(r, c, n)
            tempList = []
            for i in possibleActions:
                tempList.append(Q[r][c][i])
            a[r][c] = possibleActions[tempList.index(max(tempList))]
    # ipdb.set_trace()
    # the function returns the Q matrix, the action matrix and the number of iterations
    return Q, a, iteration
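Example #4 and Example #6 draw their actions from epsilon_greedy_policy, which is also not defined in this section. It is called with the Q values of the current cell, the list of valid actions and epsilon, and has to return a probability vector with one entry per valid action. A standard epsilon-greedy sketch under those assumptions (not the original implementation):

import numpy as np

def epsilon_greedy_policy(q_cell, possibleActions, epsilon):
    # assumed helper: epsilon/|A| probability for every valid action,
    # plus an extra 1 - epsilon on the greedy one
    probs = np.full(len(possibleActions), epsilon / len(possibleActions))
    greedy = int(np.argmax([q_cell[a] for a in possibleActions]))
    probs[greedy] += 1.0 - epsilon
    return probs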
Example #5
def qLearning(n, p, p1, encoder, ENClast):
    v = initvalact.initvalact(n)
    Q = qinitial.qinitial(n)
    kMatrix = qinitial.qinitial(n)
    restriCount = qinitial.qinitial(n)
    bita = 0.5  # learning-rate decay exponent
    # Q = [[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]
    a = v[1]
    # a = [[None, None, None], [None, None, None], [None, None, None]]  # initializing the action matrix
    # size = shape of the Q matrix: [number of rows, number of columns, number of actions per state]
    size = np.shape(Q)  # storing size of Q-matrix
    n = size[0]
    Qlast = generateDummy(Q)  # dummy of the same size as Q, so we enter the while loop
    iteration = 0  # initializing the iteration
    reward = generate_rewardmatrix.generate_rewardmatrix(n)
    # loop while the change in Q is above the threshold, while Q == Qlast, or for at least 4*n iterations
    global val1
    val1 = pinSetup.valueRead_ON()
    while (qError(Q, Qlast) > 1.5 or Q == Qlast or iteration <= 4*n) and (val1 == 0):
        # the Q == Qlast condition keeps the loop alive in the starting phase: when rewards are still zero,
        # the error reads 0 and we would otherwise fall out of the loop too early
        iteration += 1  # increasing the iteration count
        Qlast = deepcopy(Q)  # copying Q to Qlast
        # state = select a state randomly each time, depending on the Q size
        state = random.randint(1, size[0] * size[1])
        # decode the state number into row and column
        # state / number-of-columns gives the row:
        # for a 3x4 grid (rows x columns), states 1 to 4 lie in row 1 and states 5 to 8 in row 2,
        # so for row 1 (states 1 to 4) state/4 gives 0 < temp <= 1
        # and for row 2 (states 5 to 8) state/4 gives 1 < temp <= 2
        temp = state / (size[1] * 1.0)  # defining a temporary variable
        if ((temp).is_integer()):
            raw = int(temp) - 1
        else:
            raw = int(temp)
        # state % number-of-columns gives the column:
        # states 1, 5, 9 give state % 4 = 1, i.e. column index 1 - 1 = 0
        # states 2, 6, 10 give state % 4 = 2, i.e. column index 2 - 1 = 1
        temp = state % size[1]
        col = temp - 1
        if col < 0:
            col = size[1] - 1
        gotopos.gotopos(raw, col, p, p1, n)  # to go to state that is selected randomly
        time.sleep(0.3)
        # ipdb.set_trace()
        NumOfSelAct = 100*(1-1/(np.exp(0.05*(iteration-2))))
        NumOfSelAct = round(NumOfSelAct*25/100)

        for i in range(0, 20):
            # action selection depending on the selected state
            if i < NumOfSelAct:
                possibleActions = action_select(raw, col, n)
                tempList = []
                for j in possibleActions:
                    tempList.append(Q[raw][col][j])
                action = possibleActions[tempList.index(max(tempList))]
		print ("for i") , i, ("selected action is"), action
            else:
                possibleActions = action_select(raw, col, n)
                action = random.choice(possibleActions)

            # defining nextstate according to the chosen action
            if action == 0:  # Up movement
                nextstate = Q[raw-1][col]
                rawtemp = raw - 1  # raw of nextstep
                coltemp = col  # col of nextstep
            elif action == 1:  # Down movement
                nextstate = Q[raw+1][col]
                rawtemp = raw + 1  # raw of nextstep
                coltemp = col  # col of nextstep
            elif action == 2:  # Left movement
                nextstate = Q[raw][col-1]
                rawtemp = raw  # raw of nextstep
                coltemp = col - 1  # col of nextstep
            else:  # Right movement
                # ipdb.set_trace()
                nextstate = Q[raw][col+1]
                rawtemp = raw  # raw of nextstep
                coltemp = col + 1  # col of nextstep
            # ipdb.set_trace()
            # try executing the Q-iteration formula with no errors..
            '''
            _____ADD HERE____
            ACTION_PERFORMANCE FUNCTION
            UPDATE_REWARD FUNCTION
            '''
            ENClast = encoder.getData()
            act.playAction(action, raw, col, size[0], p, p1)
            time.sleep(0.1)
            if action == 0 or action == 1:
                ENClast = encoder.getData()
            ENC = encoder.getData()
            # quantize the encoder difference into a reward in {-2, -1, 0, 1, 2}
            if ENC - ENClast > 0:
                diff = 1
                if ENC - ENClast > 1:
                    diff = 2
            elif ENC - ENClast < 0:
                diff = -1
                if ENC - ENClast < -1:
                    diff = -2
            else:
                diff = 0
            oldreward = reward[raw][col][action]
            # if the new reading contradicts an earlier non-zero reward, re-measure once
            if (oldreward != 0 and diff == 0) or (np.sign(oldreward) != np.sign(diff)):
                # restriCount[raw][col][action] += 1
                # if restriCount[raw][col][action] < 3:
                print("!! restriction applied !!")
                restriCount[raw][col][action] = 0
                gotopos.gotopos(raw, col, p, p1, n)
                time.sleep(0.3)
                ENClast = encoder.getData()
                act.playAction(action, raw, col, size[0], p, p1)
                time.sleep(0.1)
                if action == 0 or action == 1:
                    ENClast = encoder.getData()
                ENC = encoder.getData()
                diff = ENC - ENClast
            direction = pinSetup.valueRead_dir()
            reward[raw][col][action] = ((-1)**direction) * diff
            # update_reward.update_reward(reward, raw, col, action, diff)
            kMatrix[raw][col][action] = kMatrix[raw][col][action] + 1
            try:
                alpha = 1 / ((kMatrix[raw][col][action])**bita)
                Q[raw][col][action] = (1-alpha) * Q[raw][col][action] + \
                    alpha * (reward[raw][col][action] + gama * (max(nextstate)))
                # print("Q", Q)
            # catch a TypeError (i.e. datatype mismatch) in the update above
            except TypeError as e:
                print("TypeError")
            raw = rawtemp
            col = coltemp
	print "iteration is", iteration
        print "qerror is", qError(Q, Qlast)
        print "reward is", reward
	val1 = pinSetup.valueRead_ON()
    if val1 == 1:  # the stop signal was read from the pin
        # import os
        print("Stop")
        # os.system("shutdown now")
    # extract the greedy action for each cell from the calculated Q matrix
    for r in range(0, size[0]):
        for c in range(0, size[1]):
            # ipdb.set_trace()
            possibleActions = action_select(r, c, n)
            tempList = []
            for i in possibleActions:
                tempList.append(Q[r][c][i])
            a[r][c] = possibleActions[tempList.index(max(tempList))]
    # ipdb.set_trace()
    # the function returns the Q matrix, the action matrix and the number of iterations
    print(kMatrix)
    # print(NumOfSelAct)
    return Q, a, iteration
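qError, the stopping criterion in the qLearning variants above, is another helper that does not appear in this listing. A plausible sketch that treats it as the largest absolute change in any Q entry between two iterations (an assumption, not the original implementation):

def qError(Q, Qlast):
    # assumed helper: maximum absolute element-wise change between Q and Qlast
    diffs = [abs(q - ql)
             for row, row_last in zip(Q, Qlast)
             for cell, cell_last in zip(row, row_last)
             for q, ql in zip(cell, cell_last)]
    return max(diffs) if diffs else 0.0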
Example #6
def qLearning(n, p, p1, encoder, ENClast):
    v = initvalact.initvalact(n)
    Q = qinitial.qinitial(n)
    kMatrix = qinitial.qinitial(n)
    Tr = qinitial.qinitial(n)
    a = v[1]
    size = np.shape(Q)  # storing size of Q-matrix
    n = size[0]
    Qlast = generateDummy(Q)  # dummy of the same size as Q, so we enter the while loop
    iteration = 0  # initializing the iteration
    reward = generate_rewardmatrix.generate_rewardmatrix(n)
    state = random.randint(1, size[0] * size[1])
    global val1
    val1 = pinSetup.valueRead_ON()
    while val1 == 0:  # loop while the stop pin reads 0
        iteration += 1  # increasing the iteration count
        epsilon = (0.9 / (np.exp(0.05 * (iteration - 1)))) + 0.1  # epsilon decays toward 0.1
        Qlast = deepcopy(Q)  # copying Q to Qlast
        # on the first iteration, decode the randomly selected state number into row and column
        # state / number-of-columns gives the row:
        # for a 3x4 grid (rows x columns), states 1 to 4 lie in row 1 and states 5 to 8 in row 2,
        # so for row 1 (states 1 to 4) state/4 gives 0 < temp <= 1
        # and for row 2 (states 5 to 8) state/4 gives 1 < temp <= 2
        if iteration == 1:
            temp = state / (size[1] * 1.0)  # defining a temporary variable
            if ((temp).is_integer()):
                raw = int(temp) - 1
            else:
                raw = int(temp)
            # state % number-of-columns gives the column:
            # states 1, 5, 9 give state % 4 = 1, i.e. column index 1 - 1 = 0
            # states 2, 6, 10 give state % 4 = 2, i.e. column index 2 - 1 = 1
            temp = state % size[1]
            col = temp - 1
            if col < 0:
                col = size[1] - 1
            gotopos.gotopos(raw, col, p, p1,
                            n)  # to go to state that is selected randomly
            time.sleep(0.3)

        possibleActions = action_select(raw, col, n)
        probablity = epsilon_greedy_policy(Q[raw][col], possibleActions,
                                           epsilon)
        actionIndex = np.random.choice(len(probablity), p=probablity)
        action = possibleActions[actionIndex]

        # defining nextstate according to the chosen action
        if action == 0:  # Up movement
            nextstate = Q[raw - 1][col]
            rawtemp = raw - 1  # raw of nextstep
            coltemp = col  # col of nextstep
        elif action == 1:  # Down movement
            nextstate = Q[raw + 1][col]
            rawtemp = raw + 1  # raw of nextstep
            coltemp = col  # col of nextstep
        elif action == 2:  # Left movement
            nextstate = Q[raw][col - 1]
            rawtemp = raw  # raw of nextstep
            coltemp = col - 1  # col of nextstep
        else:  # Right movement
            nextstate = Q[raw][col + 1]
            rawtemp = raw  # raw of nextstep
            coltemp = col + 1  # col of nextstep
        # try executing the Q-iteration formula with no errors..
        '''
        _____ADD HERE____
        ACTION_PERFORMANCE FUNCTION
        UPDATE_REWARD FUNCTION
        '''
        ENClast = encoder.getData()
        act.playAction(action, raw, col, size[0], p, p1)
        time.sleep(0.1)
        if action == 0 or action == 1:
            ENClast = encoder.getData()
        ENC = encoder.getData()
        diff = ENC - ENClast
        oldreward = reward[raw][col][action]
        # if the new reading contradicts an earlier non-zero reward, re-measure once
        if (oldreward != 0 and diff == 0) or \
                (np.sign(oldreward) != np.sign(diff) and oldreward != 0):
            print("!! restriction applied !!")
            gotopos.gotopos(raw, col, p, p1, n)
            time.sleep(0.3)
            ENClast = encoder.getData()
            act.playAction(action, raw, col, size[0], p, p1)
            time.sleep(0.1)
            if action == 0 or action == 1:
                ENClast = encoder.getData()
            ENC = encoder.getData()
            diff = ENC - ENClast
        direction = pinSetup.valueRead_dir()
        reward[raw][col][action] = ((-1)**direction) * diff
        kMatrix[raw][col][action] = kMatrix[raw][col][action] + 1

        try:
            alpha = 1 / ((kMatrix[raw][col][action])**bita)
            # TD errors: eComplement for the taken action, e for the greedy backup
            eComplement = reward[raw][col][action] + gama * max(nextstate) - Q[raw][col][action]
            e = reward[raw][col][action] + gama * max(nextstate) - max(Q[raw][col])
            # decay the eligibility traces and apply the trace-weighted update to every state-action pair
            for r in range(size[0]):
                for c in range(size[1]):
                    for actn in action_select(r, c, n):
                        Tr[r][c][actn] = gama * lamda * Tr[r][c][actn]
                        Q[r][c][actn] = Q[r][c][actn] + alpha * Tr[r][c][actn] * e

            Q[raw][col][action] = Q[raw][col][action] + alpha * eComplement
            Tr[raw][col][action] += 1
        # catch a TypeError (i.e. datatype mismatch) in the update above
        except TypeError as e:
            print("TypeError")
        print(possibleActions)
        print(probablity)
        print("raw =", raw, "col =", col, "action =", action)
        raw = rawtemp
        col = coltemp
        print("qerror is", qError(Q, Qlast))
        print("reward is", reward)
        print("iteration =", iteration)
        val1 = pinSetup.valueRead_ON()
    # extract the greedy action for each cell from the calculated Q matrix
    print(Tr)
    print(Q)
    for r in range(0, size[0]):
        for c in range(0, size[1]):
            possibleActions = action_select(r, c, n)
            tempList = []
            for i in possibleActions:
                tempList.append(Q[r][c][i])
            a[r][c] = possibleActions[tempList.index(max(tempList))]
    # the function returns the Q matrix, the action matrix and the number of iterations
    return Q, a, iteration