Example #1
def learnBandit():
    env = UserAdvert()
    rew_vec = []
    W = np.zeros([4, 3])
    for train_step in range(TRAIN_STEPS):
        state = env.getState()
        stateVec = state["stateVec"]
        stateId = state["stateId"]

        # ---- UPDATE code below ------
        # Sample from policy = softmax(stateVec X W) [W learnable params]
        a = np.matmul(np.transpose(W), stateVec)  # action preferences, shape (3,)
        policy = softmax(a)  # probability of each of the 3 actions
        # policy = function(stateVec)
        action = int(np.random.choice(3, 1, p=policy))
        reward = env.getReward(stateId, action)
        # REINFORCE update: W[i, j] += reward * (1{j == action} - policy[j]) * stateVec[i]
        for i in range(4):
            for j in range(3):
                if j == action:
                    W[i, j] = W[i, j] + reward * (1 - policy[action]) * stateVec[i]
                else:
                    W[i, j] = W[i, j] - reward * policy[j] * stateVec[i]

        # ----------------------------

        # ---- UPDATE code below ------
        # Update policy using reward
        # (the policy is updated implicitly through the weight update above)
        # ----------------------------

        if train_step % LOG_INTERVAL == 0:
            print("Testing at: " + str(train_step))
            count = 0
            test = UserAdvert()
            for e in range(450):
                teststate = test.getState()
                testV = teststate["stateVec"]
                testI = teststate["stateId"]
                # ---- UPDATE code below ------
                policy = softmax(np.matmul(np.transpose(W), testV))

                # ----------------------------
                act = int(np.random.choice(3, 1, p=policy))
                reward = test.getReward(testI, act)
                count += (reward / 450.0)
            rew_vec.append(count)

    # ---- UPDATE code below ------
    # Plot this rew_vec list
    print(rew_vec)
    plt.plot(range(0, TRAIN_STEPS, LOG_INTERVAL), rew_vec)
    plt.show()
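Several of the snippets call a softmax helper without showing where it comes from (it may simply be scipy.special.softmax). If it is not available, a minimal numerically stable stand-in could look like the sketch below; only the function name is taken from the examples, the body is an assumption.

import numpy as np

def softmax(z):
    """Numerically stable softmax over a 1-D preference vector (assumed helper)."""
    z = np.asarray(z, dtype=float)
    exp_z = np.exp(z - np.max(z))   # subtract the max to avoid overflow in exp
    return exp_z / np.sum(exp_z)    # probabilities that sum to 1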
Example #2
def learnBandit():
    env = UserAdvert()
    rew_vec = []
    global W
    global grad
    global average_reward
    for train_step in range(TRAIN_STEPS):
        learning_rate = np.exp(-0.0001 * train_step)
        state = env.getState()
        stateVec = state["stateVec"]
        stateId = state["stateId"]
        # ---- UPDATE code below ---- #
        z = W.dot(stateVec)
        exp = np.exp(z)
        probs = exp / np.sum(exp)
        action = int(np.random.choice(range(3), p=probs.reshape(3, )))
        reward = env.getReward(stateId, action)
        average_reward += (reward - average_reward) / (train_step + 1)
        # ----------------------------

        # ---- UPDATE code below ------
        for act in range(3):
            flag = (1 if act == action else 0)
            W[act] += learning_rate * (reward - average_reward) * (
                flag - probs[act]) * stateVec
        # ----------------------------
        if train_step % LOG_INTERVAL == 0:
            print("Testing at: " + str(train_step))
            count = 0
            test = UserAdvert()
            for e in range(450):
                teststate = test.getState()
                testV = teststate["stateVec"]
                testI = teststate["stateId"]
                # ---- UPDATE code below ------ #
                z = W.dot(testV)
                exp = np.exp(z)
                probs = exp / np.sum(exp)
                # ----------------------------
                act = int(np.random.choice(range(3), p=probs.reshape(3, )))
                reward = test.getReward(testI, act)
                count += (reward / 450.0)
            rew_vec.append(count)

    # ---- UPDATE code below ------
    mpt.plot(rew_vec)
    mpt.title('alpha = exp(-0.0001*t)')
    mpt.ylabel('Average Reward')
    mpt.xlabel('Iterations')
    mpt.savefig('average_reward.png')  # savefig requires a filename; this one is arbitrary
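Example #2 relies on globals (W, grad, average_reward) and a plotting alias mpt that are declared elsewhere in that submission. A minimal initialisation consistent with how they are used above might look like this; the shapes and starting values are assumptions.

import numpy as np
import matplotlib.pyplot as mpt   # the alias used in Example #2

W = np.zeros((3, 4))       # one row per action, one column per state feature, so W.dot(stateVec) has shape (3,)
grad = np.zeros((3, 4))    # declared global in the example, though unused in the shown code
average_reward = 0.0       # running average used as the reward baseline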
Example #3
def learnBandit():
    env = UserAdvert()
    rew_vec = []

    W = np.random.randn(4, 3)

    for train_step in range(TRAIN_STEPS):
        state = env.getState()
        stateVec = state["stateVec"]
        stateId = state["stateId"]

        # ---- UPDATE code below ------
        # Sample from policy = softmax(stateVec X W) [W learnable params]
        # policy = function (stateVec)
        policy = softmax(np.dot(stateVec.T, W))
        action = int(np.random.choice(range(3), p=policy))
        reward = env.getReward(stateId, action)
        # ----------------------------

        # ---- UPDATE code below ------
        # Update policy using reward (REINFORCE: grad of log-policy = outer(stateVec, one_hot - policy))
        one_hot = np.zeros(ACTION_SIZE)
        one_hot[action] = 1.0
        W += STEP_SIZE * reward * np.dot(stateVec.reshape(STATE_SIZE, 1),
                                         (one_hot - policy).reshape((1, ACTION_SIZE)))
        # ----------------------------

        if train_step % LOG_INTERVAL == 0:
            print("Testing at: " + str(train_step))
            count = 0
            test = UserAdvert()
            for e in range(450):
                teststate = test.getState()
                testV = teststate["stateVec"]
                testI = teststate["stateId"]
                # ---- UPDATE code below ------
                # Policy = function(testV)
                policy = softmax(np.dot(testV.T, W))
                # ----------------------------
                act = int(np.random.choice(range(3), p=policy))
                reward = test.getReward(testI, act)
                count += (reward / 450.0)
            rew_vec.append(count)

    # ---- UPDATE code below ------
    # Plot this rew_vec list
    # print(rew_vec)
    plt.plot(range(0, TRAIN_STEPS, LOG_INTERVAL), rew_vec)
    plt.show()
Example #4
def learnBandit():
    env = UserAdvert()
    rew_vec = []

    
    for train_step in range(TRAIN_STEPS):
        state = env.getState()
        stateVec = state["stateVec"] 
        stateId = state["stateId"]   
        
        # ---- UPDATE code below ------
        # Sample from policy = softmax(stateVec X W) [W learnable params]
        # policy = function (stateVec)      
        
        ''' Initializing Weight matrix and the update step parameter alpha '''
        if train_step == 0:
            W = np.zeros([4,3])
            alpha = 0.1

        ''' The preference of each action, obtained by multiplying stateVec and W '''
        prefs = np.dot(stateVec, W)

        ''' The policy is given by the softmax probabilities of the actions '''
        policy = np.exp(prefs)/np.sum(np.exp(prefs))
        
        ''' Selecting an action based on the probabilities obtained '''
        action = int(np.random.choice(range(3), p = policy)) 
        reward = env.getReward(stateId, action)

        # ----------------------------


        # ---- UPDATE code below ------
        # Update policy using reward
        
        ''' Here "i" loops over all the indexes of stateVec which has same length as W.shape[0] '''
        for i in range(W.shape[0]):

            ''' Here "j" loops over all the indexes of columns of W which has length as W.shape[1] '''
            for j in range(W.shape[1]):
                ''' Here we update each element of W matrix based on REINFORCE algorithm  '''
            
                ''' 
                The formula below is obtained by derivating the action selecting policy
                wrt to each parameter using chain rule.

                All the parameters that are involved in preference of the selected arm
                will get update as per the first formula (if condition) and all the remaining 
                will get updated based on the second formula (else condition)
                '''
                if j == action:
                    W[i,j] = W[i,j] + alpha*reward*policy[action]*(1-policy[action])*stateVec[i]
                else:
                    W[i,j] = W[i,j] - alpha*reward*policy[action]*policy[j]*stateVec[i]
        
        # policy = [1/3.0, 1/3.0, 1/3.0]
        # ----------------------------
        # print(policy)
        if train_step % LOG_INTERVAL == 0:
            print("Testing at: " + str(train_step))
            count = 0
            test = UserAdvert()
            for e in range(450):
                teststate = test.getState()
                testV = teststate["stateVec"]
                testI = teststate["stateId"]
                # ---- UPDATE code below ------
                # Policy = function(testV)

                ''' The preference of each action, obtained by multiplying testV and W '''
                prefs = np.dot(testV, W)

                ''' The policy is given by the softmax probabilities of the actions '''
                policy = np.exp(prefs)/np.sum(np.exp(prefs))
                # policy = [1/3.0, 1/3.0, 1/3.0]
                # ----------------------------
#                 print(policy)
                act = int(np.random.choice(range(3), p=np.squeeze(policy)))
                reward = test.getReward(testI, act)
                count += (reward/450.0)
            rew_vec.append(count)

    # ---- UPDATE code below ------
    # Plot this rew_vec list

    ''' Create the figure and grab an Axes handle for plotting '''
    fig1=plt.figure(figsize=(10,6)).add_subplot(111)
    
    ''' Plot the Average reward Vs Time step for the contextual bandit with 3 arms ''' 
    fig1.plot(range(len(rew_vec)), rew_vec, 'r')

    ''' Labelling the  plot '''
    fig1.title.set_text('Contextual : Average Reward Vs Steps for 3 arms')
    fig1.set_ylabel('Average Reward')
    fig1.set_xlabel('Steps')

    ''' Displaying the plot '''
    plt.show()

    print(rew_vec)
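The element-wise double loop in the example above can be collapsed into a single outer-product update. The sketch below is just the vectorised equivalent of that loop (same formula, same variable names; reinforce_style_update is a hypothetical helper name, not part of the assignment):

import numpy as np

def reinforce_style_update(W, policy, stateVec, action, reward, alpha=0.1):
    """Vectorised form of the per-element update used in the example above.

    grad[i, j] = policy[action] * (1{j == action} - policy[j]) * stateVec[i],
    i.e. the derivative of policy[action] with respect to W[i, j] when
    prefs = stateVec . W and policy = softmax(prefs).
    """
    one_hot = np.zeros_like(policy)
    one_hot[action] = 1.0
    grad = np.outer(stateVec, policy[action] * (one_hot - policy))
    return W + alpha * reward * grad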
Example #5
def learnBandit():
    env = UserAdvert()
    rew_vec = []
    rew_vec.append(0)
    rewards = 0.0
    W = np.ones((4, 3))
    Ws = np.ones((4, 3))
    counte = np.zeros((3, 1))
    for train_step in range(TRAIN_STEPS):
        state = env.getState()
        stateVec = state["stateVec"]
        stateId = state["stateId"]
        # ---- UPDATE code below ------
        # Sample from policy = softmax(stateVec X W) [W learnable params]
        # policy = function (stateVec)
        # W is the set of parameters that are learnable; policy holds the raw action preferences
        policy = np.dot(stateVec.transpose(), W)
        action = int(np.random.choice(range(3)))  # this example explores by picking actions uniformly at random
        reward = env.getReward(stateId, action)
        # ----------------------------
        # ---- UPDATE code below ------
        # Update policy using reward
        counte[action] = counte[action] + 1  # number of times this action has been taken
        if train_step != 0:
            # Update the weights of the chosen action, REINFORCE-style, against a fixed baseline of 3.0
            W[:, action] = W[:, action] - (
                (reward - 3.0) * stateVec) * (1 - policy[action]) / counte[action]
        if train_step % LOG_INTERVAL == 0:
            print("Testing at: " + str(train_step))
            count = 0
            test = UserAdvert()
            for e in range(450):
                teststate = test.getState()
                testV = teststate["stateVec"]
                testI = teststate["stateId"]
                # ---- UPDATE code below ------
                # Policy = function(testV)
                policy = np.dot(testV.transpose(), W)
                exp_p = np.exp(-policy)  # note the minus sign: lower preference -> higher probability
                sums = np.sum(exp_p)
                exp_p = exp_p / sums  # softmax policy used to pick the test action
                # ----------------------------
                act = int(np.random.choice(range(3), p=exp_p))
                reward = test.getReward(testI, act)
                count += (reward / 450.0)
            if (count > rewards):
                Ws = W.copy()
                rewards = count
            else:
                W = Ws.copy()
            rew_vec.append(count)

    # ---- UPDATE code below ------
    # Plot this rew_vec list
    plt.plot(range(len(rew_vec) - 1), rew_vec[1:])
    plt.xlabel("Training Steps*10")
    plt.ylabel("Average reward")
    plt.title("Answer 5")
    plt.savefig("Answer5")
    plt.show()
Example #6
def learnBandit():
    env = UserAdvert()
    rew_vec = []
    
    #Random Initialization of weights
    W = np.random.normal(0,1,size = (ACTION_SIZE,STATE_SIZE))

    for train_step in tqdm(range(TRAIN_STEPS)):
        state = env.getState()
        stateVec = state["stateVec"]
        stateId = state["stateId"]

        # ---- UPDATE code below ------
        # Sample from policy = softmax(stateVec X W) [W learnable params]
        # policy = function (stateVec)
        
        Context = np.matmul(W,stateVec)
        policy = softmax(Context)
        action = np.random.choice(range(ACTION_SIZE),1,p=policy)
        
        #action = int(np.random.choice(range(3)))
        reward = env.getReward(stateId, int(action))
        # ----------------------------

        # ---- UPDATE code below ------
        # Update policy using reward
        grad = -policy
        grad[action] += 1                                # one_hot(action) - policy
        grad = np.expand_dims(grad, axis=1)              # shape (ACTION_SIZE, 1)
        stateVec = np.expand_dims(stateVec, axis=1)
        grad = np.matmul(grad, np.transpose(stateVec))   # outer product, shape (ACTION_SIZE, STATE_SIZE)

        # lr (learning rate) and bn (reward baseline) are assumed to be module-level constants
        W = W + lr*(reward - bn)*grad
        # ----------------------------

        if train_step % LOG_INTERVAL == 0:
            #print("Testing at: " + str(train_step))
            count = 0
            test = UserAdvert()
            for e in range(450):
                teststate = test.getState()
                testV = teststate["stateVec"]
                testI = teststate["stateId"]
                # ---- UPDATE code below ------
                # Policy = function(testV)
                Context = np.matmul(W, testV)
                policy = softmax(Context)
                # ----------------------------
                act = int(np.random.choice(range(3), p=policy))
                reward = test.getReward(testI, act)
                count += (reward/450.0)
            rew_vec.append(count)

    # ---- UPDATE code below ------
    # Plot this rew_vec list
    x= [i for i in range(int(TRAIN_STEPS/LOG_INTERVAL))]
    fig1=plt.figure(figsize=(10,7)).add_subplot(111)
    fig1.set_xlabel('epochs')
    fig1.set_ylabel('Average Returns')
    fig1.plot(x,rew_vec)
    plt.show()
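All of these snippets lean on module-level names (UserAdvert, softmax, TRAIN_STEPS, LOG_INTERVAL, STATE_SIZE, ACTION_SIZE, STEP_SIZE, lr, bn and a matplotlib import) that come from the surrounding assignment code and are not shown here. A hypothetical setup, only to make the snippets importable in isolation, might look like the following; the constant values are guesses rather than the assignment's actual settings, and UserAdvert itself ships with the assignment's environment module.

import numpy as np
import matplotlib.pyplot as plt
from scipy.special import softmax    # assumption: the softmax helper used above
from tqdm import tqdm                # progress bar used in Example #6
# from environment import UserAdvert  # hypothetical import; the env class is provided by the assignment

STATE_SIZE = 4        # length of stateVec
ACTION_SIZE = 3       # number of ads / arms
TRAIN_STEPS = 10000   # guess; the real value comes from the assignment
LOG_INTERVAL = 10     # guess; how often the policy is evaluated
STEP_SIZE = 0.01      # learning rate used in Example #3
lr = 0.01             # learning rate used in Example #6
bn = 0.0              # reward baseline used in Example #6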