def exp3(self, numActions, gamma, rewardMin=0, rewardMax=1):
    # weights live on the instance: self.weights = [1.0] * numActions
    t = 0
    while True:
        probabilityDistribution = distr(self.weights, gamma)
        choice = draw(probabilityDistribution)
        self.choiceFreq[choice] += 1

        # put the choice in the queue and block until the caller has consumed it
        self.choice_queue.put(choice)
        self.choice_queue.join()

        # get the reward for this choice back from the caller
        theReward = self.reward_queue.get()
        self.reward_queue.task_done()

        scaledReward = (theReward - rewardMin) / (rewardMax - rewardMin)  # rewards scaled to [0, 1]

        # alternative estimators kept for reference:
        # probChoice = float(self.choiceFreq[choice] + 1) / (t + 1)
        # estimatedReward = (1.0 * scaledReward) / probChoice
        # estimatedReward = 1.0 * scaledReward / probabilityDistribution[choice]  # classic EXP3
        estimatedReward = 1.0 * scaledReward * self.distanceLastchoice(t, choice)

        self.weights[choice] *= math.exp(estimatedReward * gamma / numActions)  # important that we use the estimated reward here!

        yield choice, theReward, estimatedReward, self.weights
        t = t + 1
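All of the exp3 variants in this listing call two helpers, distr and draw, that are not shown here, and they assume import math and import random at module level. The following is a minimal sketch of what those helpers are assumed to do (exploration-mixed normalization and roulette-wheel sampling); the final fallback return in draw is an added guard against floating-point rounding, not part of the original code.

import random

def distr(weights, gamma=0.0):
    # Normalize the weights and mix in a gamma fraction of the uniform
    # distribution; gamma=0 is plain normalization, which is how simpleTest
    # below calls it when printing the weights.
    theSum = float(sum(weights))
    return tuple((1.0 - gamma) * (w / theSum) + (gamma / len(weights)) for w in weights)

def draw(probabilityDistribution):
    # Roulette-wheel sampling: pick an index with probability proportional
    # to its entry in the distribution.
    choice = random.uniform(0, sum(probabilityDistribution))
    for index, weight in enumerate(probabilityDistribution):
        choice -= weight
        if choice <= 0:
            return index
    return len(probabilityDistribution) - 1  # guard against floating-point rounding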
Example #2
def exp3(self, numActions, gamma, rewardMin=0, rewardMax=1):
    t = 0
    while True:
        probabilityDistribution = distr(self.weights, gamma)
        choice = draw(probabilityDistribution)
        self.choiceFreq[choice] += 1

        # put choice in the queue
        self.choice_queue.put(choice)
        self.choice_queue.join()

        # get reward from queue
        theReward = self.reward_queue.get()
        self.reward_queue.task_done()

        scaledReward = (theReward - rewardMin) / (rewardMax - rewardMin)  # rewards scaled to [0, 1]

        # -case frequency
        # probChoice = float(self.choiceFreq[choice] + 1) / (t + 1)
        # estimatedReward = (1.0 * scaledReward) / probChoice

        # -case distribution
        # estimatedReward = 1.0 * scaledReward / probabilityDistribution[choice]

        # -case distance
        estimatedReward = 1.0 * scaledReward * self.distanceLastchoice(t, choice)

        self.weights[choice] *= math.exp(estimatedReward * gamma / numActions)  # important that we use the estimated reward here!
        yield choice, theReward, estimatedReward, self.weights
        t = t + 1
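The two queue-based variants above are methods of a class that is not included in this listing. Below is a purely hypothetical sketch of the scaffolding they assume: the attribute names mirror the ones used in the code, but the constructor and the distanceLastchoice body (read here as "rounds since this arm was last chosen") are guesses, not the original project's implementation.

import queue

class Exp3Agent:
    def __init__(self, numActions):
        self.weights = [1.0] * numActions       # one EXP3 weight per arm
        self.choiceFreq = [0] * numActions      # how many times each arm was pulled
        self.lastChosen = [-1] * numActions     # round index of each arm's last pull
        self.choice_queue = queue.Queue()       # generator -> caller: chosen arm
        self.reward_queue = queue.Queue()       # caller -> generator: observed reward

    def distanceLastchoice(self, t, choice):
        # Assumed semantics: number of rounds since `choice` was last selected
        # (1 on its first selection), then record the current round.
        last = self.lastChosen[choice]
        self.lastChosen[choice] = t
        return (t - last) if last >= 0 else 1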
Example #3
def exp3(numActions, reward, gamma, rewardMin=0, rewardMax=1):
    weights = [1.0] * numActions

    t = 0
    while True:
        probabilityDistribution = distr(weights, gamma)
        choice = draw(probabilityDistribution)
        theReward = reward(choice, t)
        scaledReward = (theReward - rewardMin) / (rewardMax - rewardMin)  # rewards scaled to [0, 1]

        estimatedReward = 1.0 * scaledReward / probabilityDistribution[choice]
        weights[choice] *= math.exp(estimatedReward * gamma / numActions)  # important that we use the estimated reward here!

        yield choice, theReward, estimatedReward, weights
        t = t + 1
Example #4
File: exp3.py  Project: zn16/exp3
def exp3(numActions, reward, gamma, rewardMin=0, rewardMax=1):
    weights = [1.0] * numActions

    t = 0
    while True:
        probabilityDistribution = distr(weights, gamma)
        choice = draw(probabilityDistribution)
        theReward = reward(choice, t)
        scaledReward = (theReward - rewardMin) / (rewardMax - rewardMin)  # rewards scaled to 0,1

        estimatedReward = 1.0 * scaledReward / probabilityDistribution[choice]
        weights[choice] *= math.exp(estimatedReward * gamma / numActions)  # important that we use estimated reward here!

        yield choice, theReward, estimatedReward, weights
        t = t + 1
Example #5
File: exp3.py  Project: zn16/exp3
def simpleTest():
    numActions = 10
    numRounds = 10000

    biases = [1.0 / k for k in range(2, 12)]
    rewardVector = [[1 if random.random() < bias else 0 for bias in biases]
                    for _ in range(numRounds)]
    rewards = lambda choice, t: rewardVector[t][choice]

    bestAction = max(range(numActions),
                     key=lambda action: sum(
                         [rewardVector[t][action] for t in range(numRounds)]))
    bestUpperBoundEstimate = 2 * numRounds / 3
    gamma = math.sqrt(numActions * math.log(numActions) /
                      ((math.e - 1) * bestUpperBoundEstimate))
    gamma = 0.07  # hard-coded override of the theoretical value above

    cumulativeReward = 0
    bestActionCumulativeReward = 0
    weakRegret = 0

    t = 0
    for (choice, reward, est, weights) in exp3(numActions, rewards, gamma):
        cumulativeReward += reward
        bestActionCumulativeReward += rewardVector[t][bestAction]

        weakRegret = (bestActionCumulativeReward - cumulativeReward)
        regretBound = (math.e - 1) * gamma * bestActionCumulativeReward + (
            numActions * math.log(numActions)) / gamma

        print("regret: %d\tmaxRegret: %.2f\tweights: (%s)" %
              (weakRegret, regretBound, ', '.join(
                  ["%.3f" % weight for weight in distr(weights)])))

        t += 1
        if t >= numRounds:
            break

    print(cumulativeReward)
Example #6
def simpleTest():
    numActions = 10
    numRounds = 10000

    biases = [1.0 / k for k in range(2, 12)]
    rewardVector = [[1 if random.random() < bias else 0 for bias in biases]
                    for _ in range(numRounds)]
    rewards = lambda choice, t: rewardVector[t][choice]

    bestAction = max(range(numActions),
                     key=lambda action: sum(rewardVector[t][action] for t in range(numRounds)))
    bestUpperBoundEstimate = 2 * numRounds / 3
    gamma = math.sqrt(numActions * math.log(numActions) / ((math.e - 1) * bestUpperBoundEstimate))
    gamma = 0.07  # hard-coded override of the theoretical value above

    cumulativeReward = 0
    bestActionCumulativeReward = 0
    weakRegret = 0

    t = 0
    for (choice, reward, est, weights) in exp3(numActions, rewards, gamma):
        cumulativeReward += reward
        bestActionCumulativeReward += rewardVector[t][bestAction]

        weakRegret = bestActionCumulativeReward - cumulativeReward
        regretBound = (math.e - 1) * gamma * bestActionCumulativeReward + (numActions * math.log(numActions)) / gamma

        print("regret: %d\tmaxRegret: %.2f\tweights: (%s)" %
              (weakRegret, regretBound, ', '.join("%.3f" % weight for weight in distr(weights))))

        t += 1
        if t >= numRounds:
            break

    print(cumulativeReward)
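For reference, the tuned gamma and the regretBound printed by simpleTest correspond to the standard EXP3 weak-regret guarantee. With K = numActions, g = bestUpperBoundEstimate, and G_max the best single arm's cumulative reward, the formulas implemented in the code read:

\gamma = \sqrt{\frac{K \ln K}{(e - 1)\, g}}, \qquad
G_{\max} - \mathbb{E}\left[G_{\mathrm{EXP3}}\right] \le (e - 1)\,\gamma\, G_{\max} + \frac{K \ln K}{\gamma}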