def choose_pursuit(self):
    # Pursuit: move the probability of the empirically best arm toward 1
    # and every other arm's probability toward 0, at learning rate epsilon.
    means = [np.mean(rw) for rw in self.rewards]
    id_max = means.index(max(means))
    self.WE = [pi + self.epsilon * (1. - pi) if i == id_max
               else pi + self.epsilon * (0. - pi)
               for i, pi in enumerate(self.WE)]
    return weighted_choice(range(self.K), self.WE)
def choose_reinforcement(self):
    # Sample an arm from a softmax over the preference weights self.WE.
    sco = [math.exp(pi) for pi in self.WE]
    total = sum(sco)
    sco = [s / total for s in sco]
    return weighted_choice(range(self.K), sco)
def choose_boltzmann(self):
    # Boltzmann (softmax) exploration: pick an arm with probability
    # proportional to exp(empirical mean / temperature), where the
    # temperature is self.epsilon.
    means = [np.mean(rw) for rw in self.rewards]
    sco = [math.exp(m / self.epsilon) for m in means]
    total = sum(sco)
    sco = [s / total for s in sco]
    return weighted_choice(range(self.K), sco)
def choose_EXP3(self):
    # EXP3: mix the normalized weights self.WE with a uniform distribution,
    # with mixing coefficient self.epsilon.
    total = sum(self.WE)
    P = [(1. - self.epsilon) * w / total + self.epsilon / self.K for w in self.WE]
    return weighted_choice(range(self.K), P)
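# The four strategies above assume `import math` and `import numpy as np` at the
# top of the file, plus a weighted_choice(choices, weights) helper that is not
# shown in this section. A minimal sketch of such a helper, assuming the weights
# are non-negative (they need not sum to 1), could look like this:

import random

def weighted_choice(choices, weights):
    """Draw one element of `choices` with probability proportional to `weights`."""
    total = sum(weights)
    r = random.uniform(0., total)
    upto = 0.
    for choice, weight in zip(choices, weights):
        upto += weight
        if upto >= r:
            return choice
    return list(choices)[-1]  # guard against floating-point rounding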