def set_state(self, results):
    N, self.pdf = LLRcalc.results_to_pdf(results)
    if self.elo_model == "normalized":
        mu, var = LLRcalc.stats(self.pdf)  # code duplication with LLRcalc
        if len(results) == 5:
            self.sigma_pg = (2 * var)**0.5
        elif len(results) == 3:
            self.sigma_pg = var**0.5
        else:
            assert False
        self.s0, self.s1 = [
            self.elo_to_score(elo) for elo in (self.elo0, self.elo1)
        ]

    mu_LLR, var_LLR = self.LLR_drift_variance(self.pdf, self.s0, self.s1, None)

    # llr estimate
    self.llr = N * mu_LLR
    self.T = N

    # now normalize llr (if llr is not legal then the implications
    # of this are unclear)
    slope = self.llr / N
    if self.llr > 1.03 * self.b or self.llr < 1.03 * self.a:
        self.clamped = True
    if self.llr < self.a:
        self.T = self.a / slope
        self.llr = self.a
    elif self.llr > self.b:
        self.T = self.b / slope
        self.llr = self.b
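# Two interpretive notes on set_state (added commentary, not in the original
# code).  First, the factor 2 in sigma_pg: for pentanomial input the pdf
# describes game pairs, so, treating a pair as two games, the per-game
# variance is approximated as twice the variance of the pair score, giving
# sigma_pg = sqrt(2 * var); trinomial input is already per game.  Second, the
# clamping: the LLR is modelled as a Brownian motion with drift mu_LLR
# observed at time T = N.  When the estimate N * mu_LLR falls outside (a, b)
# the walk is stopped at the boundary instead: llr is clamped to a (or b) and
# T is shortened to a / slope (or b / slope), the time at which a straight
# line with the estimated slope first hits that boundary.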
def SPRT_elo(R, alpha=0.05, beta=0.05, p=0.05, elo0=None, elo1=None, elo_model=None):
    """Calculate an elo estimate from an SPRT test."""
    assert elo_model in ["BayesElo", "logistic"]

    # Estimate drawelo out of sample
    R3 = LLRcalc.regularize([R["losses"], R["draws"], R["wins"]])
    drawelo = draw_elo_calc(R3)

    # Convert the bounds to logistic elo if necessary
    if elo_model == "BayesElo":
        lelo0, lelo1 = [bayeselo_to_elo(elo_, drawelo) for elo_ in (elo0, elo1)]
    else:
        lelo0, lelo1 = elo0, elo1

    # Make the elo estimation object
    sp = sprt.sprt(alpha=alpha, beta=beta, elo0=lelo0, elo1=lelo1)

    # Feed the results
    if "pentanomial" in R.keys():
        R_ = R["pentanomial"]
    else:
        R_ = R3
    sp.set_state(R_)

    # Get the elo estimates
    a = sp.analytics(p)

    # Override the LLR approximation with the exact one
    a["LLR"] = LLRcalc.LLR_logistic(lelo0, lelo1, R_)
    del a["clamped"]

    # Now return the estimates
    return a
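# A minimal usage sketch for SPRT_elo (hypothetical counts; the pentanomial
# frequencies, ordered from worst to best game-pair outcome, take precedence
# over the trinomial counts when present):
if __name__ == "__main__":
    R = {
        "wins": 5767,
        "losses": 5177,
        "draws": 14256,
        "pentanomial": [21, 1213, 9447, 1865, 54],
    }
    a = SPRT_elo(
        R, alpha=0.05, beta=0.05, p=0.05, elo0=0.0, elo1=2.0, elo_model="logistic"
    )
    print(a["LLR"])  # the exact LLR set above; 'a' also holds the elo analytics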
def __init__(self, alpha=0.05, beta=0.05, elo0=0, elo1=5):
    self.a = math.log(beta / (1 - alpha))
    self.b = math.log((1 - beta) / alpha)
    self.elo0 = elo0
    self.elo1 = elo1
    self.s0 = LLRcalc.L_(elo0)
    self.s1 = LLRcalc.L_(elo1)
    self.clamped = False
    self.LLR_drift_variance = LLRcalc.LLR_drift_variance_alt2
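# For reference: with the default alpha = beta = 0.05 the Wald bounds above
# evaluate to a = log(0.05 / 0.95) ≈ -2.944 and b = log(0.95 / 0.05) ≈ +2.944,
# so the test runs until the LLR leaves (roughly) the interval (-2.944, 2.944).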
def elo_to_score(self, elo):
    """ "elo" is expressed in our current elo_model."""
    if self.elo_model == "normalized":
        nt = elo / LLRcalc.nelo_divided_by_nt
        return nt * self.sigma_pg + 0.5
    else:
        return LLRcalc.L_(elo)
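# Worked example (a sketch, assuming LLRcalc.L_ is the usual scaled logistic
# 1 / (1 + 10 ** (-x / 400))): in the "logistic" model elo = 200 maps to a
# score of 1 / (1 + 10 ** -0.5) ≈ 0.7597 and elo = 0 maps to 0.5.  In the
# "normalized" model the elo is first converted to normalized-t units and
# then rescaled by the empirical per-game standard deviation sigma_pg.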
def lelo_to_elo(self, lelo):
    """
    For external use. "elo" is expressed in our current elo_model.
    "lelo" is logistic.
    """
    if self.elo_model == "logistic":
        return lelo
    score = LLRcalc.L_(lelo)
    nt = (score - 0.5) / self.sigma_pg
    return nt * LLRcalc.nelo_divided_by_nt
def outcome_prob(self, elo):
    """
    The probability that a test with the given elo produces a worse outcome
    (faster fail, slower pass or a pass changed into a fail).
    """
    s = LLRcalc.L_(elo)
    mu_LLR, var_LLR = self.LLR_drift_variance(self.pdf, self.s0, self.s1, s)
    sigma_LLR = math.sqrt(var_LLR)
    return Brownian(a=self.a, b=self.b, mu=mu_LLR, sigma=sigma_LLR).outcome_cdf(
        T=self.T, y=self.llr
    )
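# A minimal usage sketch (hypothetical counts; assumes this class is named
# sprt and that set_state, defined above, has been called first so that
# self.pdf, self.T and self.llr are populated):
#
#     sp = sprt(alpha=0.05, beta=0.05, elo0=0.0, elo1=2.0)
#     sp.set_state([1517, 14203, 1807])   # losses, draws, wins
#     p_worse = sp.outcome_prob(1.0)      # P(worse outcome | true elo = 1.0)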
def get_elo(results):
    """
    "results" is an array of length 2*n+1 with aggregated frequencies
    for n games.
    """
    results = LLRcalc.regularize(results)
    games, mu, var = stats(results)
    stdev = math.sqrt(var)

    # 95% confidence interval for mu
    mu_min = mu + Phi_inv(0.025) * stdev / math.sqrt(games)
    mu_max = mu + Phi_inv(0.975) * stdev / math.sqrt(games)

    el = elo(mu)
    elo95 = (elo(mu_max) - elo(mu_min)) / 2.0
    los = Phi((mu - 0.5) / (stdev / math.sqrt(games)))

    return el, elo95, los
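# A minimal usage sketch (hypothetical pentanomial frequencies, n = 2,
# length 5, ordered by pair score from worst to best):
if __name__ == "__main__":
    el, elo95, los = get_elo([21, 1213, 9447, 1865, 54])
    print(el)     # elo point estimate
    print(elo95)  # half-width of the 95% confidence interval
    print(los)    # likelihood of superiority, P(mu > 0.5)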
def set_state(self, results):
    N, self.pdf = LLRcalc.results_to_pdf(results)

    mu_LLR, var_LLR = self.LLR_drift_variance(self.pdf, self.s0, self.s1, None)

    # llr estimate
    self.llr = N * mu_LLR
    self.T = N

    # now normalize llr (if llr is not legal then the implications
    # of this are unclear)
    slope = self.llr / N
    if self.llr > 1.03 * self.b or self.llr < 1.03 * self.a:
        self.clamped = True
    if self.llr < self.a:
        self.T = self.a / slope
        self.llr = self.a
    elif self.llr > self.b:
        self.T = self.b / slope
        self.llr = self.b
def update_SPRT(R, sprt):
    """Sequential Probability Ratio Test

    sprt is a dictionary with fixed fields 'elo0', 'alpha', 'elo1', 'beta',
    'elo_model', 'lower_bound', 'upper_bound'.

    It also has the following fields 'llr', 'state', 'overshoot' which are
    updated by this function.

    Normally this function should be called after each finished game
    (trinomial) or game pair (pentanomial) but it is safe to call it
    multiple times with the same parameters. Skipped updates are also
    handled sensibly.

    The meaning of the inputs and the fields is as follows.

    H0: elo = elo0
    H1: elo = elo1
    alpha = max typeI error (reached on elo = elo0)
    beta = max typeII error for elo >= elo1 (reached on elo = elo1)

    'overshoot' is a dictionary with data for dynamic overshoot estimation.
    The theoretical basis for this is: Siegmund - Sequential Analysis -
    Corollary 8.33. The correctness can be verified by simulation

    https://github.com/vdbergh/simul

    R['wins'], R['losses'], R['draws'] contains the number of wins, losses
    and draws

    R['pentanomial'] contains the pentanomial frequencies

    elo_model can be either 'BayesElo' or 'logistic'
    """
    # the next two lines are superfluous, but necessary for backward
    # compatibility
    sprt['lower_bound'] = math.log(sprt['beta'] / (1 - sprt['alpha']))
    sprt['upper_bound'] = math.log((1 - sprt['beta']) / sprt['alpha'])

    elo_model = sprt.get('elo_model', 'BayesElo')
    assert elo_model in ['BayesElo', 'logistic']
    elo0 = sprt['elo0']
    elo1 = sprt['elo1']

    # first deal with the legacy BayesElo/trinomial models
    R3 = LLRcalc.regularize([R['losses'], R['draws'], R['wins']])
    if elo_model == 'BayesElo':
        # estimate drawelo out of sample
        drawelo = draw_elo_calc(R3)
        # conversion of bounds to logistic elo
        lelo0, lelo1 = [bayeselo_to_elo(elo, drawelo) for elo in (elo0, elo1)]
    else:
        lelo0, lelo1 = elo0, elo1

    # Log-Likelihood Ratio
    R_ = R.get('pentanomial', R3)
    sprt['llr'] = LLRcalc.LLR_logistic(lelo0, lelo1, R_)

    # update the overshoot data
    if 'overshoot' in sprt:
        LLR_ = sprt['llr']
        o = sprt['overshoot']
        num_samples = sum(R_)
        if num_samples < o['last_update']:  # purge?
            sprt['lost_samples'] = o['last_update'] - num_samples  # audit
            del sprt['overshoot']  # the contract is violated
        else:
            if num_samples == o['last_update']:  # same data
                pass
            elif num_samples == o['last_update'] + 1:  # the normal case
                if LLR_ < o['ref0']:
                    delta = LLR_ - o['ref0']
                    o['m0'] += delta
                    o['sq0'] += delta**2
                    o['ref0'] = LLR_
                if LLR_ > o['ref1']:
                    delta = LLR_ - o['ref1']
                    o['m1'] += delta
                    o['sq1'] += delta**2
                    o['ref1'] = LLR_
            else:
                # Be robust if some updates are lost: reset data collection.
                # This should not be needed anymore, but just in case...
                o['ref0'] = LLR_
                o['ref1'] = LLR_
                o['skipped_updates'] += (num_samples - o['last_update']) - 1  # audit
            o['last_update'] = num_samples

    o0 = 0
    o1 = 0
    if 'overshoot' in sprt:
        o = sprt['overshoot']
        o0 = -o['sq0'] / o['m0'] / 2 if o['m0'] != 0 else 0
        o1 = o['sq1'] / o['m1'] / 2 if o['m1'] != 0 else 0

    # now check the stop condition
    sprt['state'] = ''
    if sprt['llr'] < sprt['lower_bound'] + o0:
        sprt['state'] = 'rejected'
    elif sprt['llr'] > sprt['upper_bound'] - o1:
        sprt['state'] = 'accepted'
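# Note on the dynamic overshoot correction above (added commentary, not part
# of the original code): each time the LLR makes a new running minimum the
# (negative) increment delta is accumulated in m0 = sum(delta) and
# sq0 = sum(delta**2).  Following Siegmund (Corollary 8.33) the expected
# overshoot past a boundary is approximately E[delta**2] / (2 * E[|delta|]),
# which the code estimates as -sq0 / m0 / 2 (a positive number since m0 < 0);
# the lower bound is then moved inward by this amount, and symmetrically for
# the upper bound via m1 and sq1.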
def update_SPRT(R, sprt):
    """Sequential Probability Ratio Test

    sprt is a dictionary with fixed fields 'elo0', 'alpha', 'elo1', 'beta',
    'elo_model', 'lower_bound', 'upper_bound', 'batch_size'.

    It also has the following fields 'llr', 'state', 'overshoot' which are
    updated by this function.

    Normally this function should be called each time 'batch_size' games
    (trinomial) or game pairs (pentanomial) have been completed but it is
    safe to call it multiple times with the same parameters. The main
    purpose of this is to be able to recalculate the LLR for old tests.
    In the unlikely event of a server crash it is possible that some updates
    may be missed but this situation is also handled sensibly.

    The meaning of the other inputs and the fields is as follows.

    H0: elo = elo0
    H1: elo = elo1
    alpha = max typeI error (reached on elo = elo0)
    beta = max typeII error for elo >= elo1 (reached on elo = elo1)

    'overshoot' is a dictionary with data for dynamic overshoot estimation.
    The theoretical basis for this is: Siegmund - Sequential Analysis -
    Corollary 8.33. The correctness can be verified by simulation

    https://github.com/vdbergh/simul

    R['wins'], R['losses'], R['draws'] contains the number of wins, losses
    and draws

    R['pentanomial'] contains the pentanomial frequencies

    elo_model can be either 'BayesElo', 'logistic' or 'normalized'
    """
    # the next two lines are superfluous, but unfortunately necessary for
    # backward compatibility with old tests
    sprt["lower_bound"] = math.log(sprt["beta"] / (1 - sprt["alpha"]))
    sprt["upper_bound"] = math.log((1 - sprt["beta"]) / sprt["alpha"])

    elo_model = sprt.get("elo_model", "BayesElo")
    assert elo_model in ["BayesElo", "logistic", "normalized"]
    elo0 = sprt["elo0"]
    elo1 = sprt["elo1"]

    # first deal with the legacy BayesElo/trinomial models
    R3 = [R.get("losses", 0), R.get("draws", 0), R.get("wins", 0)]
    if elo_model == "BayesElo":
        # estimate drawelo out of sample
        R3_ = LLRcalc.regularize(R3)
        drawelo = draw_elo_calc(R3_)
        # conversion of bounds to logistic elo
        elo0, elo1 = [bayeselo_to_elo(elo, drawelo) for elo in (elo0, elo1)]
        elo_model = "logistic"

    R_ = R.get("pentanomial", R3)

    batch_size = sprt.get("batch_size", 1)
    # sanity check on batch_size
    if sum(R_) % batch_size != 0:
        sprt["illegal_update"] = sum(R_)  # audit
        if "overshoot" in sprt:
            del sprt["overshoot"]  # the contract is violated

    # Log-Likelihood Ratio
    assert elo_model in ["logistic", "normalized"]
    if elo_model == "logistic":
        sprt["llr"] = LLRcalc.LLR_logistic(elo0, elo1, R_)
    else:
        sprt["llr"] = LLRcalc.LLR_normalized(elo0, elo1, R_)

    # update the overshoot data
    if "overshoot" in sprt:
        LLR_ = sprt["llr"]
        o = sprt["overshoot"]
        num_samples = sum(R_)
        if num_samples < o["last_update"]:  # purge?
            sprt["lost_samples"] = o["last_update"] - num_samples  # audit
            del sprt["overshoot"]  # the contract is violated
        else:
            if num_samples == o["last_update"]:  # same data
                pass
            elif num_samples == o["last_update"] + batch_size:  # the normal case
                if LLR_ < o["ref0"]:
                    delta = LLR_ - o["ref0"]
                    o["m0"] += delta
                    o["sq0"] += delta**2
                    o["ref0"] = LLR_
                if LLR_ > o["ref1"]:
                    delta = LLR_ - o["ref1"]
                    o["m1"] += delta
                    o["sq1"] += delta**2
                    o["ref1"] = LLR_
            else:
                # Be robust if some updates are lost: reset data collection.
                # This should not be needed anymore, but just in case...
                o["ref0"] = LLR_
                o["ref1"] = LLR_
                o["skipped_updates"] += (num_samples - o["last_update"]) - 1  # audit
            o["last_update"] = num_samples

    o0 = 0
    o1 = 0
    if "overshoot" in sprt:
        o = sprt["overshoot"]
        o0 = -o["sq0"] / o["m0"] / 2 if o["m0"] != 0 else 0
        o1 = o["sq1"] / o["m1"] / 2 if o["m1"] != 0 else 0

    # now check the stop condition
    sprt["state"] = ""
    if sprt["llr"] < sprt["lower_bound"] + o0:
        sprt["state"] = "rejected"
    elif sprt["llr"] > sprt["upper_bound"] - o1:
        sprt["state"] = "accepted"