def __call__(self, params, update=True):
    """Report whether successive parameter vectors have stopped moving.

    Compares ``params`` with the vector remembered from the previous call,
    stores the resulting step in ``self.delta`` / ``self.dist`` (shifting the
    two older steps into ``self._last_delta`` / ``self._last_delta2``) and
    writes a one-line progress report to stdout.

    Args:
        params: current parameters. Must support subtraction followed by
            ``.cpu().numpy()`` (presumably a torch tensor -- confirm).
        update: when true, remember ``params`` for the next call.

    Returns:
        ``False`` on the first call, when there is nothing to compare with.
        Otherwise a truthy value when the summed absolute step is at most
        ``self._tol`` per parameter, or when the step differs from the step
        of two calls ago by a non-zero amount that is less than half of its
        difference from the previous call's step.
    """
    previous = self._prev_params
    self._prev_params2 = previous
    if previous is None:
        if update:
            self._prev_params = nanguardt(params)
        return False

    step = (previous - params).cpu().numpy()
    magnitudes = numpy.abs(step)
    total = nanguard(numpy.sum(magnitudes))

    # Shift the step history along by one before recording the newest step.
    self._last_delta2, self._last_delta = self._last_delta, self.delta
    self.delta = step
    self.dist = total

    if self._last_delta2 is None:
        two_step = 0
    else:
        two_step = nanguard(numpy.sum(numpy.abs(self._last_delta2 - step)))
    if self._last_delta is None:
        one_step = 0
    else:
        one_step = nanguard(numpy.sum(numpy.abs(self._last_delta - step)))

    order = numpy.argsort(step)
    if update:
        self._prev_params = params

    padding = " " * 6
    # The last four entries of the ascending argsort, largest first.
    top = ', '.join(f'{idx}={step[idx]:0.3f}' for idx in order[:-5:-1])
    spread = numpy.std(magnitudes)
    sys.stdout.write(
        f"converged dist: {total:0.4f}. /param: {total/len(params):0.3e}. two step dist: {two_step:0.3f}, 1step: {one_step:0.3f}, mag std: {spread}. top updates: {top}{padding}\n"
    )
    sys.stdout.flush()

    return total <= self._tol * len(params) or (
        two_step < one_step / 2 and two_step > 0)
def from_history(self, history):
    """Replay stored history records into ``self.update``.

    Records without an ``"items"`` key are ignored. Each remaining record
    gets ``item["dur"]`` set to ``viewend - viewstart`` (missing keys count
    as 0). A record whose duration is below ``opts.minview`` is discarded;
    the first such record in a run of short ones also discards the most
    recently kept record. Everything kept is then passed, in order, to
    ``self.update(item, initial=True)``.

    Args:
        history: iterable of dict records. They are mutated in place
            (``"dur"`` is written).

    Raises:
        Exception: a record with a ``"parent"`` key has a ``"hash"`` that is
            not the SHA-256 hex digest of ``"/<parent>/<name>"``.
    """
    from web.util import nanguard
    from web import opts

    keep = []
    prev_too_short = False
    for idx, item in enumerate(history):
        if "items" not in item:
            continue

        if "parent" in item:
            path = "/" + item["parent"] + "/" + item["name"]
            actual = item["hash"]
            # `ue` is the encode() error-handler name, defined at module
            # level outside this block.
            expected = hashlib.sha256(path.encode("utf-8", ue)).hexdigest()
            if actual != expected:
                raise Exception(
                    f"from_history#{idx}: hash {actual!r} does not match "
                    f"the sha256 of path {path!r}")

        item["dur"] = nanguard(
            item.get("viewend", 0), f"from_history#{idx}.viewend") - nanguard(
                item.get("viewstart", 0), f"from_history#{idx}.viewstart")

        if nanguard(item.get("dur", 0)) < opts.minview:
            # Only the first short view of a run retracts the previous record.
            if not prev_too_short and keep:
                keep.pop()
            prev_too_short = True
            continue

        prev_too_short = False
        keep.append(item)

    for item in keep:
        self.update(item, initial=True)
def calc_next_index(self,
                    h,
                    vals,
                    weights,
                    dists_out,
                    debug=False,
                    force=False,
                    existing_val=None):
    """Bracket item ``h`` in the sorted model and decide whether to keep searching.

    Args:
        h: key under which the bracket width is written to ``dists_out``.
        vals: pair of sequences. ``vals[0]`` goes through
            ``self.softmin(..., inv=True)`` to give the lower bound and
            ``vals[1]`` through ``self.softmin`` to give the upper bound.
        weights: accepted for interface compatibility; not read here.
        dists_out: mapping mutated in place (``dists_out[h] = high - low``).
        debug: accepted for interface compatibility; not read here.
        force: when true the item is never reported as finished.
        existing_val: value to use as the midpoint instead of the centre of
            the bracket.

    Returns:
        ``(index, info)``. ``index`` is ``None`` when the search for ``h`` is
        considered finished, otherwise the model index of the midpoint.
        ``info`` is a dict of the intermediate quantities.
    """
    from web.util import nanguard
    from web import opts

    model_lo = self.min()
    model_hi = self.max()

    raw_low = self.softmin(vals[0], inv=True)
    raw_high = self.softmin(vals[1])
    # Both bounds are clamped against the model maximum (the lower one
    # against its negation).
    low = max(raw_low, -model_hi)
    high = min(raw_high, model_hi)
    dists_out[h] = high - low

    widx = self.getidx(low)
    lidx = self.getidx(high)

    mid = (low + high) / 2 if existing_val is None else existing_val
    midx = self.getidx(mid)

    pos = nanguard((high - model_lo) / (model_hi - model_lo))
    prec = opts.precision_func(pos)
    delta = nanguard(len(self.model) / prec) * 2

    n_first = len(vals[0])
    n_second = len(vals[1])
    index_span = max(0, lidx - widx)

    seen_enough = (n_first > opts.min_clean_wins
                   or n_first + n_second > opts.min_clean_compares)
    finished_enough = index_span < delta
    is_goat = widx >= nanguard(len(self.model) - opts.goat_window)

    info = {
        "finished_enough": finished_enough,
        "seen_enough": seen_enough,
        "is_goat": is_goat,
        "wlen": n_first,
        "llen": n_second,
        "prec": prec,
        "midx": midx,
        "lidx": lidx,
        "widx": widx,
        "mid": mid,
        "pos": pos,
        "delta": delta,
        "adelta": index_span,
        "high": high,
        "low": low,
    }

    done = (n_first > 0 and (is_goat or n_second > 1) and seen_enough
            and finished_enough and not force)
    if done:
        return None, info
    return midx, info
def calculate_dists(self, stats, comparisons, h):
    """Collect the model values of items that ``h`` was compared against.

    Args:
        stats: object providing ``too_close`` (a mapping keyed by pair); it is
            also passed to ``self.is_dropped``.
        comparisons: mapping ``other_hash -> (wins[0], wins[1])``.
        h: hash of the item the comparisons belong to.

    Returns:
        ``(dists, weights)``, each a pair of lists. Index 0 receives entries
        for comparisons where ``wins[0] > wins[1]`` and index 1 those where
        ``wins[0] < wins[1]``. ``dists`` starts as ``([-50], [50])`` and
        ``weights`` as ``([1], [1])``. Each weight is
        ``(larger / total - 0.5) * 2``.

    Comparisons are skipped when the other item has no model value, is
    dropped, has no recorded weight at all, is tied, or has a win ratio
    strictly between ``1 - opts.ambiguity_threshold`` and
    ``opts.ambiguity_threshold``.
    """
    from web.util import as_pair, nanguard
    from web import opts

    dists = ([-50], [50])
    weights = ([1], [1])

    for other_hash, wins in comparisons.items():
        other_val = self.getval(other_hash)
        if other_val is None:
            continue
        if self.is_dropped(stats, other_hash):
            continue

        pair = as_pair(h, other_hash)
        if pair in stats.too_close:
            wins = tuple([
                nanguard(x + sum(wins) +
                         opts.too_close_boost * stats.too_close[pair])
                for x in wins
            ])

        total = wins[0] + wins[1]
        if not total:
            # A zero total used to raise ZeroDivisionError below; such a
            # comparison is a tie and would have been skipped anyway.
            continue

        win_ratio = wins[0] / total
        if (win_ratio < opts.ambiguity_threshold
                and win_ratio > 1 - opts.ambiguity_threshold):
            continue

        if wins[0] > wins[1]:
            side, larger = 0, wins[0]
        elif wins[0] < wins[1]:
            side, larger = 1, wins[1]
        else:
            continue

        decayed_ratio = ((larger / max(1e-10, total)) - 0.5) * 2
        dists[side].append(other_val)
        weights[side].append(decayed_ratio)

    return dists, weights
def softmin(self, directional_distances, inv=False):
    """Return a weighted mean of ``directional_distances``.

    Each value ``v`` is weighted by ``opts.softmin_falloff_per_unit ** -v``,
    capped at 2000000. With ``inv`` true the values are negated before
    weighting and the result is negated back.

    Args:
        directional_distances: sequence of numbers.
        inv: apply the weighting to the negated values.

    Returns:
        ``100`` for an empty input, otherwise the weighted mean.
    """
    from web.util import nanguard
    from web import opts

    if len(directional_distances) == 0:
        return 100

    values = nanguard(numpy.array(directional_distances))
    if inv:
        values = -values

    raw_weight = opts.softmin_falloff_per_unit**-values
    weight = nanguard(numpy.minimum(raw_weight, 2000000))

    weighted_total = numpy.sum(weight * values)
    weight_total = numpy.sum(weight)
    result = nanguard(weighted_total / weight_total)

    return -result if inv else result
def getprob(self, item1, item2):
    """Return the model's probabilities for a pair of items.

    Args:
        item1, item2: either a dict carrying a ``"hash"`` key or the value
            to hand to ``self.getid`` directly.

    Returns:
        ``(0.5, 0.5)`` when the model is empty or either id is at or beyond
        the end of ``self.model``. Otherwise the two values returned by
        ``choix.probabilities([a, b], self.model)``.
    """
    import choix
    from web.util import nanguard

    key1 = item1["hash"] if type(item1) is dict else item1
    a = self.getid(key1)
    key2 = item2["hash"] if type(item2) is dict else item2
    b = self.getid(key2)

    model_size = len(self.model)
    a_is_new = a >= model_size
    b_is_new = b >= model_size
    if not model_size or a_is_new or b_is_new:
        return 0.5, 0.5

    prob_a, prob_b = choix.probabilities([a, b], self.model)
    return nanguard(prob_a, "ra"), nanguard(prob_b, "rb")
def prepare_pairs(self, stats):
    """Turn ``stats.pair_wins`` into weighted ``(winner_id, loser_id, weight)`` rows.

    A pair is skipped when it is in ``stats.incomparable_pairs``, when
    ``self.is_dropped`` returns more than 1 for either member, when either
    member is missing from ``self.ids``, or when its win totals sum to zero.
    For the rest the two win totals are normalised to a ratio ``r`` and up to
    two rows are emitted: ``(id0, id1, r)`` and ``(id1, id0, 1 - r)``. A row
    with a zero weight is omitted.

    Args:
        stats: object providing ``pair_wins``, ``incomparable_pairs`` and
            ``too_close``.

    Returns:
        list of ``(id, id, weight)`` tuples.
    """
    from web.util import nanguard
    from web import opts

    rows = []
    for pair, rel_wins in stats.pair_wins.items():
        if pair in stats.incomparable_pairs:
            continue

        first, second = pair[0], pair[1]
        if (self.is_dropped(stats, first) > 1
                or self.is_dropped(stats, second) > 1):
            continue
        if first not in self.ids or second not in self.ids:
            continue

        if pair in stats.too_close:
            rel_wins = tuple([
                nanguard(x + sum(rel_wins) +
                         opts.too_close_boost * stats.too_close[pair])
                for x in rel_wins
            ])
        if not sum(rel_wins):
            continue

        ratio = nanguard(rel_wins[0] / (rel_wins[0] + rel_wins[1]))
        forward = ratio
        backward = 1 - ratio

        id0 = self.ids[first]
        id1 = self.ids[second]
        if forward:
            rows.append((id0, id1, nanguard(forward)))
        if backward:
            rows.append((id1, id0, nanguard(backward)))

    return rows
def build_sim_data(self, stats):
    """Build edge lists and pairwise edge ratios from ``stats.triplet_diffs``.

    Every key of ``stats.triplet_diffs`` is an iterable of items and every
    value a parallel iterable of numbers (presumably three of each, going by
    the name -- confirm). Keys containing a dropped item are ignored.

    Returns:
        ``(edges, edge_ratios, targ)`` where

        * ``edges`` lists every 2-combination of the similarity ids of each
          kept key;
        * ``edge_ratios`` holds, for every pair of such edges within one key,
          ``((edge_a, edge_b), weight_a / (weight_a + weight_b),
          weight_a + weight_b)``, an edge's weight being the sum of its two
          endpoint values;
        * ``targ`` is the list of the ratio column of ``edge_ratios``.
    """
    from web.util import nanguard

    triplets = []
    for key, vals in stats.triplet_diffs.items():
        if any(self.is_dropped(stats, member) for member in key):
            continue
        triplets.append((tuple(self.getid_sim(member) for member in key),
                         vals))

    edges = []
    for ids, _ in triplets:
        edges.extend(itertools.combinations(ids, 2))

    edge_ratios = []
    for ids, vals in triplets:
        # One record per edge: (endpoint ids, summed endpoint values, and the
        # two (id, value) vertices it joins).
        sides = [((vert1[0], vert2[0]), nanguard(vert1[1] + vert2[1]), vert1,
                  vert2)
                 for vert1, vert2 in itertools.combinations(
                     zip(ids, vals), 2)]
        for side_a, side_b in itertools.combinations(sides, 2):
            ratio = nanguard(side_a[1] / (side_a[1] + side_b[1]))
            combined = nanguard(side_a[1] + side_b[1])
            edge_ratios.append(((side_a[0], side_b[0]), ratio, combined))

    targ = [nanguard(entry[1]) for entry in edge_ratios]
    return edges, edge_ratios, targ
def extend_model(self, stats):
    """Append starting values to ``self.model`` for items that lack one.

    First every member of every pair in ``stats.pair_wins`` is passed to
    ``self.getid``. If the model is non-empty and ``self.all_items`` is
    longer than it, each extra item gets the ``"mid"`` value reported by
    ``self.calc_next_index`` for the distances from
    ``self.calculate_dists``, and those values are concatenated onto
    ``self.model``. An empty model is left untouched.

    Args:
        stats: object providing ``pair_wins`` and ``comparisons``.

    Raises:
        Exception: the number of computed values differs from the number of
            missing entries.
    """
    from web.util import nanguard

    for pair, _ in stats.pair_wins.items():
        self.getid(pair[0])
        self.getid(pair[1])

    known = len(self.model)
    if not known or len(self.all_items) <= known:
        return

    expected = len(self.all_items) - known
    pending = self.all_items[known:]

    newvals = []
    for h in pending:
        dists, weights = self.calculate_dists(stats,
                                              stats.comparisons.get(h, {}), h)
        _, info = self.calc_next_index(h, dists, weights, {})
        newvals.append(nanguard(info["mid"]))

    if len(newvals) != expected:
        raise Exception()
    self.model = numpy.concatenate((self.model, newvals))
def calculate_nearest_neighborhood(self,
                                   stats,
                                   hashes_to_debug,
                                   extra=False,
                                   save=True):
    """Rebuild the search pool, the inversion table and per-item distances.

    For every usable pair in ``stats.pair_wins`` the model value of the loser
    is recorded against the winner and vice versa, together with a smoothed
    weight. ``self.calc_next_index`` is then run for every non-dropped item,
    and items that still need searching (or have no recorded wins) and are in
    ``self.bh2`` are put in the search pool. A detailed table is printed for
    each hash in ``hashes_to_debug``.

    Args:
        stats: object providing ``pair_wins``, ``incomparable_pairs``,
            ``too_close``, ``win_counts`` and ``loss_counts``.
        hashes_to_debug: iterable of hashes to print a debug table for.
        extra: accepted for interface compatibility; not read here.
        save: when true, bind the result dicts to ``self.searching_pool``,
            ``self.inversions`` and ``self.distances``.

    Returns:
        None. Results are stored on ``self`` (when ``save``) and printed.

    NOTE(review): this block reached review with its indentation collapsed.
    The nesting marked below was reconstructed -- confirm against version
    control.
    """
    from web import opts
    # nanguard is needed by the too_close branch below; it is imported here
    # like in the sibling methods instead of relying on module scope.
    from web.util import nanguard, timing

    with timing("calculate_nearest_neighborhood", 0.1):
        distances = {h: ([-50], [50]) for h in self.all_items}
        weights = {h: ([1], [1]) for h in self.all_items}
        sp = {}
        if save:
            self.searching_pool = sp
        iv = {}
        if save:
            self.inversions = iv
        inversions = {}
        inversion_pool = set()
        dists = {}
        if save:
            self.distances = dists

        for pair, rel_wins in stats.pair_wins.items():
            if self.is_dropped(stats, pair[0]) or self.is_dropped(
                    stats, pair[1]):
                continue
            if pair in stats.incomparable_pairs:
                continue
            if pair in stats.too_close:
                rel_wins = tuple([
                    nanguard(x + sum(rel_wins) +
                             opts.too_close_boost * stats.too_close[pair])
                    for x in rel_wins
                ])
            win_ratio = (rel_wins[0]) / (rel_wins[0] + rel_wins[1])
            win, loss = pair[0], pair[1]
            count = rel_wins[0] + rel_wins[1]
            if win_ratio < 0.5:
                # Orient the pair so that `win` is the side with more wins.
                rel_wins = rel_wins[::-1]
                win, loss = pair[1], pair[0]
                win_ratio = 1 - win_ratio
            win_prob, inverted, details = self.check_inversion(stats, pair)
            if win_prob < 0.5:
                win_inversions, _ = inversions.setdefault(win, ([], []))
                _, loss_inversions = inversions.setdefault(loss, ([], []))
                win_inversions.append((pair, win_ratio, win_prob, count))
                loss_inversions.append((pair, win_ratio, win_prob, count))
            # NOTE(review): reconstructed as a sibling of the block above,
            # not nested inside it -- confirm.
            if inverted:
                inversion_pool.add(pair[0])
                inversion_pool.add(pair[1])
                iv[pair] = inverted
            decayed_ratio = (((rel_wins[0] + 1) /
                              (rel_wins[0] + rel_wins[1] + 2)) - 0.5) * 2
            if win_ratio < opts.ambiguity_threshold:
                # don't include ambiguous comparisons when tallying distances
                # should reduce risk of getting in tangles
                continue
            wval = self.getval(win)
            lval = self.getval(loss)
            if wval is None or lval is None:
                continue
            distances[win][0].append(lval)
            distances[loss][1].append(wval)
            weights[win][0].append(decayed_ratio)
            weights[loss][1].append(decayed_ratio)

        modelmin = self.min()
        modelmax = self.max()
        for h, vals in distances.items():
            if self.is_dropped(stats, h):
                continue
            nextidx, _ = self.calc_next_index(h,
                                              vals,
                                              weights.get(h),
                                              dists,
                                              existing_val=self.getval(h))
            if (nextidx is not None
                    or not stats.win_counts.get(h)) and h in self.bh2:
                sp[h] = True

        print(
            "len(sp)",
            len(sp),
            "len(distances)",
            len(distances),
            len(self.bh2),
            len([x for x in self.all_items if not stats.win_counts.get(x)]),
            len([
                x for x in self.all_items if not stats.win_counts.get(x)
                and not self.is_dropped(stats, x)
            ]),
        )

        # Debug table: one block of rows per requested hash.
        for h in hashes_to_debug:
            hidx = self.sorted_ids.get(h, -1)
            in_pool = h in sp
            in_inv_pool = h in inversion_pool
            # Rebinds the local name only; the dict stored on
            # self.distances is unaffected.
            dists = distances[h]
            win_inversions, loss_inversions = inversions.get(h, ([], []))
            wdist, ldist = self.weighted_softmin(
                *dists[0]), self.weighted_softmin(*dists[1])
            val = self.getval(h)
            pos = (val - modelmin) / (modelmax - modelmin)
            delta = int(
                len(self.model) /
                (opts.min_target_precision +
                 (pos**opts.target_precision_curve) *
                 opts.target_precision_top))
            lc = stats.loss_counts.get(h, 0)
            wc = stats.win_counts.get(h, 0)
            rows = []
            low = val - wdist
            high = val + ldist
            widx2, lidx2 = self.getidx(low), self.getidx(high)
            widx, lidx = max(0, self.getidx(val - wdist)), min(
                len(self.model) - 1, self.getidx(val + ldist))
            midx = (widx + lidx) // 2
            mval = self.sorted_model[midx]
            mdist = mval - val
            low = max(low, -modelmax)
            high = min(high, modelmax)
            vval = (low + high) / 2
            vdist = vval - val
            vidx = self.getidx(vval)
            waidx = max(0, hidx - 1)
            laidx = min(len(self.model) - 1, hidx + 1)
            la_h = self.sorted_hashes[laidx]
            wa_h = self.sorted_hashes[waidx]
            la_val = self.getval(la_h)
            wa_val = self.getval(wa_h)
            wtidx = hidx - delta
            ltidx = hidx + delta
            wthresh_h = self.sorted_hashes[max(0, wtidx)]
            lthresh_h = self.sorted_hashes[min(ltidx,
                                               len(self.sorted_hashes) - 1)]
            wthresh_val = self.getval(wthresh_h)
            lthresh_val = self.getval(lthresh_h)

            print()
            print(f"wc={wc:2d}")
            print(f"lc={lc:2d}")
            print(f"lc+wc={lc+wc:2d}")
            print(f"in_pool={in_pool}")
            print(f"in_inv_pool={in_inv_pool}")

            for pair, win_ratio, win_prob, count in win_inversions:
                other = [x for x in pair if x != h][0]
                other_idx = self.sorted_ids[other]
                other_val = self.getval(other)
                rows.append((
                    other_val - val, 9, "iw", other_val, other_idx,
                    other_idx - hidx,
                    f"unexpected win; win ratio: {win_ratio} ({count} samples), expected win prob: {win_prob}"
                ))
            for pair, loss_ratio, loss_prob, count in loss_inversions:
                other = [x for x in pair if x != h][0]
                other_idx = self.sorted_ids[other]
                other_val = self.getval(other)
                rows.append((
                    other_val - val, 3, "il", other_val, other_idx,
                    other_idx - hidx,
                    f"unexpected loss; loss ratio: {loss_ratio} ({count} samples), expected loss prob: {loss_prob}"
                ))
            for windist in dists[0]:
                if windist == 9 or windist < 0:
                    continue
                rows.append(
                    (-windist, 0, "win", val - windist,
                     self.getidx(val - windist),
                     self.getidx(val - windist) - hidx, "expected win"))
            for lossdist in dists[1]:
                if lossdist == 9 or lossdist < 0:
                    continue
                rows.append(
                    (lossdist, 12, "los", val + lossdist,
                     self.getidx(val + lossdist),
                     self.getidx(val + lossdist) - hidx, "expected loss"))

            rows.append((modelmin - val, 1, "W", modelmin, 0, 0 - hidx,
                         "model boundary low"))
            rows.append((-wdist, 1, "w", val - wdist, widx2, widx2 - hidx,
                         "win boundary"))
            rows.append(
                (wthresh_val - val, 2, "wt", wthresh_val, wtidx,
                 wtidx - hidx, "search precision threshold, win side"))
            rows.append((vdist, 4, "v", vval, vidx, vidx - hidx,
                         "midpoint in value space"))
            rows.append((mdist, 5, "m", mval, midx, midx - hidx,
                         "midpoint in index space"))
            rows.append((wa_val - val, 6, "wa", wa_val, waidx, waidx - hidx,
                         "prev neighbor"))
            rows.append((0, 7, "", val, hidx, 0, "item"))
            rows.append((la_val - val, 8, "la", la_val, laidx, laidx - hidx,
                         "next neighbor"))
            rows.append(
                (lthresh_val - val, 10, "lt", lthresh_val, ltidx,
                 ltidx - hidx, "search precision threshold, loss side"))
            rows.append((ldist, 11, "l", val + ldist, lidx2, lidx2 - hidx,
                         "loss boundary"))
            rows.append(
                (modelmax - val, 11, "L", modelmax, len(self.model) - 1,
                 (len(self.model) - 1) - hidx, "model boundary high"))

            # Rows are ordered by model index, then by signed distance. The
            # loop reuses `val`, which is reassigned at the top of the next
            # pass over hashes_to_debug.
            for dist, step, label, val, idx, idxdist, desc in sorted(
                    rows, key=lambda x: (x[4], x[0])):
                prefix = label.rjust(step * 2).ljust(24)
                label = label.rjust(3)
                print(
                    f" {prefix} | {label}dist={dist:7.4f} {label}val={val:7.4f} {label}idx={idx:5d} {label}idxdist={idxdist:5d} {desc}"
                )

        # NOTE(review): placed once, after the debug loop -- confirm.
        print(f"searching_pool: {len(sp)}, inversions: {len(iv)}")
def getval(self, h):
    """Return the model value for ``h``, or ``None`` if the model lacks it.

    The id comes from ``self.getid(h)``. An id at or beyond the end of
    ``self.model`` has no value yet.
    """
    from web.util import nanguard

    index = self.getid(h)
    if index < len(self.model):
        return nanguard(self.model[index])
    return None
def getidx(self, val):
    """Return the insertion index of ``val`` in ``self.sorted_model``.

    The index is whatever ``numpy.searchsorted`` reports with its default
    arguments, passed through ``nanguard``.
    """
    from web.util import nanguard

    position = numpy.searchsorted(self.sorted_model, val)
    return nanguard(position)
def update(self, item, initial=False):
    """Fold one recorded judgement into the statistics held on ``self``.

    Depending on the record this updates ``self.dislike``,
    ``self.too_close`` or ``self.incomparable_pairs``, or calls
    ``self.record_win`` / ``self.record_similar``.

    Args:
        item: dict record. It is read for ``"items"``, ``"viewstart"``,
            ``"viewend"``, ``"fast"``, ``"info"`` and either
            ``"similarity"`` or ``"preference"``; ``"dur"`` is written.
            Timestamps are divided by 1000 before being compared with
            ``time.time()`` (presumably milliseconds -- confirm).
        initial: when true, suppress the per-record progress print.

    Returns:
        None. Returns early without recording anything when the view
        duration is below ``opts.minview`` (unless the record is flagged
        ``"fast"``) or when any item of the record is marked as disliked.
    """
    from web.util import nanguard, as_pair, sigmoid
    from web import opts

    age = time.time() - (nanguard(item.get("viewend", 0), "update.viewend") /
                         1000)
    decay = nanguard(opts.comparison_decay_func(age))
    item["dur"] = nanguard(
        item.get("viewend", 0) - item.get("viewstart", 0), "update.dur")
    mag_decay = nanguard(
        max(
            min(opts.initial_mag,
                opts.initial_mag / max(1, (item["dur"] / 1000))),
            opts.min_mag))

    if nanguard(item.get("dur", 0)) < opts.minview and not item.get("fast"):
        print("\033[31mskipped due to too-low view duration", item, "\033[m")
        return
    if not initial:
        print(
            f"\033[38mvd: {item.get('dur',0)}, mag_decay: {mag_decay}, age: {age}, time_decay: {decay}\033[m"
        )

    pair = as_pair(*item["items"])

    # All three are bound on every path, so the dispatch chain at the bottom
    # can never hit an unbound name.
    winner = None
    least_similar = None
    most_similar = None
    if "similarity" in item:
        info = item["similarity"]
        least_similar = info.get("least_similar", None)
        most_similar = info.get("most_similar", None)
        if isinstance(most_similar, list):
            # A list names the most similar items; the index left over out
            # of 0..2 is the least similar one.
            least_similar = list(set(range(3)) - set(most_similar))[0]
            most_similar = None
    elif not isinstance(item.get("preference", None), dict):
        info = {}
        winner = nanguard(item.get("preference", 1) - 1)
    else:
        info = item.get("preference", {})
        winner = nanguard(info.get("prefer", 1) - 1)

    too_close = info.get("too_close", False)
    incomparable = info.get("incomparable", False)
    dislike = info.get("dislike", None)

    if isinstance(item.get("info"),
                  dict) and item["info"].get("t") == [
                      "inversions", "inversions"
                  ]:
        mag_decay = mag_decay * opts.inversion_compare_boost + sum(
            self.pair_wins.get(
                pair, [0, 0])) * opts.inversion_compare_relboost * mag_decay

    if not dislike:
        dislike = [0] * len(item["items"])
    for f, dis in zip(item["items"], dislike):
        if dis:
            self.dislike[f["hash"]] = True
        elif f["hash"] in self.dislike:
            del self.dislike[f["hash"]]
    if any(dislike):
        return

    if too_close:
        self.too_close[pair] = self.too_close.get(pair, 0) + nanguard(
            2 * decay * (1 - sigmoid(mag_decay)))
    elif incomparable:
        pair = as_pair(*item["items"], strip=True)
        self.incomparable_pairs[pair] = self.incomparable_pairs.get(
            pair, 0) + nanguard(decay / mag_decay)
    elif winner is not None:
        winning = item["items"][winner]
        losing = item["items"][1 - winner]
        self.record_win(winning, losing, nanguard(decay),
                        nanguard(mag_decay))
    elif least_similar is not None or most_similar is not None:
        sim = least_similar if least_similar is not None else most_similar
        assert 0 <= sim <= 2
        s1, s2 = [x for i, x in enumerate(item["items"]) if i != sim]
        s3 = item["items"][sim]
        self.record_similar(s1,
                            s2,
                            s3,
                            nanguard(decay),
                            nanguard(mag_decay),
                            invert=(most_similar is not None))