def __init__(self, env, t, alphas=None, betas=None, thres=0.5, iters_per_arm=100):
    self.env = snapshot(env, t)
    self.t = t
    self.arms = two_cycles(self.env, t)
    self.n_arms = len(self.arms)
    self.iters_per_arm = iters_per_arm
    self.thres = thres
    # Prior successes
    if alphas is None:
        self.alphas = np.ones(self.n_arms)
    else:
        self.alphas = alphas
    # Prior failures
    if betas is None:
        self.betas = np.ones(self.n_arms)
    else:
        self.betas = betas
    self.s = np.zeros(self.n_arms)  # Successes
    self.f = np.zeros(self.n_arms)  # Failures
    self.r = np.zeros(self.n_arms)  # Rewards
    self.n = np.zeros(self.n_arms)  # Visits
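# Illustrative sketch (not part of this repo): with the Beta priors and counters
# initialized above, a standard Thompson sampling step would draw one sample per
# arm from Beta(alpha + s, beta + f) and play the argmax. The `choose_arm` and
# `update` names below are hypothetical, as is the use of `thres` to binarize rewards.
def choose_arm(self):
    draws = np.random.beta(self.alphas + self.s, self.betas + self.f)
    return int(np.argmax(draws))

def update(self, arm, reward):
    self.n[arm] += 1
    self.r[arm] += reward
    if reward > self.thres:  # Count rewards above the threshold as successes
        self.s[arm] += 1
    else:
        self.f[arm] += 1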
def __init__(self, env, t, n_iters=10, method="cycles", mix=0.5, none_prob=None):
    self.env = snapshot(env, t)
    self.t = t
    self.n_iters = n_iters
    self.method = method
    if method == "cycles":
        self.arms = list(
            map(lambda x: tuple(sorted(x)), two_cycles(self.env, t))) + [None]
    elif method == "pairs":
        self.arms = self.env.get_living(self.t) + [None]
    else:
        raise ValueError("Unknown optimal simulation method.")
    self.n_arms = len(self.arms)
    self.rnn = torch.load("results/policy_function_lstm")
    self.mix = mix
    self.none_prob = none_prob
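# Illustrative sketch (not part of this repo): one way the `mix` parameter above
# could blend the RNN's arm probabilities with a uniform distribution. The
# `arm_probabilities` method name and the `rnn_probs` argument are hypothetical.
def arm_probabilities(self, rnn_probs):
    uniform = np.full(self.n_arms, 1 / self.n_arms)
    return self.mix * rnn_probs + (1 - self.mix) * uniform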
def test_two_cycles_all(env):
    cycles = two_cycles(env, 4)
    for i, j in cycles:
        assert env.has_edge(i, j)
        assert env.has_edge(j, i)
        assert env.node[i]["entry"] <= env.node[j]["death"]
        assert env.node[j]["entry"] <= env.node[i]["death"]
        assert env.node[i]["d_blood"] == 0 or \
            env.node[j]["p_blood"] == 3 or \
            env.node[i]["d_blood"] == env.node[j]["p_blood"]
def __init__(self, env, t, priors, prior_counts, algo="opt"):
    self.env = snapshot(env, t)
    self.t = t
    self.arms = two_cycles(self.env, t)
    self.n_arms = len(self.arms)
    self.solver = optimal
    self.horizons = [1, 5, 10, 20]
    # Prior pseudo-counts of successes per arm (epsilon avoids exact zeros)
    self.successes = prior_counts * get_cycle_probabilities(
        self.env.get_living(t), self.arms, priors) + 1e-8
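# Illustrative sketch (not part of this repo): one way the prior success counts
# above could seed a sampler is to normalize them into selection probabilities.
# The `sample_arm` method name is hypothetical.
def sample_arm(self):
    p = self.successes / self.successes.sum()
    return int(np.random.choice(self.n_arms, p=p))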
def __init__(self, env, t, gamma=.1, iters_per_arm=100, thres=0.5):
    self.env = env
    self.t = t
    self.arms = two_cycles(self.env, t)
    self.n_arms = len(self.arms)
    self.w = np.ones(self.n_arms)                               # EXP3 weights
    self.p = np.full_like(self.w, fill_value=1 / self.n_arms)   # Arm probabilities
    self.gamma = gamma
    self.r = np.zeros(self.n_arms)  # Rewards
    self.n = np.zeros(self.n_arms)  # Visits
    self.iters_per_arm = iters_per_arm
    self.thres = thres
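# Illustrative sketch (not part of this repo): a standard EXP3 step using the
# weights, probabilities, and gamma initialized above. Method names are hypothetical.
def choose_arm(self):
    self.p = (1 - self.gamma) * self.w / self.w.sum() + self.gamma / self.n_arms
    return int(np.random.choice(self.n_arms, p=self.p))

def update(self, arm, reward):
    self.n[arm] += 1
    self.r[arm] += reward
    x_hat = reward / self.p[arm]  # Importance-weighted reward estimate
    self.w[arm] *= np.exp(self.gamma * x_hat / self.n_arms)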
def __init__(self, env, t, c=2, iters_per_arm=100, thres=0.5):
    self.env = snapshot(env, t)
    self.t = t
    self.arms = two_cycles(self.env, t)
    self.n_arms = len(self.arms)
    self.c = c  # Exploration constant
    self.r = np.zeros(self.n_arms)  # Rewards
    self.n = np.zeros(self.n_arms)  # Visits
    self.iters_per_arm = iters_per_arm
    self.thres = thres
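# Illustrative sketch (not part of this repo): a UCB1-style selection rule using
# the reward and visit counters above, with `c` scaling the exploration bonus.
# The `choose_arm` name is hypothetical.
def choose_arm(self):
    if np.any(self.n == 0):  # Play every arm once before trusting the index
        return int(np.argmin(self.n))
    means = self.r / self.n
    bonus = np.sqrt(self.c * np.log(self.n.sum()) / self.n)
    return int(np.argmax(means + bonus))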
def __init__(self, env, t, method="cycles"):
    self.env = snapshot(env, t)
    self.t = t
    self.method = method
    if method == "cycles":
        self.arms = list(
            map(lambda x: tuple(sorted(x)), two_cycles(self.env, t))) + [None]
    elif method == "pairs":
        self.arms = self.env.get_living(self.t) + [None]
    else:
        raise ValueError("Unknown optimal simulation method.")
    self.n_arms = len(self.arms)
def __init__(self, env, t: int, gamma: float = .1, iters_per_arm: int = 100, max_match: int = 5):
    self.env = env
    self.t = t
    self.max_match = max_match
    self.cycles = two_cycles(env, t) + [()]
    self.arms = get_arm_matrix(self.cycles, max_match)
    self.n_arms = len(self.arms)
    # Horizon: 90th percentile of the geometric death-time distribution (at least 2)
    self.h = max(2, int(geom(env.death_rate).ppf(.9)))
    self.w = np.ones(self.n_arms)
    self.p = np.full_like(self.w, fill_value=1 / self.n_arms)
    self.q = np.full_like(self.w, fill_value=1 / self.n_arms)
    self.mu = np.full_like(self.w, fill_value=1 / self.n_arms)
    self.gamma = gamma
    self.iters_per_arm = iters_per_arm
opt = optimal(env)
gre = greedy(env)
o = get_n_matched(opt["matched"], 0, env.time_length)
g = get_n_matched(gre["matched"], 0, env.time_length)
rewards = np.zeros(env.time_length)

#%%
np.random.seed(clock_seed())
for t in range(env.time_length):
    probs, count = evaluate_policy(net, env, t, dtype="numpy")
    for i in range(count):
        probs, _ = evaluate_policy(net, env, t, dtype="numpy")
        cycles = two_cycles(env, t)
        if len(cycles) == 0:
            break
        elif len(cycles) == 1:
            res = cycles.pop()
        else:
            sim = MonteCarlo(env, t, probs, n_prior)
            res = sim.simulate(n_sims)
        env.removed_container[t].update(res)
        probs, count = evaluate_policy(net, env, t, dtype="numpy")
    rewards[t] = len(env.removed_container[t])
    print(t,
          np.mean(rewards[:t + 1]),
          np.mean(g[:t + 1]),
          np.mean(o[:t + 1]))