def experiment():
    np.random.seed()

    # MDP
    mdp = generate_simple_chain(state_n=5, goal_states=[2], prob=.8, rew=1,
                                gamma=.9)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = QLearning(pi, mdp.info, agent_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1)
def experiment2():
    np.random.seed(3)
    print('mushroom :')

    # MDP
    mdp = generate_simple_chain(state_n=5, goal_states=[2], prob=.8, rew=1,
                                gamma=.9)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = QLearning(pi, mdp.info, agent_params)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    dataset = collect_dataset.get()

    return agent.Q.table
def experiment():
    np.random.seed(3)
    print('hierarchical :')

    # MDP
    mdp = generate_simple_chain(state_n=5, goal_states=[2], prob=.8, rew=1,
                                gamma=.9)

    # Model Block
    model_block = MBlock(env=mdp, render=False)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = QLearning(pi, mdp.info, agent_params)

    # Control Block
    control_block = ControlBlock(wake_time=1, agent=agent, n_eps_per_fit=None,
                                 n_steps_per_fit=1)

    # Algorithm
    blocks = [model_block, control_block]
    order = [0, 1]
    model_block.add_input(control_block)
    control_block.add_input(model_block)
    control_block.add_reward(model_block)
    computational_graph = ComputationalGraph(blocks=blocks, order=order)
    core = HierarchicalCore(computational_graph)

    # Train
    core.learn(n_steps=100, quiet=True)

    return agent.Q.table
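# A minimal usage sketch, not part of the original source: assuming the flat
# mushroom run (experiment2) and the hierarchical run (experiment) above live
# in the same module, their Q tables can be compared directly, since both use
# seed 3 and 100 learning steps. The helper name is hypothetical.
def compare_chain_q_tables():
    q_flat = experiment2()
    q_hierarchical = experiment()
    print('Q tables match:', np.allclose(q_flat, q_hierarchical))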
def experiment():
    np.random.seed(3)
    print('hierarchical :')

    # MDP
    mdp = GridWorldVanHasselt()

    # Model Block
    model_block = MBlock(env=mdp, render=False)

    # Policy
    epsilon = ExponentialDecayParameter(value=1, decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1., decay_exp=1.,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = QLearning(pi, mdp.info, agent_params)

    # Control Block
    control_block = ControlBlock(name='controller', agent=agent,
                                 n_steps_per_fit=1)

    # Algorithm
    blocks = [model_block, control_block]
    order = [0, 1]
    model_block.add_input(control_block)
    control_block.add_input(model_block)
    control_block.add_reward(model_block)
    computational_graph = ComputationalGraph(blocks=blocks, order=order)
    core = HierarchicalCore(computational_graph)

    # Train
    core.learn(n_steps=2000, quiet=True)

    return agent.Q.table
def experiment2():
    np.random.seed(3)
    print('mushroom :')

    # MDP
    mdp = GridWorldVanHasselt()

    # Policy
    epsilon = ExponentialDecayParameter(value=1, decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1., decay_exp=1.,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = QLearning(pi, mdp.info, agent_params)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=2000, n_steps_per_fit=1, quiet=True)

    dataset = collect_dataset.get()
    VisualizeControlBlock(dataset)

    return agent.Q.table
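# A small sketch, again an assumption rather than part of the original file:
# the Q tables returned by the two grid-world runs above can be reduced to
# greedy policies with plain numpy and compared state by state (assuming the
# table is laid out as states x actions).
def greedy_policy(q_table):
    # One greedy action per state; ties broken by the lowest action index.
    return np.argmax(q_table, axis=1)

# Example:
#     np.array_equal(greedy_policy(experiment()), greedy_policy(experiment2()))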
def experiment(policy, name, alg_version):
    np.random.seed()

    # MDP
    if name == "Taxi":
        mdp = generate_taxi('../grid.txt')
        max_steps = 100000
        evaluation_frequency = 2000
        test_samples = 10000
    elif name == "NChain-v0":
        mdp = generate_chain(horizon=1000)
        max_steps = 5000
        evaluation_frequency = 100
        test_samples = 10000
    elif name == "Loop":
        mdp = generate_loop(horizon=1000)
        max_steps = 5000
        evaluation_frequency = 100
        test_samples = 10000
    elif name == "SixArms":
        mdp = generate_arms(horizon=1000)
        max_steps = 25000
        evaluation_frequency = 500
        test_samples = 10000
    elif name == "RiverSwim":
        mdp = generate_river(horizon=1000)
        max_steps = 5000
        evaluation_frequency = 100
        test_samples = 10000
    else:
        raise NotImplementedError

    # Policy
    # epsilon = ExponentialDecayParameter(value=1., decay_exp=.5,
    #                                     size=mdp.info.observation_space.size)
    epsilon_train = ExponentialDecayParameter(
        value=1., decay_exp=.5, size=mdp.info.observation_space.size)
    epsilon_test = Parameter(0)
    pi = policy(epsilon=epsilon_train)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1., decay_exp=.2,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = QLearning(pi, mdp.info, **algorithm_params)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks)

    scores = list()
    scores_train = list()

    # Train
    for n_epoch in range(1, max_steps // evaluation_frequency + 1):
        print('- Learning:')
        # learning step
        pi.set_epsilon(epsilon_train)
        core.learn(n_steps=evaluation_frequency, n_steps_per_fit=1,
                   quiet=False)
        dataset = collect_dataset.get()
        if name == "Taxi":
            scores_train.append(get_stats(dataset))
        elif name in ["SixArms"]:
            scores_train.append(compute_scores_Loop(dataset, horizon=500))
        else:
            scores_train.append(compute_scores_Loop(dataset))
        collect_dataset.clean()
        mdp.reset()

        print('- Evaluation:')
        # evaluation step
        pi.set_epsilon(epsilon_test)
        dataset = core.evaluate(n_steps=test_samples, quiet=False)
        mdp.reset()
        scores.append(get_stats(dataset))
        # np.save(env + '/' + alg_version + '_scores.npy', scores)

    return scores_train, scores
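# A usage sketch, not taken from the original script: `policy` is expected to
# be a policy class accepting an `epsilon` keyword (e.g. the EpsGreedy class
# used in the snippets above), `name` selects one branch of the if/elif chain,
# and `alg_version` is only a label. The wrapper name and the chosen values
# are illustrative.
def run_loop_q_learning():
    return experiment(EpsGreedy, 'Loop', 'ql')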
def experiment(algorithm, name, update_mode, update_type, policy,
               n_approximators, q_max, q_min, lr_exp, double, file_name,
               out_dir, collect_qs, seed):
    set_global_seeds(seed)
    print('Using seed %s' % seed)

    # MDP
    if name == 'Taxi':
        mdp = generate_taxi('../../grid.txt', horizon=5000)
        max_steps = 500000
        evaluation_frequency = 5000
        test_samples = 5000
    elif name == 'Chain':
        mdp = generate_chain(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'Loop':
        mdp = generate_loop(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'RiverSwim':
        mdp = generate_river(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'SixArms':
        mdp = generate_arms(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'KnightQuest':
        mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    else:
        raise NotImplementedError

    epsilon_test = Parameter(0)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1., decay_exp=lr_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)

    if algorithm == 'ql':
        if policy not in ['boltzmann', 'eps-greedy']:
            warnings.warn(
                'QL available with only boltzmann and eps-greedy policies!')
            policy = 'eps-greedy'
        if policy == 'eps-greedy':
            epsilon_train = ExponentialDecayParameter(
                value=1., decay_exp=.5, size=mdp.info.observation_space.size)
            pi = policy_dict[policy](epsilon=epsilon_train)
        else:
            beta_train = ExponentialDecayParameter(
                value=1.5 * q_max, decay_exp=.5,
                size=mdp.info.observation_space.size)
            pi = policy_dict[policy](beta=beta_train)
        if double:
            agent = DoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = QLearning(pi, mdp.info, **algorithm_params)
    elif algorithm == 'boot-ql':
        if policy not in ['boot', 'weighted']:
            warnings.warn(
                'Bootstrapped QL available with only boot and weighted '
                'policies!')
            policy = 'boot'
        pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                mu=(q_max + q_min) / 2,
                                sigma=q_max - q_min,
                                **algorithm_params)
        if double:
            agent = BootstrappedDoubleQLearning(pi, mdp.info,
                                                **algorithm_params)
        else:
            agent = BootstrappedQLearning(pi, mdp.info, **algorithm_params)
        epsilon_train = Parameter(0)
    elif algorithm == 'particle-ql':
        if policy not in ['weighted', 'vpi']:
            warnings.warn(
                'Particle QL available with only vpi and weighted policies!')
            policy = 'weighted'
        pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                update_mode=update_mode,
                                update_type=update_type,
                                q_max=q_max,
                                q_min=q_min,
                                **algorithm_params)
        if double:
            agent = ParticleDoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = ParticleQLearning(pi, mdp.info, **algorithm_params)
        epsilon_train = Parameter(0)
    else:
        raise ValueError()

    # Algorithm
    collect_dataset = CollectDataset()
    collect_qs_callback = CollectQs(agent.approximator)
    callbacks = [collect_dataset]
    if collect_qs:
        callbacks += [collect_qs_callback]
    core = Core(agent, mdp, callbacks)

    train_scores = []
    test_scores = []

    for n_epoch in range(1, max_steps // evaluation_frequency + 1):
        # Train
        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_train)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(False)
        core.learn(n_steps=evaluation_frequency, n_steps_per_fit=1, quiet=True)
        dataset = collect_dataset.get()
        scores = compute_scores(dataset, mdp.info.gamma)
        # print('Train: ', scores)
        train_scores.append(scores)
        collect_dataset.clean()
        mdp.reset()

        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_test)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(True)
        dataset = core.evaluate(n_steps=test_samples, quiet=True)
        mdp.reset()
        scores = compute_scores(dataset, mdp.info.gamma)
        # print('Evaluation: ', scores)
        test_scores.append(scores)

    if collect_qs:
        qs = collect_qs_callback.get_values()
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        np.save(out_dir + '/' + file_name, qs)

    return train_scores, test_scores
def experiment(algorithm, name, update_mode, update_type, policy,
               n_approximators, q_max, q_min, lr_exp, R, log_lr, r_max_m,
               delayed_m, delayed_epsilon, delta, debug, double, regret_test,
               a, b, mbie_C, value_iterations, tolerance, file_name, out_dir,
               collect_qs, seed):
    set_global_seeds(seed)
    print('Using seed %s' % seed)

    # MDP
    if name == 'Taxi':
        mdp = generate_taxi('../grid.txt', horizon=5000, gamma=0.99)
        max_steps = 500000
        evaluation_frequency = 5000
        test_samples = 5000
    elif name == 'Chain':
        mdp = generate_chain(horizon=100, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'Gridworld':
        mdp = generate_gridworld(horizon=100, gamma=0.99)
        max_steps = 500000
        evaluation_frequency = 5000
        test_samples = 1000
    elif name == 'Loop':
        mdp = generate_loop(horizon=100, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'RiverSwim':
        mdp = generate_river(horizon=100, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
        mbie_C = 0.4
    elif name == 'SixArms':
        mdp = generate_arms(horizon=100, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
        mbie_C = 0.8
    elif name == 'ThreeArms':
        horizon = 100
        mdp = generate_three_arms(horizon=horizon, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'KnightQuest':
        mdp = None
        try:
            mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        except:
            register(
                id='KnightQuest-v0',
                entry_point='envs.knight_quest:KnightQuest',
            )
            mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    else:
        raise NotImplementedError

    epsilon_test = Parameter(0)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1., decay_exp=lr_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)

    if regret_test:
        max_steps = int(args.max_steps_regret * 1e6)
        evaluation_frequency = max_steps // 100
        test_samples = 1000
    if debug:
        max_steps = 100000
        evaluation_frequency = max_steps // 100
        test_samples = 1000

    if algorithm == 'ql':
        if policy not in ['boltzmann', 'eps-greedy']:
            warnings.warn(
                'QL available with only boltzmann and eps-greedy policies!')
            policy = 'eps-greedy'
        if policy == 'eps-greedy':
            epsilon_train = ExponentialDecayParameter(
                value=1., decay_exp=lr_exp,
                size=mdp.info.observation_space.size)
            pi = policy_dict[policy](epsilon=epsilon_train)
        else:
            beta_train = ExponentialDecayParameter(
                value=1.5 * q_max, decay_exp=.5,
                size=mdp.info.observation_space.size)
            pi = policy_dict[policy](beta=beta_train)
        if double:
            agent = DoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = QLearning(pi, mdp.info, **algorithm_params)
    elif algorithm == 'boot-ql':
        if policy not in ['boot', 'weighted']:
            warnings.warn(
                'Bootstrapped QL available with only boot and weighted '
                'policies!')
            policy = 'boot'
        pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                mu=(q_max + q_min) / 2,
                                sigma=(q_max - q_min) / 2,
                                **algorithm_params)
        if double:
            agent = BootstrappedDoubleQLearning(pi, mdp.info,
                                                **algorithm_params)
        else:
            agent = BootstrappedQLearning(pi, mdp.info, **algorithm_params)
        epsilon_train = Parameter(0)
    elif algorithm == 'particle-ql':
        if policy not in ['weighted', 'ucb']:
            warnings.warn(
                'Particle QL available with only ucb and weighted policies!')
            policy = 'weighted'
        if policy == 'ucb':
            pi = UCBPolicy(delta=delta, q_max=R / (1 - mdp.info.gamma))
        else:
            pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                update_mode=update_mode,
                                update_type=update_type,
                                q_max=q_max,
                                q_min=q_min,
                                delta=delta,
                                **algorithm_params)
        if double:
            agent = ParticleDoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = ParticleQLearning(pi, mdp.info, **algorithm_params)
        epsilon_train = Parameter(0)
    elif algorithm == 'r-max':
        thr_1 = int(np.ceil(
            (4 * mdp.info.size[0] * 1.0 / (1 - mdp.info.gamma) * R) ** 3))
        algorithm_params = dict(
            rmax=R,
            s_a_threshold=r_max_m
        )
        agent = RMaxAgent(mdp.info, **algorithm_params)
        pi = agent
        epsilon_train = Parameter(0)
    elif algorithm == 'mbie':
        algorithm_params = dict(
            rmax=R,
            C=mbie_C,
            value_iterations=value_iterations,
            tolerance=tolerance
        )
        agent = MBIE_EB(mdp.info, **algorithm_params)
        pi = agent
        epsilon_train = Parameter(0)
    elif algorithm == 'delayed-ql':
        theoretic_m = delayed_m
        if regret_test:
            gamma = mdp.info.gamma
            Vmax = R / (1 - gamma)
            epsilon = args.delayed_ratio * Vmax
            delayed_epsilon = epsilon * (1 - gamma)
            delta = 0.1
            S, A = mdp.info.size
            theoretic_m = ((1 + gamma * Vmax) ** 2
                           / (2 * delayed_epsilon ** 2)
                           * np.log(3 * S * A / delta
                                    * (1 + S * A
                                       / (delayed_epsilon * (1 - gamma)))))
            if debug:
                print("Delta:{}".format(delta))
                print("R:{}".format(R))
                print("Vmax:{}".format(Vmax))
                print("Gamma:{}".format(mdp.info.gamma))
                print("Epsilon:{}".format(epsilon))
                # print("k:{}".format(k))
                print("m:{}".format(theoretic_m))
                print("S:{}".format(S))
                print("A:{}".format(A))
                input()

            def evaluate_policy(P, R, policy):
                P_pi = np.zeros((S, S))
                R_pi = np.zeros(S)
                for s in range(S):
                    for s1 in range(S):
                        P_pi[s, s1] = np.sum(policy[s, :] * P[s, :, s1])
                    R_pi[s] = np.sum(policy[s, :]
                                     * np.sum(P[s, :, :] * R[s, :, :],
                                              axis=-1))
                I = np.diag(np.ones(S))
                V = np.linalg.solve(I - gamma * P_pi, R_pi)
                return V

        algorithm_params = dict(
            R=R,
            m=theoretic_m,
            delta=delta,
            epsilon=delayed_epsilon,
            **algorithm_params)
        agent = DelayedQLearning(mdp.info, **algorithm_params)
        if regret_test:
            collect_vs_callback = CollectVs(mdp, agent, evaluate_policy,
                                            args.freq_collection)
            if debug:
                print("Q:")
                print(agent.get_approximator()[:, :])
                print("Policy:")
                print(agent.get_policy())
                print("V:{}".format(evaluate_policy(mdp.p, mdp.r,
                                                    agent.get_policy())))
                input()
        pi = agent
        epsilon_train = Parameter(0)
    elif algorithm == 'gaussian-ql':
        if policy not in ['weighted-gaussian', 'ucb']:
            warnings.warn(
                'Particle QL available with only ucb and weighted policies!')
            policy = 'weighted-gaussian'
        if policy == 'ucb':
            pi = UCBPolicy(delta=delta, q_max=R / (1 - mdp.info.gamma))
        else:
            pi = policy_dict[policy]()
        q_0 = (q_max - q_min) / 2
        sigma_0 = (q_max - q_min) / np.sqrt(12)
        C = 2 * R / (np.sqrt(2 * np.pi) * (1 - mdp.info.gamma) * sigma_0)
        sigma_lr = None
        if log_lr:
            sigma_lr = LogarithmicDecayParameter(value=1., C=C,
                                                 size=mdp.info.size)
        init_values = (q_0, sigma_0)
        if regret_test:
            sigma_lr = None
            gamma = mdp.info.gamma
            T = max_steps
            S, A = mdp.info.size
            a = (2 + gamma) / (2 * (1 - gamma))
            b = a - 1
            c = 1
            d = b
            q_max = R / (1 - gamma)
            standard_bound = norm.ppf(1 - delta, loc=0, scale=1)
            # first_fac = np.sqrt(b + T)
            # second_fac = np.sqrt(a * np.log(S*A*T / delta))
            # sigma2_factor = min(np.sqrt(b + T),
            #                     np.sqrt(a * np.log(S*A*T / delta)))
            q_0 = q_max
            sigma1_0 = 0
            # sigma2_0 = (R + gamma * q_max) / (standard_bound * np.sqrt(c-1)) \
            #     * sigma2_factor
            sigma2_0 = ((gamma * q_max) / (c * standard_bound)
                        * np.sqrt(a * np.log(S * A * T / delta)))
            init_values = (q_0, sigma1_0, sigma2_0)
            learning_rate = TheoreticalParameter(a=a, b=b, decay_exp=1,
                                                 size=mdp.info.size)
            learning_rate_sigma1 = TheoreticalParameter(a=a, b=b, decay_exp=1,
                                                        size=mdp.info.size)
            algorithm_params = dict(
                learning_rate=learning_rate,
                sigma_1_learning_rate=learning_rate_sigma1)
            sigma_lr = BetaParameter(c=c, d=d, size=mdp.info.size)

            def evaluate_policy(P, R, policy):
                P_pi = np.zeros((S, S))
                R_pi = np.zeros(S)
                for s in range(S):
                    for s1 in range(S):
                        P_pi[s, s1] = np.sum(policy[s, :] * P[s, :, s1])
                    R_pi[s] = np.sum(policy[s, :]
                                     * np.sum(P[s, :, :] * R[s, :, :],
                                              axis=-1))
                I = np.diag(np.ones(S))
                V = np.linalg.solve(I - gamma * P_pi, R_pi)
                return V

            if debug:
                print("Delta:{}".format(delta))
                print("R:{}".format(R))
                print("Gamma:{}".format(mdp.info.gamma))
                print("mu0:{}".format(q_0))
                print("Sigma1_0:{}".format(sigma1_0))
                print("Sigma2_0:{}".format(sigma2_0))
                print("a:{}".format(a))
                print("b:{}".format(b))
                print("c:{}".format(c))
                print("d:{}".format(d))
                print("T:{}".format(T))
                print("S:{}".format(S))
                print("A:{}".format(A))
                input()

        algorithm_params = dict(
            update_mode=update_mode,
            update_type=update_type,
            sigma_learning_rate=sigma_lr,
            init_values=init_values,
            delta=delta,
            q_max=q_max,
            **algorithm_params)
        if double and not regret_test:
            agent = GaussianDoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = GaussianQLearning(pi, mdp.info, **algorithm_params)
        if regret_test:
            if debug:
                freq = 10
            else:
                freq = args.freq_collection
            collect_vs_callback = CollectVs(mdp, agent, evaluate_policy, freq)
            if debug:
                print("Policy:")
                print(agent.get_policy())
                print("Q")
                for state in range(S):
                    means = np.array(agent.approximator.predict(
                        np.array([state]), idx=0))
                    sigmas1 = np.array(agent.approximator.predict(
                        np.array([state]), idx=1))
                    sigmas2 = np.array(agent.approximator.predict(
                        np.array([state]), idx=2))
                    print("Means:{}".format(means))
                    print("Sigmas1:{}".format(sigmas1))
                    print("Sigmas2:{}".format(sigmas2))
                print("V:{}".format(evaluate_policy(mdp.p, mdp.r,
                                                    agent.get_policy())))
                input()
        if policy == 'ucb':
            q = agent.approximator
            standard_bound = norm.ppf(1 - delta, loc=0, scale=1)

            def quantile_func(state):
                means = np.array(q.predict(state, idx=0))
                if regret_test:
                    sigmas1 = np.array(q.predict(state, idx=1))
                    sigmas2 = np.array(q.predict(state, idx=2))
                    sigmas = sigmas2
                    # print(sigmas1, sigmas2)
                else:
                    sigmas = np.array(q.predict(state, idx=1))
                out = sigmas * standard_bound + means
                return out

            def mu(state):
                q_list = q.predict(state, idx=0)
                means = np.array(q_list)
                return means

            pi.set_quantile_func(quantile_func)
            pi.set_mu(mu)
        epsilon_train = Parameter(0)
    else:
        raise ValueError()

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    if collect_qs:
        if algorithm not in ['r-max']:
            collect_qs_callback = CollectQs(agent.approximator)
            callbacks += [collect_qs_callback]
    if regret_test:
        callbacks += [collect_vs_callback]
    core = Core(agent, mdp, callbacks)

    train_scores = []
    test_scores = []

    for n_epoch in range(1, max_steps // evaluation_frequency + 1):
        # Train
        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_train)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(False)
        if regret_test:
            collect_vs_callback.on()
        core.learn(n_steps=evaluation_frequency, n_steps_per_fit=1, quiet=True)
        dataset = collect_dataset.get()
        scores = compute_scores(dataset, mdp.info.gamma)
        # print('Train: ', scores)
        train_scores.append(scores)
        collect_dataset.clean()
        mdp.reset()
        if regret_test:
            vs = collect_vs_callback.get_values()
            if not os.path.exists(out_dir):
                os.makedirs(out_dir)
            print("Finished {} steps.".format(n_epoch * evaluation_frequency))
            np.save(out_dir + "/vs_" + algorithm + "_" + str(seed), vs)
            np.save(out_dir + "/scores_online" + str(seed), train_scores)
            collect_vs_callback.off()

        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_test)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(True)
        dataset = core.evaluate(n_steps=test_samples, quiet=True)
        s = mdp.reset()
        scores = compute_scores(dataset, mdp.info.gamma)
        print('Evaluation #%d:%s ' % (n_epoch, scores))
        if debug:
            print("Policy:")
            print(agent.get_policy())
            print("Q")
            for state in range(S):
                means = np.array(agent.approximator.predict(
                    np.array([state]), idx=0))
                sigmas1 = np.array(agent.approximator.predict(
                    np.array([state]), idx=1))
                sigmas2 = np.array(agent.approximator.predict(
                    np.array([state]), idx=2))
                print("Means:{}".format(means))
                print("Sigmas1:{}".format(sigmas1))
                print("Sigmas2:{}".format(sigmas2))
            print("V:{}".format(evaluate_policy(mdp.p, mdp.r,
                                                agent.get_policy())))
            input()
        test_scores.append(scores)
        if regret_test:
            np.save(out_dir + "/scores_offline" + str(seed), test_scores)

    if collect_qs:
        qs = collect_qs_callback.get_values()
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        np.save(out_dir + '/' + file_name, qs)

    return train_scores, test_scores