def experiment(n_epochs, n_episodes):
    np.random.seed()

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 10
    alpha_theta = Parameter(5e-3 / n_tilings)
    alpha_omega = Parameter(0.5 / n_tilings)
    alpha_v = Parameter(0.5 / n_tilings)
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)
    phi = Features(tilings=tilings)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-1 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(policy, mu, mdp.info,
                     alpha_theta, alpha_omega, alpha_v,
                     value_function_features=phi,
                     policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    visualization_callback = Display(agent._V, mu,
                                     mdp.info.observation_space.low,
                                     mdp.info.observation_space.high,
                                     phi, phi)
    core = Core(agent, mdp, callbacks=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes, n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.0)
        dataset_callback.clean()
        visualization_callback()
        print('Mean Reward at iteration ' + str(i) + ': ' +
              str(np.sum(J) / n_steps / n_episodes))

    print('Press a button to visualize the pendulum...')
    input()
    sigma = 1e-8 * np.eye(1)
    policy.set_sigma(sigma)
    core.evaluate(n_steps=n_steps, render=True)
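# Hypothetical driver (not part of the original script): a short run of the
# COPDAC-Q experiment above. The epoch/episode counts are illustrative
# assumptions, and the module-level imports of the original file are assumed
# to be in place.
if __name__ == '__main__':
    experiment(n_epochs=10, n_episodes=100)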
def experiment(mdp, agent_high, agent_low, n_epochs, n_episodes, ep_per_eval,
               ep_per_fit_low, display, print_j, quiet):
    np.random.seed()

    dataset_callback = CollectDataset()

    computational_graph = build_computational_graph(
        mdp, agent_low, agent_high, ep_per_fit_low, [dataset_callback])

    core = HierarchicalCore(computational_graph)

    J_list = list()
    L_list = list()
    J_low_list = list()

    dataset = core.evaluate(n_episodes=ep_per_eval, quiet=quiet)
    J = compute_J(dataset, gamma=mdp.info.gamma)
    J_list.append(np.mean(J))
    L = episodes_length(dataset)
    L_list.append(np.mean(L))

    if print_j:
        print('Reward at start :', J_list[-1])

    for n in range(n_epochs):
        core.learn(n_episodes=n_episodes, skip=True, quiet=quiet)

        ll_dataset = dataset_callback.get()
        dataset_callback.clean()
        J_low = compute_J(ll_dataset, mdp.info.gamma)
        J_low_list.append(np.mean(J_low))

        if print_j:
            print('Low level reward at epoch', n, ':', np.mean(J_low))

        dataset = core.evaluate(n_episodes=ep_per_eval, quiet=quiet)
        J = compute_J(dataset, gamma=mdp.info.gamma)
        J_list.append(np.mean(J))
        L = episodes_length(dataset)
        L_list.append(np.mean(L))

        if print_j:
            print('Reward at epoch ', n, ':', J_list[-1])

    if display:
        core.evaluate(n_episodes=1, render=True)

    return J_list, L_list, J_low_list
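# Hypothetical usage (not in the original file): assumes `mdp`, `agent_high`
# and `agent_low` have already been built by the repository's own
# construction code; all numeric settings below are illustrative.
J_list, L_list, J_low_list = experiment(mdp, agent_high, agent_low,
                                        n_epochs=25, n_episodes=100,
                                        ep_per_eval=50, ep_per_fit_low=10,
                                        display=False, print_j=True,
                                        quiet=True)
np.save('hierarchical_J.npy', np.array(J_list))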
def experiment(alg, params, n_epochs, n_episodes, n_ep_per_fit):
    np.random.seed()

    # MDP
    mdp = Segway()

    # Policy
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    n_weights = approximator.weights_size
    mu = np.zeros(n_weights)
    sigma = 2e-0 * np.ones(n_weights)
    policy = DeterministicPolicy(approximator)
    dist = GaussianDiagonalDistribution(mu, sigma)

    agent = alg(dist, policy, mdp.info, **params)

    # Train
    print(alg.__name__)
    dataset_callback = CollectDataset()
    core = Core(agent, mdp, callbacks=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes,
                   n_episodes_per_fit=n_ep_per_fit, render=False)
        J = compute_J(dataset_callback.get(), gamma=mdp.info.gamma)
        dataset_callback.clean()

        p = dist.get_parameters()

        print('mu: ', p[:n_weights])
        print('sigma: ', p[n_weights:])
        print('Reward at iteration ' + str(i) + ': ' + str(np.mean(J)))

    print('Press a button to visualize the segway...')
    input()
    core.evaluate(n_episodes=3, render=True)
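# Hypothetical driver (not in the original file): runs the black-box
# optimization experiment with distribution-based algorithms. REPS with an
# `eps` constraint and RWR with a `beta` temperature are assumptions about
# what the script imports; swap in whatever algorithms the repository uses.
if __name__ == '__main__':
    for alg, params in [(REPS, {'eps': 0.05}), (RWR, {'beta': 0.01})]:
        experiment(alg, params, n_epochs=25, n_episodes=100, n_ep_per_fit=25)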
def experiment(policy, name, alg_version):
    np.random.seed()

    # MDP
    if name == "Taxi":
        mdp = generate_taxi('../grid.txt')
        max_steps = 100000
        evaluation_frequency = 2000
        test_samples = 10000
    elif name == "NChain-v0":
        mdp = generate_chain(horizon=1000)
        max_steps = 5000
        evaluation_frequency = 100
        test_samples = 10000
    elif name == "Loop":
        mdp = generate_loop(horizon=1000)
        max_steps = 5000
        evaluation_frequency = 100
        test_samples = 10000
    elif name == "SixArms":
        mdp = generate_arms(horizon=1000)
        max_steps = 25000
        evaluation_frequency = 500
        test_samples = 10000
    elif name == "RiverSwim":
        mdp = generate_river(horizon=1000)
        max_steps = 5000
        evaluation_frequency = 100
        test_samples = 10000
    else:
        raise NotImplementedError

    # Policy
    # epsilon = ExponentialDecayParameter(value=1., decay_exp=.5,
    #                                     size=mdp.info.observation_space.size)
    epsilon_train = ExponentialDecayParameter(
        value=1., decay_exp=.5, size=mdp.info.observation_space.size)
    epsilon_test = Parameter(0)
    pi = policy(epsilon=epsilon_train)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1., decay_exp=.2,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = QLearning(pi, mdp.info, **algorithm_params)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks)

    scores = list()
    scores_train = list()

    # Train
    for n_epoch in range(1, max_steps // evaluation_frequency + 1):
        print('- Learning:')
        # learning step
        pi.set_epsilon(epsilon_train)
        core.learn(n_steps=evaluation_frequency, n_steps_per_fit=1,
                   quiet=False)

        dataset = collect_dataset.get()
        if name == "Taxi":
            scores_train.append(get_stats(dataset))
        elif name in ["SixArms"]:
            scores_train.append(compute_scores_Loop(dataset, horizon=500))
        else:
            scores_train.append(compute_scores_Loop(dataset))
        collect_dataset.clean()
        mdp.reset()

        print('- Evaluation:')
        # evaluation step
        pi.set_epsilon(epsilon_test)
        dataset = core.evaluate(n_steps=test_samples, quiet=False)
        mdp.reset()
        scores.append(get_stats(dataset))

    # np.save(env + '/' + alg_version + '_scores.npy', scores)

    return scores_train, scores
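# Hypothetical invocation (not in the original script): `EpsGreedy` is
# mushroom's epsilon-greedy policy class and is assumed to be imported at
# module level; the environment name must match one of the branches above.
if __name__ == '__main__':
    train_scores, test_scores = experiment(EpsGreedy, 'Loop', 'ql')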
def experiment(algorithm, name, update_mode, update_type, policy,
               n_approximators, q_max, q_min, lr_exp, double, file_name,
               out_dir, collect_qs, seed):
    set_global_seeds(seed)
    print('Using seed %s' % seed)

    # MDP
    if name == 'Taxi':
        mdp = generate_taxi('../../grid.txt', horizon=5000)
        max_steps = 500000
        evaluation_frequency = 5000
        test_samples = 5000
    elif name == 'Chain':
        mdp = generate_chain(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'Loop':
        mdp = generate_loop(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'RiverSwim':
        mdp = generate_river(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'SixArms':
        mdp = generate_arms(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'KnightQuest':
        mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    else:
        raise NotImplementedError

    epsilon_test = Parameter(0)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1., decay_exp=lr_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)

    if algorithm == 'ql':
        if policy not in ['boltzmann', 'eps-greedy']:
            warnings.warn(
                'QL available with only boltzmann and eps-greedy policies!')
            policy = 'eps-greedy'
        if policy == 'eps-greedy':
            epsilon_train = ExponentialDecayParameter(
                value=1., decay_exp=.5, size=mdp.info.observation_space.size)
            pi = policy_dict[policy](epsilon=epsilon_train)
        else:
            beta_train = ExponentialDecayParameter(
                value=1.5 * q_max, decay_exp=.5,
                size=mdp.info.observation_space.size)
            pi = policy_dict[policy](beta=beta_train)
        if double:
            agent = DoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = QLearning(pi, mdp.info, **algorithm_params)
    elif algorithm == 'boot-ql':
        if policy not in ['boot', 'weighted']:
            warnings.warn(
                'Bootstrapped QL available with only boot and weighted policies!')
            policy = 'boot'
        pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                mu=(q_max + q_min) / 2,
                                sigma=q_max - q_min,
                                **algorithm_params)
        if double:
            agent = BootstrappedDoubleQLearning(pi, mdp.info,
                                                **algorithm_params)
        else:
            agent = BootstrappedQLearning(pi, mdp.info, **algorithm_params)
        epsilon_train = Parameter(0)
    elif algorithm == 'particle-ql':
        if policy not in ['weighted', 'vpi']:
            warnings.warn(
                'Particle QL available with only vpi and weighted policies!')
            policy = 'weighted'
        pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                update_mode=update_mode,
                                update_type=update_type,
                                q_max=q_max,
                                q_min=q_min,
                                **algorithm_params)
        if double:
            agent = ParticleDoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = ParticleQLearning(pi, mdp.info, **algorithm_params)
        epsilon_train = Parameter(0)
    else:
        raise ValueError()

    # Algorithm
    collect_dataset = CollectDataset()
    collect_qs_callback = CollectQs(agent.approximator)
    callbacks = [collect_dataset]
    if collect_qs:
        callbacks += [collect_qs_callback]
    core = Core(agent, mdp, callbacks)

    train_scores = []
    test_scores = []

    for n_epoch in range(1, max_steps // evaluation_frequency + 1):
        # Train
        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_train)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(False)
        core.learn(n_steps=evaluation_frequency, n_steps_per_fit=1, quiet=True)
        dataset = collect_dataset.get()
        scores = compute_scores(dataset, mdp.info.gamma)
        # print('Train: ', scores)
        train_scores.append(scores)

        collect_dataset.clean()
        mdp.reset()

        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_test)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(True)
        dataset = core.evaluate(n_steps=test_samples, quiet=True)
        mdp.reset()
        scores = compute_scores(dataset, mdp.info.gamma)
        # print('Evaluation: ', scores)
        test_scores.append(scores)

    if collect_qs:
        qs = collect_qs_callback.get_values()
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        np.save(out_dir + '/' + file_name, qs)

    return train_scores, test_scores
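# Hypothetical multi-seed driver (not in the original file). The
# update_mode/update_type strings and the Q-value range are placeholders:
# use whatever values the repository's argument parser actually exposes.
if __name__ == '__main__':
    results = []
    for run_seed in range(5):
        results.append(experiment(algorithm='particle-ql', name='RiverSwim',
                                  update_mode='deterministic',
                                  update_type='weighted', policy='weighted',
                                  n_approximators=20, q_max=500., q_min=0.,
                                  lr_exp=0.2, double=False,
                                  file_name='qs_%d.npy' % run_seed,
                                  out_dir='./logs', collect_qs=False,
                                  seed=run_seed))
    np.save('particle_ql_riverswim.npy', results)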
def experiment(n_epochs, n_episodes):
    np.random.seed()

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 11
    alpha_r = Parameter(.0001)
    alpha_theta = Parameter(.001 / n_tilings)
    alpha_v = Parameter(.1 / n_tilings)
    tilings = Tiles.generate(n_tilings - 1, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)
    phi = Features(tilings=tilings)

    tilings_v = tilings + Tiles.generate(1, [1, 1],
                                         mdp.info.observation_space.low,
                                         mdp.info.observation_space.high + 1e-3)
    psi = Features(tilings=tilings_v)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    std = Regressor(LinearApproximator, input_shape=input_shape,
                    output_shape=mdp.info.action_space.shape)

    std_0 = np.sqrt(1.)
    std.set_weights(np.log(std_0) / n_tilings * np.ones(std.weights_size))

    policy = StateLogStdGaussianPolicy(mu, std)

    agent = SAC_AVG(policy, mdp.info,
                    alpha_theta, alpha_v, alpha_r,
                    lambda_par=.5,
                    value_function_features=psi,
                    policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    display_callback = Display(agent._V, mu, std,
                               mdp.info.observation_space.low,
                               mdp.info.observation_space.high,
                               phi, psi)
    core = Core(agent, mdp, callbacks=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes, n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.)
        dataset_callback.clean()
        display_callback()
        print('Mean Reward at iteration ' + str(i) + ': ' +
              str(np.sum(J) / n_steps / n_episodes))

    print('Press a button to visualize the pendulum...')
    input()
    core.evaluate(n_steps=n_steps, render=True)
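# Hypothetical driver (not part of the original script): same interface as
# the COPDAC-Q experiment, here for the SAC_AVG actor-critic variant. The
# counts below are illustrative assumptions.
if __name__ == '__main__':
    experiment(n_epochs=20, n_episodes=100)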
def experiment(algorithm, name, update_mode, update_type, policy,
               n_approximators, q_max, q_min, lr_exp, file_name, out_dir,
               particles, R=1, m=1, collect_qs=False, seed=0):
    set_global_seeds(seed)
    print('Using seed %s' % seed)

    # MDP
    if name == 'Taxi':
        mdp = generate_taxi('../../grid.txt', horizon=5000)
        max_steps = 500000
        evaluation_frequency = 5000
        test_samples = 5000
    elif name == 'Chain':
        mdp = generate_chain(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'Loop':
        mdp = generate_loop(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'RiverSwim':
        mdp = generate_river(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'SixArms':
        mdp = generate_arms(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'KnightQuest':
        mdp = None
        try:
            mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        except:
            register(
                id='KnightQuest-v0',
                entry_point='envs.knight_quest:KnightQuest',
            )
            mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    else:
        raise NotImplementedError

    epsilon_test = Parameter(0)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1., decay_exp=lr_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)

    if algorithm == 'particle-ql':
        delta = 0.1
        if policy not in ['weighted', 'vpi', 'ucb']:
            warnings.warn(
                'Particle QL available with only vpi and weighted policies!')
            policy = 'weighted'
        if policy == 'ucb':
            pi = UCBPolicy(delta=delta, q_max=R / (1 - mdp.info.gamma))
        else:
            pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                update_mode=update_mode,
                                update_type=update_type,
                                q_max=q_max,
                                q_min=q_min,
                                delta=delta,
                                init_values=particles,
                                **algorithm_params)
        agent = ParticleQLearning(pi, mdp.info, **algorithm_params)
        if policy == 'ucb':
            q = agent.approximator
            quantiles = [i * 1. / (n_approximators - 1)
                         for i in range(n_approximators)]
            for p in range(n_approximators):
                if quantiles[p] >= 1 - delta:
                    particle_bound = p
                    break

            def quantile_func(state, quantile):
                q_list = list()
                for i in range(n_approximators):
                    q_list.append(q.predict(state, idx=i))
                qs = np.array(q_list)
                out = np.zeros(qs.shape[1])
                out[:] = qs[particle_bound, :]
                return out

            def mu(state):
                q_list = list()
                for i in range(n_approximators):
                    q_list.append(q.predict(state, idx=i))
                qs = np.array(q_list)
                return np.mean(qs, axis=0)

            pi.set_quantile_func(quantile_func)
            pi.set_mu(mu)
        epsilon_train = Parameter(0)
    elif algorithm == 'delayed-ql':
        algorithm_params = dict(R=R,
                                m=m,
                                **algorithm_params)
        agent = DelayedQLearning(mdp.info, **algorithm_params)
        pi = agent

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    if collect_qs:
        collect_qs_callback = CollectQs(agent.approximator)
        callbacks += [collect_qs_callback]
    core = Core(agent, mdp, callbacks)

    train_scores = []
    test_scores = []

    for n_epoch in range(1, max_steps // evaluation_frequency + 1):
        # Train
        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_train)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(False)
        core.learn(n_steps=evaluation_frequency, n_steps_per_fit=1, quiet=True)
        dataset = collect_dataset.get()
        scores = compute_scores(dataset, mdp.info.gamma)
        # print('Train: ', scores)
        train_scores.append(scores)

        collect_dataset.clean()
        mdp.reset()

        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_test)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(True)
        dataset = core.evaluate(n_steps=test_samples, quiet=True)
        mdp.reset()
        scores = compute_scores(dataset, mdp.info.gamma)
        # print('Evaluation: ', scores)
        test_scores.append(scores)

    if collect_qs:
        qs = collect_qs_callback.get_values()
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        np.save(out_dir + '/' + file_name, qs)

    return train_scores, test_scores
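# Hypothetical call (not in the original file): `particles` is forwarded as
# `init_values` to ParticleQLearning, so one simple choice is an evenly
# spaced grid of initial Q-values between q_min and q_max. Whether the
# algorithm accepts an array here is an assumption, and all other values are
# illustrative placeholders.
if __name__ == '__main__':
    q_min, q_max, n_approximators = 0., 500., 20
    particles = np.linspace(q_min, q_max, n_approximators)
    experiment(algorithm='particle-ql', name='Chain',
               update_mode='deterministic', update_type='weighted',
               policy='ucb', n_approximators=n_approximators,
               q_max=q_max, q_min=q_min, lr_exp=0.2,
               file_name='qs.npy', out_dir='./logs',
               particles=particles, R=1, m=1, collect_qs=False, seed=0)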
def experiment(algorithm, name, update_mode, update_type, policy,
               n_approximators, q_max, q_min, lr_exp, R, log_lr, r_max_m,
               delayed_m, delayed_epsilon, delta, debug, double, regret_test,
               a, b, mbie_C, value_iterations, tolerance, file_name, out_dir,
               collect_qs, seed):
    set_global_seeds(seed)
    print('Using seed %s' % seed)

    # MDP
    if name == 'Taxi':
        mdp = generate_taxi('../grid.txt', horizon=5000, gamma=0.99)
        max_steps = 500000
        evaluation_frequency = 5000
        test_samples = 5000
    elif name == 'Chain':
        mdp = generate_chain(horizon=100, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'Gridworld':
        mdp = generate_gridworld(horizon=100, gamma=0.99)
        max_steps = 500000
        evaluation_frequency = 5000
        test_samples = 1000
    elif name == 'Loop':
        mdp = generate_loop(horizon=100, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'RiverSwim':
        mdp = generate_river(horizon=100, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
        mbie_C = 0.4
    elif name == 'SixArms':
        mdp = generate_arms(horizon=100, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
        mbie_C = 0.8
    elif name == 'ThreeArms':
        horizon = 100
        mdp = generate_three_arms(horizon=horizon, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'KnightQuest':
        mdp = None
        try:
            mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        except:
            register(
                id='KnightQuest-v0',
                entry_point='envs.knight_quest:KnightQuest',
            )
            mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    else:
        raise NotImplementedError

    epsilon_test = Parameter(0)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1., decay_exp=lr_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)

    if regret_test:
        max_steps = int(args.max_steps_regret * 1e6)
        evaluation_frequency = max_steps // 100
        test_samples = 1000
        if debug:
            max_steps = 100000
            evaluation_frequency = max_steps // 100
            test_samples = 1000

    if algorithm == 'ql':
        if policy not in ['boltzmann', 'eps-greedy']:
            warnings.warn(
                'QL available with only boltzmann and eps-greedy policies!')
            policy = 'eps-greedy'
        if policy == 'eps-greedy':
            epsilon_train = ExponentialDecayParameter(
                value=1., decay_exp=lr_exp,
                size=mdp.info.observation_space.size)
            pi = policy_dict[policy](epsilon=epsilon_train)
        else:
            beta_train = ExponentialDecayParameter(
                value=1.5 * q_max, decay_exp=.5,
                size=mdp.info.observation_space.size)
            pi = policy_dict[policy](beta=beta_train)
        if double:
            agent = DoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = QLearning(pi, mdp.info, **algorithm_params)
    elif algorithm == 'boot-ql':
        if policy not in ['boot', 'weighted']:
            warnings.warn(
                'Bootstrapped QL available with only boot and weighted policies!')
            policy = 'boot'
        pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                mu=(q_max + q_min) / 2,
                                sigma=(q_max - q_min) / 2,
                                **algorithm_params)
        if double:
            agent = BootstrappedDoubleQLearning(pi, mdp.info,
                                                **algorithm_params)
        else:
            agent = BootstrappedQLearning(pi, mdp.info, **algorithm_params)
        epsilon_train = Parameter(0)
    elif algorithm == 'particle-ql':
        if policy not in ['weighted', 'ucb']:
            warnings.warn(
                'Particle QL available with only ucb and weighted policies!')
            policy = 'weighted'
        if policy == 'ucb':
            pi = UCBPolicy(delta=delta, q_max=R / (1 - mdp.info.gamma))
        else:
            pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                update_mode=update_mode,
                                update_type=update_type,
                                q_max=q_max,
                                q_min=q_min,
                                delta=delta,
                                **algorithm_params)
        if double:
            agent = ParticleDoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = ParticleQLearning(pi, mdp.info, **algorithm_params)
        epsilon_train = Parameter(0)
    elif algorithm == 'r-max':
        thr_1 = int(np.ceil((4 * mdp.info.size[0] * 1.0
                             / (1 - mdp.info.gamma) * R) ** 3))
        algorithm_params = dict(
            rmax=R,
            s_a_threshold=r_max_m
        )
        agent = RMaxAgent(mdp.info, **algorithm_params)
        pi = agent
        epsilon_train = Parameter(0)
    elif algorithm == 'mbie':
        algorithm_params = dict(
            rmax=R,
            C=mbie_C,
            value_iterations=value_iterations,
            tolerance=tolerance
        )
        agent = MBIE_EB(mdp.info, **algorithm_params)
        pi = agent
        epsilon_train = Parameter(0)
    elif algorithm == 'delayed-ql':
        theoretic_m = delayed_m
        if regret_test:
            gamma = mdp.info.gamma
            Vmax = R / (1 - gamma)
            epsilon = args.delayed_ratio * Vmax
            delayed_epsilon = epsilon * (1 - gamma)
            delta = 0.1
            S, A = mdp.info.size
            theoretic_m = ((1 + gamma * Vmax) ** 2 / (2 * delayed_epsilon ** 2)
                           * np.log(3 * S * A / delta
                                    * (1 + S * A
                                       / (delayed_epsilon * (1 - gamma)))))
            if debug:
                print("Delta:{}".format(delta))
                print("R:{}".format(R))
                print("Vmax:{}".format(Vmax))
                print("Gamma:{}".format(mdp.info.gamma))
                print("Epsilon:{}".format(epsilon))
                # print("k:{}".format(k))
                print("m:{}".format(theoretic_m))
                print("S:{}".format(S))
                print("A:{}".format(A))
                input()

            def evaluate_policy(P, R, policy):
                P_pi = np.zeros((S, S))
                R_pi = np.zeros(S)
                for s in range(S):
                    for s1 in range(S):
                        P_pi[s, s1] = np.sum(policy[s, :] * P[s, :, s1])
                    R_pi[s] = np.sum(policy[s, :]
                                     * np.sum(P[s, :, :] * R[s, :, :],
                                              axis=-1))
                I = np.diag(np.ones(S))
                V = np.linalg.solve(I - gamma * P_pi, R_pi)
                return V

        algorithm_params = dict(
            R=R,
            m=theoretic_m,
            delta=delta,
            epsilon=delayed_epsilon,
            **algorithm_params)
        agent = DelayedQLearning(mdp.info, **algorithm_params)
        if regret_test:
            collect_vs_callback = CollectVs(mdp, agent, evaluate_policy,
                                            args.freq_collection)
            if debug:
                print("Q:")
                print(agent.get_approximator()[:, :])
                print("Policy:")
                print(agent.get_policy())
                print("V:{}".format(
                    evaluate_policy(mdp.p, mdp.r, agent.get_policy())))
                input()
        pi = agent
        epsilon_train = Parameter(0)
    elif algorithm == 'gaussian-ql':
        if policy not in ['weighted-gaussian', 'ucb']:
            warnings.warn(
                'Particle QL available with only ucb and weighted policies!')
            policy = 'weighted-gaussian'
        if policy == 'ucb':
            pi = UCBPolicy(delta=delta, q_max=R / (1 - mdp.info.gamma))
        else:
            pi = policy_dict[policy]()
        q_0 = (q_max - q_min) / 2
        sigma_0 = (q_max - q_min) / np.sqrt(12)
        C = 2 * R / (np.sqrt(2 * np.pi) * (1 - mdp.info.gamma) * sigma_0)
        sigma_lr = None
        if log_lr:
            sigma_lr = LogarithmicDecayParameter(value=1., C=C,
                                                 size=mdp.info.size)
        init_values = (q_0, sigma_0)
        if regret_test:
            sigma_lr = None
            gamma = mdp.info.gamma
            T = max_steps
            S, A = mdp.info.size
            a = (2 + gamma) / (2 * (1 - gamma))
            b = a - 1
            c = 1
            d = b
            q_max = R / (1 - gamma)
            standard_bound = norm.ppf(1 - delta, loc=0, scale=1)
            # first_fac = np.sqrt(b + T)
            # second_fac = np.sqrt(a * np.log(S*A*T / delta))
            # sigma2_factor = min(np.sqrt(b + T),
            #                     np.sqrt(a * np.log(S*A*T / delta)))
            q_0 = q_max
            sigma1_0 = 0
            # sigma2_0 = (R + gamma * q_max) / (standard_bound * np.sqrt(c-1)) * sigma2_factor
            sigma2_0 = ((gamma * q_max) / (c * standard_bound)
                        * np.sqrt(a * np.log(S * A * T / delta)))
            init_values = (q_0, sigma1_0, sigma2_0)
            learning_rate = TheoreticalParameter(a=a, b=b, decay_exp=1,
                                                 size=mdp.info.size)
            learning_rate_sigma1 = TheoreticalParameter(a=a, b=b, decay_exp=1,
                                                        size=mdp.info.size)
            algorithm_params = dict(
                learning_rate=learning_rate,
                sigma_1_learning_rate=learning_rate_sigma1)
            sigma_lr = BetaParameter(c=c, d=d, size=mdp.info.size)

            def evaluate_policy(P, R, policy):
                P_pi = np.zeros((S, S))
                R_pi = np.zeros(S)
                for s in range(S):
                    for s1 in range(S):
                        P_pi[s, s1] = np.sum(policy[s, :] * P[s, :, s1])
                    R_pi[s] = np.sum(policy[s, :]
                                     * np.sum(P[s, :, :] * R[s, :, :],
                                              axis=-1))
                I = np.diag(np.ones(S))
                V = np.linalg.solve(I - gamma * P_pi, R_pi)
                return V

            if debug:
                print("Delta:{}".format(delta))
                print("R:{}".format(R))
                print("Gamma:{}".format(mdp.info.gamma))
                print("mu0:{}".format(q_0))
                print("Sigma1_0:{}".format(sigma1_0))
                print("Sigma2_0:{}".format(sigma2_0))
                print("a:{}".format(a))
                print("b:{}".format(b))
                print("c:{}".format(c))
                print("d:{}".format(d))
                print("T:{}".format(T))
                print("S:{}".format(S))
                print("A:{}".format(A))
                input()

        algorithm_params = dict(update_mode=update_mode,
                                update_type=update_type,
                                sigma_learning_rate=sigma_lr,
                                init_values=init_values,
                                delta=delta,
                                q_max=q_max,
                                **algorithm_params)
        if double and not regret_test:
            agent = GaussianDoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = GaussianQLearning(pi, mdp.info, **algorithm_params)

        if regret_test:
            if debug:
                freq = 10
            else:
                freq = args.freq_collection
            collect_vs_callback = CollectVs(mdp, agent, evaluate_policy, freq)
            if debug:
                print("Policy:")
                print(agent.get_policy())
                print("Q")
                for state in range(S):
                    means = np.array(
                        agent.approximator.predict(np.array([state]), idx=0))
                    sigmas1 = np.array(
                        agent.approximator.predict(np.array([state]), idx=1))
                    sigmas2 = np.array(
                        agent.approximator.predict(np.array([state]), idx=2))
                    print("Means:{}".format(means))
                    print("Sigmas1:{}".format(sigmas1))
                    print("Sigmas2:{}".format(sigmas2))
                print("V:{}".format(
                    evaluate_policy(mdp.p, mdp.r, agent.get_policy())))
                input()

        if policy == 'ucb':
            q = agent.approximator
            standard_bound = norm.ppf(1 - delta, loc=0, scale=1)

            def quantile_func(state):
                means = np.array(q.predict(state, idx=0))
                if regret_test:
                    sigmas1 = np.array(q.predict(state, idx=1))
                    sigmas2 = np.array(q.predict(state, idx=2))
                    sigmas = sigmas2
                    # print(sigmas1, sigmas2)
                else:
                    sigmas = np.array(q.predict(state, idx=1))
                out = sigmas * standard_bound + means
                return out

            def mu(state):
                q_list = q.predict(state, idx=0)
                means = np.array(q_list)
                return means

            pi.set_quantile_func(quantile_func)
            pi.set_mu(mu)
        epsilon_train = Parameter(0)
    else:
        raise ValueError()

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    if collect_qs:
        if algorithm not in ['r-max']:
            collect_qs_callback = CollectQs(agent.approximator)
            callbacks += [collect_qs_callback]
    if regret_test:
        callbacks += [collect_vs_callback]
    core = Core(agent, mdp, callbacks)

    train_scores = []
    test_scores = []

    for n_epoch in range(1, max_steps // evaluation_frequency + 1):
        # Train
        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_train)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(False)
        if regret_test:
            collect_vs_callback.on()
        core.learn(n_steps=evaluation_frequency, n_steps_per_fit=1, quiet=True)
        dataset = collect_dataset.get()
        scores = compute_scores(dataset, mdp.info.gamma)
        # print('Train: ', scores)
        train_scores.append(scores)

        collect_dataset.clean()
        mdp.reset()

        if regret_test:
            vs = collect_vs_callback.get_values()
            if not os.path.exists(out_dir):
                os.makedirs(out_dir)
            print("Finished {} steps.".format(n_epoch * evaluation_frequency))
            np.save(out_dir + "/vs_" + algorithm + "_" + str(seed), vs)
            np.save(out_dir + "/scores_online" + str(seed), train_scores)
            collect_vs_callback.off()

        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_test)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(True)
        dataset = core.evaluate(n_steps=test_samples, quiet=True)
        s = mdp.reset()
        scores = compute_scores(dataset, mdp.info.gamma)
        print('Evaluation #%d:%s ' % (n_epoch, scores))
        if debug:
            print("Policy:")
            print(agent.get_policy())
            print("Q")
            for state in range(S):
                means = np.array(
                    agent.approximator.predict(np.array([state]), idx=0))
                sigmas1 = np.array(
                    agent.approximator.predict(np.array([state]), idx=1))
                sigmas2 = np.array(
                    agent.approximator.predict(np.array([state]), idx=2))
                print("Means:{}".format(means))
                print("Sigmas1:{}".format(sigmas1))
                print("Sigmas2:{}".format(sigmas2))
            print("V:{}".format(
                evaluate_policy(mdp.p, mdp.r, agent.get_policy())))
            input()
        test_scores.append(scores)
        if regret_test:
            np.save(out_dir + "/scores_offline" + str(seed), test_scores)

    if collect_qs:
        qs = collect_qs_callback.get_values()
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        np.save(out_dir + '/' + file_name, qs)
    return train_scores, test_scores
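# Hypothetical driver (not in the original file): a single gaussian-ql run
# with regret_test disabled, since the regret branch reads the module-level
# `args` produced by the script's argument parser. All numeric values are
# illustrative placeholders.
if __name__ == '__main__':
    experiment(algorithm='gaussian-ql', name='RiverSwim',
               update_mode='deterministic', update_type='weighted',
               policy='weighted-gaussian', n_approximators=2,
               q_max=500., q_min=0., lr_exp=0.2, R=1., log_lr=False,
               r_max_m=100, delayed_m=100, delayed_epsilon=0.1, delta=0.1,
               debug=False, double=False, regret_test=False,
               a=1., b=1., mbie_C=0.5, value_iterations=100, tolerance=1e-3,
               file_name='qs.npy', out_dir='./logs', collect_qs=False,
               seed=0)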