def experiment(policy, value):
    """Run a short, seeded SARSA training run on the test taxi grid.

    Returns the average per-step reward collected while learning.
    """
    # Fixed seed so this run is reproducible (test fixture).
    np.random.seed(45)

    # Environment: taxi MDP built from the bundled test grid.
    mdp = generate_taxi('tests/taxi/grid.txt', rew=(0, 1, 5))

    # Exploration policy built from the supplied parameter value.
    pi = policy(Parameter(value=value))

    # SARSA agent with a constant learning rate; params are passed as a
    # nested dict ({'algorithm_params': ..., 'fit_params': ...}).
    lr = Parameter(value=.15)
    agent_params = {
        'algorithm_params': dict(learning_rate=lr),
        'fit_params': dict()
    }
    agent = SARSA(pi, mdp.info, agent_params)

    # Record every transition generated during learning.
    dataset_cb = CollectDataset()
    core = Core(agent, mdp, [dataset_cb])

    # Learn online, fitting after every single step.
    n_steps = 2000
    core.learn(n_steps=n_steps, n_steps_per_fit=1, quiet=True)

    # Column 2 of each collected sample is the reward; average it.
    rewards = np.array(dataset_cb.get())[:, 2]
    return np.sum(rewards) / float(n_steps)
def experiment(policy, value):
    """Train SARSA on the taxi grid with the given policy class and
    parameter value; return the mean reward per learning step."""
    # Unseeded: each invocation draws a fresh random stream.
    np.random.seed()

    # Environment
    mdp = generate_taxi('grid.txt')

    # Exploration policy parameterized by `value`.
    pi = policy(Parameter(value=value))

    # SARSA agent with a constant step size.
    agent = SARSA(pi, mdp.info, learning_rate=Parameter(value=.15))

    # Callback that stores every transition seen while learning.
    dataset_cb = CollectDataset()
    core = Core(agent, mdp, [dataset_cb])

    # Learn online, fitting after every step.
    n_steps = 300000
    core.learn(n_steps=n_steps, n_steps_per_fit=1, quiet=True)

    # Average the reward column (index 2) over the whole run.
    return np.sum(np.array(dataset_cb.get())[:, 2]) / float(n_steps)
def experiment(policy, name, alg_version):
    """Alternate Q-Learning training and greedy-evaluation epochs on a
    tabular MDP chosen by `name`.

    Returns (scores_train, scores): per-epoch statistics gathered during
    learning and during evaluation, respectively.
    """
    np.random.seed()

    # Per-task setup: (mdp factory, max_steps, evaluation_frequency,
    # test_samples).
    setups = {
        "Taxi": (lambda: generate_taxi('../grid.txt'), 100000, 2000, 10000),
        "NChain-v0": (lambda: generate_chain(horizon=1000), 5000, 100, 10000),
        "Loop": (lambda: generate_loop(horizon=1000), 5000, 100, 10000),
        "SixArms": (lambda: generate_arms(horizon=1000), 25000, 500, 10000),
        "RiverSwim": (lambda: generate_river(horizon=1000), 5000, 100, 10000),
    }
    if name not in setups:
        raise NotImplementedError
    make_mdp, max_steps, evaluation_frequency, test_samples = setups[name]
    mdp = make_mdp()

    # Exploration: decaying epsilon while training, fully greedy at test.
    epsilon_train = ExponentialDecayParameter(
        value=1., decay_exp=.5, size=mdp.info.observation_space.size)
    epsilon_test = Parameter(0)
    pi = policy(epsilon=epsilon_train)

    # Q-Learning agent with a per-(state, action) decaying learning rate.
    learning_rate = ExponentialDecayParameter(value=1., decay_exp=.2,
                                              size=mdp.info.size)
    agent = QLearning(pi, mdp.info, learning_rate=learning_rate)

    # Core with a callback that accumulates the training transitions.
    collect_dataset = CollectDataset()
    core = Core(agent, mdp, [collect_dataset])

    scores = list()
    scores_train = list()

    for _ in range(max_steps // evaluation_frequency):
        print('- Learning:')
        # Learning step: explore and fit after every transition.
        pi.set_epsilon(epsilon_train)
        core.learn(n_steps=evaluation_frequency, n_steps_per_fit=1,
                   quiet=False)
        dataset = collect_dataset.get()
        # Taxi uses the generic stats helper; SixArms gets a shorter
        # scoring horizon than the remaining loop-style tasks.
        if name == "Taxi":
            scores_train.append(get_stats(dataset))
        elif name in ["SixArms"]:
            scores_train.append(compute_scores_Loop(dataset, horizon=500))
        else:
            scores_train.append(compute_scores_Loop(dataset))
        collect_dataset.clean()
        mdp.reset()

        print('- Evaluation:')
        # Evaluation step: act greedily for a fixed number of steps.
        pi.set_epsilon(epsilon_test)
        dataset = core.evaluate(n_steps=test_samples, quiet=False)
        mdp.reset()
        scores.append(get_stats(dataset))
        # np.save(env + '/' + alg_version + '_scores.npy', scores)

    return scores_train, scores
def experiment(args, agent_algorithm):
    """Run a DQN-family experiment driven entirely by CLI `args`.

    Two modes:
      * args.load_path and args.evaluation set -> load a saved model and
        evaluate it once;
      * otherwise -> full learning run with periodic evaluation, saving
        per-epoch scores under a timestamped log folder.

    Returns the list of per-epoch evaluation statistics.
    """
    np.random.seed()

    scores = list()
    # add timestamp to results
    ts = str(time.time())

    # Evaluation of the model provided by the user.
    if args.load_path and args.evaluation:
        # MDP: Gym env by default, or one of the two custom tabular tasks.
        if args.name not in ['Taxi', 'Gridworld']:
            mdp = Gym(args.name, args.horizon, args.gamma)
            n_states = None
            gamma_eval = 1.
        elif args.name == 'Taxi':
            mdp = generate_taxi('../../grid.txt')
            n_states = mdp.info.observation_space.size[0]
            gamma_eval = mdp.info.gamma
        else:
            rew_weights = [args.fast_zone, args.slow_zone, args.goal]
            grid_size = args.grid_size
            env = GridWorld(gamma=args.gamma, rew_weights=rew_weights,
                            shape=(grid_size, grid_size),
                            randomized_initial=args.rand_initial,
                            horizon=args.horizon)
            gamma_eval = args.gamma
            mdp = env.generate_mdp()
            n_states = mdp.info.observation_space.size[0]

        # Policy (default; may be replaced below depending on args.alg).
        epsilon_test = Parameter(value=args.test_exploration_rate)
        pi = BootPolicy(args.n_approximators, epsilon=epsilon_test)

        # Approximator configuration; the network is loaded from
        # args.load_path rather than trained.
        input_shape = mdp.info.observation_space.shape + (1,)
        input_preprocessor = list()
        approximator_params = dict(
            input_shape=input_shape,
            output_shape=(mdp.info.action_space.n,),
            n_states=n_states,
            n_actions=mdp.info.action_space.n,
            n_features=args.n_features,
            n_approximators=args.n_approximators,
            input_preprocessor=input_preprocessor,
            name='test',
            load_path=args.load_path,
            net_type=args.net_type,
            optimizer={
                'name': args.optimizer,
                'lr': args.learning_rate,
                'lr_sigma': args.learning_rate,
                'decay': args.decay,
                'epsilon': args.epsilon
            })

        approximator = SimpleNet

        # Agent: replay/target machinery disabled for pure evaluation.
        algorithm_params = dict(batch_size=0,
                                initial_replay_size=0,
                                max_replay_size=0,
                                clip_reward=False,
                                target_update_frequency=1)
        # Select the evaluation policy matching the trained algorithm.
        if args.alg == 'boot':
            algorithm_params['p_mask'] = args.p_mask
            pi = BootPolicy(args.n_approximators, epsilon=epsilon_test)
        elif args.alg == 'gaussian':
            if args.ucb:
                pi = UCBPolicy(delta=args.delta,
                               q_max=1. / (1. - args.gamma))
            else:
                pi = WeightedGaussianPolicy(epsilon=epsilon_test)
        elif args.alg == 'dqn':
            pi = EpsGreedy(epsilon=epsilon_test)
        elif args.alg == 'particle':
            if args.ucb:
                pi = UCBPolicy(delta=args.delta,
                               q_max=1. / (1. - args.gamma))
            else:
                pi = WeightedPolicy(args.n_approximators,
                                    epsilon=epsilon_test)
        else:
            raise ValueError("Algorithm uknown")
        # Distributional variants need extra update/clipping parameters.
        if args.alg in ['gaussian', 'particle']:
            algorithm_params['update_type'] = args.update_type
            algorithm_params['delta'] = args.delta
            algorithm_params['store_prob'] = args.store_prob
            if args.clip_target:
                algorithm_params['max_spread'] = args.q_max - args.q_min
            approximator_params['q_min'] = args.q_min
            approximator_params['q_max'] = args.q_max
            approximator_params['loss'] = args.loss
            approximator_params['init_type'] = args.init_type
            approximator_params['sigma_weight'] = args.sigma_weight
        if args.alg in ['particle', 'boot']:
            approximator_params['n_approximators'] = args.n_approximators
            algorithm_params['n_approximators'] = args.n_approximators
        agent = agent_algorithm(approximator, pi, mdp.info,
                                approximator_params=approximator_params,
                                **algorithm_params)

        # Algorithm
        core_test = Core(agent, mdp)

        # Evaluate model
        pi.set_eval(True)
        dataset = core_test.evaluate(n_steps=args.test_samples,
                                     render=args.render,
                                     quiet=args.quiet)
        get_stats(dataset)
    else:
        # DQN learning run
        print("Learning Run")

        # Settings: tiny sizes in debug mode for a fast smoke test.
        if args.debug:
            initial_replay_size = 50
            max_replay_size = 500
            train_frequency = 5
            target_update_frequency = 10
            test_samples = 20
            evaluation_frequency = 50
            max_steps = 1000
        else:
            initial_replay_size = args.initial_replay_size
            max_replay_size = args.max_replay_size
            train_frequency = args.train_frequency
            target_update_frequency = args.target_update_frequency
            test_samples = args.test_samples
            evaluation_frequency = args.evaluation_frequency
            max_steps = args.max_steps

        # MDP (same selection logic as the evaluation branch above).
        if args.name not in ['Taxi', 'Gridworld']:
            mdp = Gym(args.name, args.horizon, args.gamma)
            n_states = None
            gamma_eval = 1.
        elif args.name == 'Taxi':
            mdp = generate_taxi('../../grid.txt')
            n_states = mdp.info.observation_space.size[0]
            gamma_eval = mdp.info.gamma
        else:
            rew_weights = [args.fast_zone, args.slow_zone, args.goal]
            grid_size = args.grid_size
            env = GridWorld(gamma=args.gamma, rew_weights=rew_weights,
                            shape=(grid_size, grid_size),
                            randomized_initial=args.rand_initial,
                            horizon=args.horizon)
            mdp = env.generate_mdp()
            n_states = mdp.info.observation_space.size[0]
            print(mdp.info.gamma)
            gamma_eval = args.gamma

        # Policy: linearly-annealed epsilon for training, fixed for test.
        epsilon = LinearDecayParameter(value=args.initial_exploration_rate,
                                       min_value=args.final_exploration_rate,
                                       n=args.final_exploration_frame)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        epsilon_random = Parameter(value=1.)

        # policy_name / update_rule feed only into the log folder path.
        policy_name = 'weighted'
        update_rule = args.update_type + "_update"
        if args.alg == 'boot':
            pi = BootPolicy(args.n_approximators, epsilon=epsilon)
            policy_name = 'boot'
            update_rule = 'boot'
        elif args.alg == 'dqn':
            pi = EpsGreedy(epsilon=epsilon)
            policy_name = 'eps_greedy'
            update_rule = 'td'
        elif args.alg == 'particle':
            if args.ucb:
                policy_name = 'ucb'
                pi = UCBPolicy(delta=args.delta,
                               q_max=1. / (1. - args.gamma))
            else:
                pi = WeightedPolicy(args.n_approximators)
        elif args.alg == 'gaussian':
            if args.ucb:
                policy_name = 'ucb'
                pi = UCBPolicy(delta=args.delta,
                               q_max=1. / (1. - args.gamma))
            else:
                pi = WeightedGaussianPolicy()
        else:
            raise ValueError("Algorithm unknown")

        # Summary folder
        folder_name = './logs/' + args.alg + "/" + policy_name + '/' + update_rule + '/' + args.name + "/" + args.loss + "/" + str(
            args.n_approximators
        ) + "_particles" + "/" + args.init_type + "_init" + "/" + str(
            args.learning_rate) + "/" + ts

        # Approximator
        input_shape = mdp.info.observation_space.shape
        input_preprocessor = list()
        approximator_params = dict(
            input_shape=input_shape,
            output_shape=(mdp.info.action_space.n,),
            n_states=n_states,
            n_actions=mdp.info.action_space.n,
            n_features=args.n_features,
            n_approximators=args.n_approximators,
            input_preprocessor=input_preprocessor,
            folder_name=folder_name,
            net_type=args.net_type,
            sigma_weight=args.sigma_weight,
            optimizer={
                'name': args.optimizer,
                'lr': args.learning_rate,
                'lr_sigma': args.learning_rate,
                'decay': args.decay,
                'epsilon': args.epsilon
            })
        # Resume support: reload weights and past scores, and shorten the
        # remaining step budget by what was already completed.
        if args.load_path:
            ts = os.path.basename(os.path.normpath(args.load_path))
            approximator_params['load_path'] = args.load_path
            approximator_params['folder_name'] = args.load_path
            folder_name = args.load_path
            p = "scores_" + str(ts) + ".npy"
            scores = np.load(p).tolist()
            max_steps = max_steps - evaluation_frequency * len(scores)
        approximator = SimpleNet

        # Agent
        algorithm_params = dict(
            batch_size=args.batch_size,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size,
            clip_reward=False,
            target_update_frequency=target_update_frequency // train_frequency,
        )

        if args.alg == 'boot':
            algorithm_params['p_mask'] = args.p_mask
        elif args.alg in ['particle', 'gaussian']:
            algorithm_params['update_type'] = args.update_type
            algorithm_params['delta'] = args.delta
            algorithm_params['store_prob'] = args.store_prob
            if args.clip_target:
                algorithm_params['max_spread'] = args.q_max - args.q_min
            approximator_params['q_min'] = args.q_min
            approximator_params['q_max'] = args.q_max
            approximator_params['loss'] = args.loss
            approximator_params['init_type'] = args.init_type
        if args.alg in ['boot', 'particle']:
            approximator_params['n_approximators'] = args.n_approximators
            algorithm_params['n_approximators'] = args.n_approximators
        agent = agent_algorithm(approximator, pi, mdp.info,
                                approximator_params=approximator_params,
                                **algorithm_params)

        # UCB needs mean and upper-quantile estimators built on top of
        # the agent's approximator.
        if args.ucb:
            q = agent.approximator
            if args.alg == 'particle':
                def mu(state):
                    q_list = q.predict(state).squeeze()
                    qs = np.array(q_list)
                    return qs.mean(axis=0)

                # Index of the first particle at or above the 1-delta
                # quantile.
                quantiles = [
                    i * 1. / (args.n_approximators - 1)
                    for i in range(args.n_approximators)
                ]
                for p in range(args.n_approximators):
                    if quantiles[p] >= 1 - args.delta:
                        delta_index = p
                        break

                def quantile_func(state):
                    q_list = q.predict(state).squeeze()
                    qs = np.sort(np.array(q_list), axis=0)
                    return qs[delta_index, :]

                print("Setting up ucb policy")
                pi.set_mu(mu)
                pi.set_quantile_func(quantile_func)
            if args.alg == 'gaussian':
                # Gaussian head: network predicts (means, sigmas).
                standard_bound = norm.ppf(1 - args.delta, loc=0, scale=1)

                def mu(state):
                    q_and_sigma = q.predict(state).squeeze()
                    means = q_and_sigma[0]
                    return means

                def quantile_func(state):
                    q_and_sigma = q.predict(state).squeeze()
                    means = q_and_sigma[0]
                    sigmas = q_and_sigma[1]
                    return sigmas * standard_bound + means

                print("Setting up ucb policy")
                pi.set_mu(mu)
                pi.set_quantile_func(quantile_func)

        # args.count is (ab)used as plot-throttling state shared with the
        # plot_probs closure below.
        args.count = 100
        if args.plot_qs:
            import matplotlib.pyplot as plt
            colors = ['red', 'blue', 'green']
            labels = ['left', 'nop', 'right']

            def plot_probs(qs):
                # Throttle: only redraw once args.count wraps past 0.
                args.count += 1
                if args.count < 1:
                    return
                ax.clear()
                for i in range(qs.shape[-1]):
                    mu = np.mean(qs[..., i], axis=0)
                    sigma = np.std(qs[..., i], axis=0)
                    x = np.linspace(mu - 3 * sigma, mu + 3 * sigma, 20)
                    ax.plot(x, stats.norm.pdf(x, mu, sigma),
                            label=labels[i], color=colors[i])
                ax.set_xlabel('Q-value')
                ax.set_ylabel('Probability')
                ax.set_title('Q-distributions')
                # ax.set_ylim(bottom=0, top=1)
                plt.draw()
                plt.pause(0.02)
                # print("Plotted")
                args.count = 0
                # return probs

            plt.ion()
            fig, ax = plt.subplots()
            plot_probs(
                np.array(agent.approximator.predict(np.array(mdp.reset()))))
            input()
            args.count = 100
            qs = np.array([
                np.linspace(-1000, 0, 10),
                np.linspace(-2000, -1000, 10),
                np.linspace(-750, -250, 10)
            ])
            plot_probs(qs.T)

        # Algorithm
        core = Core(agent, mdp)
        core_test = Core(agent, mdp)

        # RUN
        # Fill replay memory with random dataset
        print_epoch(0)
        core.learn(
            n_steps=initial_replay_size,
            n_steps_per_fit=initial_replay_size,
            quiet=args.quiet,
        )
        if args.save:
            agent.approximator.model.save()

        # Evaluate initial policy
        if hasattr(pi, 'set_eval'):
            pi.set_eval(True)
        pi.set_epsilon(epsilon_test)
        dataset = core_test.evaluate(n_steps=test_samples,
                                     render=args.render,
                                     quiet=args.quiet)
        scores.append(get_stats(dataset))
        if args.plot_qs:
            pi.set_plotter(plot_probs)
        np.save(folder_name + '/scores_' + str(ts) + '.npy', scores)

        # Alternate learning and evaluation epochs, checkpointing scores
        # (and optionally the model) after every epoch.
        for n_epoch in range(1, max_steps // evaluation_frequency + 1):
            print_epoch(n_epoch)
            print('- Learning:')
            # learning step
            if hasattr(pi, 'set_eval'):
                pi.set_eval(False)
            pi.set_epsilon(epsilon)
            # learning step
            if args.plot_qs:
                pi.set_plotter(None)
            core.learn(
                n_steps=evaluation_frequency,
                n_steps_per_fit=train_frequency,
                quiet=args.quiet,
            )
            if args.save:
                agent.approximator.model.save()

            print('- Evaluation:')
            # evaluation step
            if hasattr(pi, 'set_eval'):
                pi.set_eval(True)
            pi.set_epsilon(epsilon_test)
            if args.plot_qs:
                pi.set_plotter(plot_probs)
            dataset = core_test.evaluate(n_steps=test_samples,
                                         render=args.render,
                                         quiet=args.quiet)
            scores.append(get_stats(dataset))
            np.save(folder_name + '/scores_' + str(ts) + '.npy', scores)

    return scores
def experiment(algorithm, name, update_mode, update_type, policy,
               n_approximators, q_max, q_min, lr_exp, double, file_name,
               out_dir, collect_qs, seed):
    """Run a tabular Q-learning-family experiment on the MDP named `name`.

    `algorithm` selects among 'ql', 'boot-ql' and 'particle-ql' (each with
    an optional double-estimator variant via `double`); `policy` names an
    entry of the module-level `policy_dict`, and is coerced to a valid
    choice for the selected algorithm with a warning if incompatible.
    Optionally dumps the collected Q values to `out_dir`/`file_name`.

    Returns (train_scores, test_scores), one entry per epoch.
    """
    set_global_seeds(seed)
    print('Using seed %s' % seed)

    # MDP: each task fixes its own step budget and evaluation cadence.
    if name == 'Taxi':
        mdp = generate_taxi('../../grid.txt', horizon=5000)
        max_steps = 500000
        evaluation_frequency = 5000
        test_samples = 5000
    elif name == 'Chain':
        mdp = generate_chain(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'Loop':
        mdp = generate_loop(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'RiverSwim':
        mdp = generate_river(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'SixArms':
        mdp = generate_arms(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'KnightQuest':
        mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    else:
        raise NotImplementedError

    epsilon_test = Parameter(0)

    # Agent: per-(state, action) decaying learning rate.
    learning_rate = ExponentialDecayParameter(value=1., decay_exp=lr_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    if algorithm == 'ql':
        if policy not in ['boltzmann', 'eps-greedy']:
            warnings.warn(
                'QL available with only boltzmann and eps-greedy policies!')
            policy = 'eps-greedy'
        if policy == 'eps-greedy':
            epsilon_train = ExponentialDecayParameter(
                value=1., decay_exp=.5, size=mdp.info.observation_space.size)
            pi = policy_dict[policy](epsilon=epsilon_train)
        else:
            # NOTE(review): with a boltzmann policy `epsilon_train` is
            # never defined here; the training loop below only uses it
            # when the policy exposes set_epsilon — verify boltzmann
            # policies do not.
            beta_train = ExponentialDecayParameter(
                value=1.5 * q_max, decay_exp=.5,
                size=mdp.info.observation_space.size)
            pi = policy_dict[policy](beta=beta_train)
        if double:
            agent = DoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = QLearning(pi, mdp.info, **algorithm_params)
    elif algorithm == 'boot-ql':
        if policy not in ['boot', 'weighted']:
            warnings.warn(
                'Bootstrapped QL available with only boot and weighted policies!'
            )
            policy = 'boot'
        pi = policy_dict[policy](n_approximators=n_approximators)
        # Prior over Q initialized from the [q_min, q_max] range.
        algorithm_params = dict(n_approximators=n_approximators,
                                mu=(q_max + q_min) / 2,
                                sigma=q_max - q_min,
                                **algorithm_params)
        if double:
            agent = BootstrappedDoubleQLearning(pi, mdp.info,
                                                **algorithm_params)
        else:
            agent = BootstrappedQLearning(pi, mdp.info, **algorithm_params)
        epsilon_train = Parameter(0)
    elif algorithm == 'particle-ql':
        if policy not in ['weighted', 'vpi']:
            warnings.warn(
                'Particle QL available with only vpi and weighted policies!')
            policy = 'weighted'
        pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                update_mode=update_mode,
                                update_type=update_type,
                                q_max=q_max,
                                q_min=q_min,
                                **algorithm_params)
        if double:
            agent = ParticleDoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = ParticleQLearning(pi, mdp.info, **algorithm_params)
        epsilon_train = Parameter(0)
    else:
        raise ValueError()

    # Algorithm: always collect transitions; optionally collect Q values.
    collect_dataset = CollectDataset()
    collect_qs_callback = CollectQs(agent.approximator)
    callbacks = [collect_dataset]
    if collect_qs:
        callbacks += [collect_qs_callback]
    core = Core(agent, mdp, callbacks)

    train_scores = []
    test_scores = []

    for n_epoch in range(1, max_steps // evaluation_frequency + 1):
        # Train
        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_train)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(False)
        core.learn(n_steps=evaluation_frequency, n_steps_per_fit=1,
                   quiet=True)
        dataset = collect_dataset.get()
        scores = compute_scores(dataset, mdp.info.gamma)
        # print('Train: ', scores)
        train_scores.append(scores)
        collect_dataset.clean()
        mdp.reset()

        # Evaluate greedily on fresh episodes.
        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_test)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(True)
        dataset = core.evaluate(n_steps=test_samples, quiet=True)
        mdp.reset()
        scores = compute_scores(dataset, mdp.info.gamma)
        # print('Evaluation: ', scores)
        test_scores.append(scores)

    # Dump recorded Q values once at the end of the run, if requested.
    if collect_qs:
        qs = collect_qs_callback.get_values()
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        np.save(out_dir + '/' + file_name, qs)
    return train_scores, test_scores
def experiment(algorithm, name, update_mode, update_type, policy,
               n_approximators, q_max, q_min, lr_exp, file_name, out_dir,
               particles, R=1, m=1, collect_qs=False, seed=0):
    """Run a particle-QL or delayed-QL experiment on the MDP named `name`.

    For 'particle-ql' with the 'ucb' policy, mean and (1 - delta)-quantile
    estimators are built from the agent's particle approximators and
    injected into the policy. For 'delayed-ql', the agent itself acts as
    the policy. Optionally dumps collected Q values to
    `out_dir`/`file_name`.

    Returns (train_scores, test_scores), one entry per epoch.
    """
    set_global_seeds(seed)
    print('Using seed %s' % seed)

    # MDP: each task fixes its own step budget and evaluation cadence.
    if name == 'Taxi':
        mdp = generate_taxi('../../grid.txt', horizon=5000)
        max_steps = 500000
        evaluation_frequency = 5000
        test_samples = 5000
    elif name == 'Chain':
        mdp = generate_chain(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'Loop':
        mdp = generate_loop(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'RiverSwim':
        mdp = generate_river(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'SixArms':
        mdp = generate_arms(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'KnightQuest':
        # Custom env: register it with Gym on first use, then retry.
        mdp = None
        try:
            mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        except:
            register(
                id='KnightQuest-v0',
                entry_point='envs.knight_quest:KnightQuest',
            )
            mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    else:
        raise NotImplementedError

    epsilon_test = Parameter(0)

    # Agent: per-(state, action) decaying learning rate.
    learning_rate = ExponentialDecayParameter(value=1., decay_exp=lr_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    if algorithm == 'particle-ql':
        delta = 0.1
        if policy not in ['weighted', 'vpi', 'ucb']:
            warnings.warn(
                'Particle QL available with only vpi and weighted policies!')
            policy = 'weighted'
        if policy == 'ucb':
            # q_max bound from the maximum per-step reward R.
            pi = UCBPolicy(delta=delta, q_max=R / (1 - mdp.info.gamma))
        else:
            pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                update_mode=update_mode,
                                update_type=update_type,
                                q_max=q_max,
                                q_min=q_min,
                                delta=delta,
                                init_values=particles,
                                **algorithm_params)
        agent = ParticleQLearning(pi, mdp.info, **algorithm_params)
        if policy == 'ucb':
            q = agent.approximator
            # Index of the first particle at or above the 1 - delta
            # quantile.
            quantiles = [
                i * 1. / (n_approximators - 1)
                for i in range(n_approximators)
            ]
            for p in range(n_approximators):
                if quantiles[p] >= 1 - delta:
                    particle_bound = p
                    break

            def quantile_func(state, quantile):
                # Upper-quantile Q estimate taken from the bound particle.
                q_list = list()
                for i in range(n_approximators):
                    q_list.append(q.predict(state, idx=i))
                qs = np.array(q_list)
                out = np.zeros(qs.shape[1])
                out[:] = qs[particle_bound, :]
                return out

            def mu(state):
                # Mean Q estimate across all particles.
                q_list = list()
                for i in range(n_approximators):
                    q_list.append(q.predict(state, idx=i))
                qs = np.array(q_list)
                return np.mean(qs, axis=0)

            pi.set_quantile_func(quantile_func)
            pi.set_mu(mu)
        epsilon_train = Parameter(0)
    elif algorithm == 'delayed-ql':
        # Delayed Q-learning: the agent itself is used as the policy.
        # NOTE(review): `epsilon_train` is not defined on this path; the
        # loop below only reads it when the policy has set_epsilon —
        # verify DelayedQLearning does not expose it.
        algorithm_params = dict(R=R, m=m, **algorithm_params)
        agent = DelayedQLearning(mdp.info, **algorithm_params)
        pi = agent

    # Algorithm: always collect transitions; optionally collect Q values.
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    if collect_qs:
        collect_qs_callback = CollectQs(agent.approximator)
        callbacks += [collect_qs_callback]
    core = Core(agent, mdp, callbacks)

    train_scores = []
    test_scores = []

    for n_epoch in range(1, max_steps // evaluation_frequency + 1):
        # Train
        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_train)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(False)
        core.learn(n_steps=evaluation_frequency, n_steps_per_fit=1,
                   quiet=True)
        dataset = collect_dataset.get()
        scores = compute_scores(dataset, mdp.info.gamma)
        # print('Train: ', scores)
        train_scores.append(scores)
        collect_dataset.clean()
        mdp.reset()

        # Evaluate greedily on fresh episodes.
        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_test)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(True)
        dataset = core.evaluate(n_steps=test_samples, quiet=True)
        mdp.reset()
        scores = compute_scores(dataset, mdp.info.gamma)
        # print('Evaluation: ', scores)
        test_scores.append(scores)

    # Dump recorded Q values once at the end of the run, if requested.
    if collect_qs:
        qs = collect_qs_callback.get_values()
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        np.save(out_dir + '/' + file_name, qs)
    return train_scores, test_scores
def experiment(algorithm, name, update_mode, update_type, policy,
               n_approximators, q_max, q_min, lr_exp, R, log_lr, r_max_m,
               delayed_m, delayed_epsilon, delta, debug, double, regret_test,
               a, b, mbie_C, value_iterations, tolerance, file_name, out_dir,
               collect_qs, seed):
    """Run one of several exploration algorithms ('ql', 'boot-ql',
    'particle-ql', 'r-max', 'mbie', 'delayed-ql', 'gaussian-ql') on the
    tabular MDP named `name`.

    In `regret_test` mode, true state values of the greedy policy are
    collected periodically via CollectVs and saved incrementally under
    `out_dir`. NOTE(review): this function reads a module-level `args`
    (args.max_steps_regret, args.delayed_ratio, args.freq_collection)
    that is not a parameter — confirm it is defined at module scope.

    Returns (train_scores, test_scores), one entry per epoch.
    """
    set_global_seeds(seed)
    print('Using seed %s' % seed)

    # MDP: each task fixes its own step budget and evaluation cadence;
    # some also override the MBIE exploration constant.
    if name == 'Taxi':
        mdp = generate_taxi('../grid.txt', horizon=5000, gamma=0.99)
        max_steps = 500000
        evaluation_frequency = 5000
        test_samples = 5000
    elif name == 'Chain':
        mdp = generate_chain(horizon=100, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'Gridworld':
        mdp = generate_gridworld(horizon=100, gamma=0.99)
        max_steps = 500000
        evaluation_frequency = 5000
        test_samples = 1000
    elif name == 'Loop':
        mdp = generate_loop(horizon=100, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'RiverSwim':
        mdp = generate_river(horizon=100, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
        mbie_C = 0.4
    elif name == 'SixArms':
        mdp = generate_arms(horizon=100, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
        mbie_C = 0.8
    elif name == 'ThreeArms':
        horizon = 100
        mdp = generate_three_arms(horizon=horizon, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'KnightQuest':
        # Custom env: register it with Gym on first use, then retry.
        mdp = None
        try:
            mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        except:
            register(
                id='KnightQuest-v0',
                entry_point='envs.knight_quest:KnightQuest',
            )
            mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    else:
        raise NotImplementedError

    epsilon_test = Parameter(0)

    # Agent: per-(state, action) decaying learning rate.
    learning_rate = ExponentialDecayParameter(value=1., decay_exp=lr_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)

    # Regret-test / debug overrides of the step budget and cadence.
    if regret_test:
        max_steps = int(args.max_steps_regret * 1e6)
        evaluation_frequency = max_steps // 100
        test_samples = 1000
    if debug:
        max_steps = 100000
        evaluation_frequency = max_steps // 100
        test_samples = 1000
    if algorithm == 'ql':
        if policy not in ['boltzmann', 'eps-greedy']:
            warnings.warn('QL available with only boltzmann and eps-greedy policies!')
            policy = 'eps-greedy'
        if policy == 'eps-greedy':
            epsilon_train = ExponentialDecayParameter(value=1.,
                                                      decay_exp=lr_exp,
                                                      size=mdp.info.observation_space.size)
            pi = policy_dict[policy](epsilon=epsilon_train)
        else:
            beta_train = ExponentialDecayParameter(value=1.5 * q_max,
                                                   decay_exp=.5,
                                                   size=mdp.info.observation_space.size)
            pi = policy_dict[policy](beta=beta_train)
        if double:
            agent = DoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = QLearning(pi, mdp.info, **algorithm_params)
    elif algorithm == 'boot-ql':
        if policy not in ['boot', 'weighted']:
            warnings.warn('Bootstrapped QL available with only boot and weighted policies!')
            policy = 'boot'
        pi = policy_dict[policy](n_approximators=n_approximators)
        # Prior over Q centered in [q_min, q_max].
        algorithm_params = dict(n_approximators=n_approximators,
                                mu=(q_max + q_min) / 2,
                                sigma=(q_max - q_min) / 2,
                                **algorithm_params)
        if double:
            agent = BootstrappedDoubleQLearning(pi, mdp.info,
                                                **algorithm_params)
        else:
            agent = BootstrappedQLearning(pi, mdp.info, **algorithm_params)
        epsilon_train = Parameter(0)
    elif algorithm == 'particle-ql':
        if policy not in ['weighted', 'ucb']:
            warnings.warn('Particle QL available with only ucb and weighted policies!')
            policy = 'weighted'
        if policy == 'ucb':
            pi = UCBPolicy(delta=delta, q_max=R / (1 - mdp.info.gamma))
        else:
            pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                update_mode=update_mode,
                                update_type=update_type,
                                q_max=q_max,
                                q_min=q_min,
                                delta=delta,
                                **algorithm_params)
        if double:
            agent = ParticleDoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = ParticleQLearning(pi, mdp.info, **algorithm_params)
        epsilon_train = Parameter(0)
    elif algorithm == 'r-max':
        # NOTE(review): thr_1 is computed but unused; the visit threshold
        # actually used is the r_max_m argument.
        thr_1 = int(np.ceil((4 * mdp.info.size[0] * 1.0 / (1 - mdp.info.gamma) * R)**3))
        algorithm_params = dict(
            rmax=R,
            s_a_threshold=r_max_m
        )
        agent = RMaxAgent(mdp.info, **algorithm_params)
        pi = agent
        epsilon_train = Parameter(0)
    elif algorithm == 'mbie':
        algorithm_params = dict(
            rmax=R,
            C=mbie_C,
            value_iterations=value_iterations,
            tolerance=tolerance
        )
        agent = MBIE_EB(mdp.info, **algorithm_params)
        pi = agent
        epsilon_train = Parameter(0)
    elif algorithm == 'delayed-ql':
        theoretic_m = delayed_m
        if regret_test:
            # Derive the theoretical sample size m of Delayed Q-learning
            # from the PAC bound, overriding delayed_m / delta / epsilon.
            gamma = mdp.info.gamma
            Vmax = R / (1 - gamma)
            epsilon = args.delayed_ratio * Vmax
            delayed_epsilon = epsilon * (1 - gamma)
            delta = 0.1
            S, A = mdp.info.size
            theoretic_m = (1 + gamma * Vmax)**2 / (2 * delayed_epsilon**2) * np.log(3 * S * A / delta * (1 + S * A / (delayed_epsilon * (1 - gamma))))
            if debug:
                print("Delta:{}".format(delta))
                print("R:{}".format(R))
                print("Vmax:{}".format(Vmax))
                print("Gamma:{}".format(mdp.info.gamma))
                print("Epsilon:{}".format(epsilon))
                # print("k:{}".format(k))
                print("m:{}".format(theoretic_m))
                print("S:{}".format(S))
                print("A:{}".format(A))
                input()

            def evaluate_policy(P, R, policy):
                # Exact policy evaluation: solve (I - gamma * P_pi) V = R_pi.
                P_pi = np.zeros((S, S))
                R_pi = np.zeros(S)
                for s in range(S):
                    for s1 in range(S):
                        P_pi[s, s1] = np.sum(policy[s, :] * P[s, :, s1])
                    R_pi[s] = np.sum(policy[s, :] * np.sum(P[s, :, :] * R[s, :, :], axis=-1))
                I = np.diag(np.ones(S))
                V = np.linalg.solve(I - gamma * P_pi, R_pi)
                return V

        algorithm_params = dict(
            R=R,
            m=theoretic_m,
            delta=delta,
            epsilon=delayed_epsilon,
            **algorithm_params)
        agent = DelayedQLearning(mdp.info, **algorithm_params)
        if regret_test:
            collect_vs_callback = CollectVs(mdp, agent, evaluate_policy,
                                            args.freq_collection)
            if debug:
                print("Q:")
                print(agent.get_approximator()[:, :])
                print("Policy:")
                print(agent.get_policy())
                print("V:{}".format(evaluate_policy(mdp.p, mdp.r, agent.get_policy())))
                input()
        # The delayed-QL agent is used directly as the policy.
        pi = agent
        epsilon_train = Parameter(0)
    elif algorithm == 'gaussian-ql':
        if policy not in ['weighted-gaussian', 'ucb']:
            warnings.warn('Particle QL available with only ucb and weighted policies!')
            policy = 'weighted-gaussian'
        if policy == 'ucb':
            pi = UCBPolicy(delta=delta, q_max=R / (1 - mdp.info.gamma))
        else:
            pi = policy_dict[policy]()
        # Gaussian prior: mean at mid-range, std of a uniform on the range.
        q_0 = (q_max - q_min) / 2
        sigma_0 = (q_max - q_min) / np.sqrt(12)
        C = 2 * R / (np.sqrt(2 * np.pi) * (1 - mdp.info.gamma) * sigma_0)
        sigma_lr = None
        if log_lr:
            sigma_lr = LogarithmicDecayParameter(value=1., C=C,
                                                 size=mdp.info.size)
        init_values = (q_0, sigma_0)
        if regret_test:
            # Theoretical schedule: overrides a, b, c, d, the priors and
            # all learning rates with values from the regret analysis.
            sigma_lr = None
            gamma = mdp.info.gamma
            T = max_steps
            S, A = mdp.info.size
            a = (2 + gamma) / (2 * (1 - gamma))
            b = a - 1
            c = 1
            d = b
            q_max = R / (1 - gamma)
            standard_bound = norm.ppf(1 - delta, loc=0, scale=1)
            # first_fac = np.sqrt(b + T)
            # second_fac = np.sqrt(a * np.log(S*A*T / delta))
            # sigma2_factor = min(np.sqrt(b + T), np.sqrt(a * np.log(S*A*T / delta)))
            q_0 = q_max
            sigma1_0 = 0
            # sigma2_0 = (R + gamma * q_max) / (standard_bound * np.sqrt(c-1)) * sigma2_factor
            sigma2_0 = (gamma * q_max) / (c * standard_bound) * np.sqrt(a * np.log(S * A * T / delta))
            init_values = (q_0, sigma1_0, sigma2_0)
            learning_rate = TheoreticalParameter(a=a, b=b, decay_exp=1,
                                                 size=mdp.info.size)
            learning_rate_sigma1 = TheoreticalParameter(a=a, b=b,
                                                        decay_exp=1,
                                                        size=mdp.info.size)
            algorithm_params = dict(
                learning_rate=learning_rate,
                sigma_1_learning_rate=learning_rate_sigma1)
            sigma_lr = BetaParameter(c=c, d=d, size=mdp.info.size)

            def evaluate_policy(P, R, policy):
                # Exact policy evaluation: solve (I - gamma * P_pi) V = R_pi.
                P_pi = np.zeros((S, S))
                R_pi = np.zeros(S)
                for s in range(S):
                    for s1 in range(S):
                        P_pi[s, s1] = np.sum(policy[s, :] * P[s, :, s1])
                    R_pi[s] = np.sum(policy[s, :] * np.sum(P[s, :, :] * R[s, :, :], axis=-1))
                I = np.diag(np.ones(S))
                V = np.linalg.solve(I - gamma * P_pi, R_pi)
                return V

            if debug:
                print("Delta:{}".format(delta))
                print("R:{}".format(R))
                print("Gamma:{}".format(mdp.info.gamma))
                print("mu0:{}".format(q_0))
                print("Sigma1_0:{}".format(sigma1_0))
                print("Sigma2_0:{}".format(sigma2_0))
                print("a:{}".format(a))
                print("b:{}".format(b))
                print("c:{}".format(c))
                print("d:{}".format(d))
                print("T:{}".format(T))
                print("S:{}".format(S))
                print("A:{}".format(A))
                input()
        algorithm_params = dict(
            update_mode=update_mode,
            update_type=update_type,
            sigma_learning_rate=sigma_lr,
            init_values=init_values,
            delta=delta,
            q_max=q_max,
            **algorithm_params)
        if double and not regret_test:
            agent = GaussianDoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = GaussianQLearning(pi, mdp.info, **algorithm_params)
        if regret_test:
            if debug:
                freq = 10
            else:
                freq = args.freq_collection
            collect_vs_callback = CollectVs(mdp, agent, evaluate_policy,
                                            freq)
            if debug:
                print("Policy:")
                print(agent.get_policy())
                print("Q")
                for state in range(S):
                    means = np.array(agent.approximator.predict(np.array([state]), idx=0))
                    sigmas1 = np.array(agent.approximator.predict(np.array([state]), idx=1))
                    sigmas2 = np.array(agent.approximator.predict(np.array([state]), idx=2))
                    print("Means:{}".format(means))
                    print("Sigmas1:{}".format(sigmas1))
                    print("Sigmas2:{}".format(sigmas2))
                print("V:{}".format(evaluate_policy(mdp.p, mdp.r, agent.get_policy())))
                input()
        if policy == 'ucb':
            # UCB: mean and Gaussian upper-quantile estimators from the
            # agent's (mean, sigma) approximators.
            q = agent.approximator
            standard_bound = norm.ppf(1 - delta, loc=0, scale=1)

            def quantile_func(state):
                means = np.array(q.predict(state, idx=0))
                if regret_test:
                    # Three-head variant: sigma2 is the bound sigma.
                    sigmas1 = np.array(q.predict(state, idx=1))
                    sigmas2 = np.array(q.predict(state, idx=2))
                    sigmas = sigmas2
                    # print(sigmas1, sigmas2)
                else:
                    sigmas = np.array(q.predict(state, idx=1))
                out = sigmas * standard_bound + means
                return out

            def mu(state):
                q_list = q.predict(state, idx=0)
                means = np.array(q_list)
                return means

            pi.set_quantile_func(quantile_func)
            pi.set_mu(mu)
        epsilon_train = Parameter(0)
    else:
        raise ValueError()

    # Algorithm: transition collection always on; Q-value collection is
    # unsupported for r-max (no approximator in the same sense).
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    if collect_qs:
        if algorithm not in ['r-max']:
            collect_qs_callback = CollectQs(agent.approximator)
            callbacks += [collect_qs_callback]
    if regret_test:
        callbacks += [collect_vs_callback]
    core = Core(agent, mdp, callbacks)

    train_scores = []
    test_scores = []

    for n_epoch in range(1, max_steps // evaluation_frequency + 1):
        # Train
        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_train)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(False)
        if regret_test:
            collect_vs_callback.on()
        core.learn(n_steps=evaluation_frequency, n_steps_per_fit=1,
                   quiet=True)
        dataset = collect_dataset.get()
        scores = compute_scores(dataset, mdp.info.gamma)
        # print('Train: ', scores)
        train_scores.append(scores)
        collect_dataset.clean()
        mdp.reset()

        # Incrementally persist regret-test value traces and online scores.
        if regret_test:
            vs = collect_vs_callback.get_values()
            if not os.path.exists(out_dir):
                os.makedirs(out_dir)
            print("Finished {} steps.".format(n_epoch * evaluation_frequency))
            np.save(out_dir + "/vs_" + algorithm + "_" + str(seed), vs)
            np.save(out_dir + "/scores_online" + str(seed), train_scores)
            collect_vs_callback.off()

        # Evaluate greedily on fresh episodes.
        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_test)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(True)
        dataset = core.evaluate(n_steps=test_samples, quiet=True)
        s = mdp.reset()
        scores = compute_scores(dataset, mdp.info.gamma)
        print('Evaluation #%d:%s ' % (n_epoch, scores))
        if debug:
            # NOTE(review): this debug dump assumes the gaussian-ql
            # regret-test agent (3 approximator heads, S in scope) —
            # it will fail for other algorithms; confirm debug is only
            # combined with gaussian-ql.
            print("Policy:")
            print(agent.get_policy())
            print("Q")
            for state in range(S):
                means = np.array(agent.approximator.predict(np.array([state]), idx=0))
                sigmas1 = np.array(agent.approximator.predict(np.array([state]), idx=1))
                sigmas2 = np.array(agent.approximator.predict(np.array([state]), idx=2))
                print("Means:{}".format(means))
                print("Sigmas1:{}".format(sigmas1))
                print("Sigmas2:{}".format(sigmas2))
            print("V:{}".format(evaluate_policy(mdp.p, mdp.r, agent.get_policy())))
            input()
        test_scores.append(scores)
        if regret_test:
            np.save(out_dir + "/scores_offline" + str(seed), test_scores)

    # Dump recorded Q values once at the end of the run, if requested.
    if collect_qs:
        qs = collect_qs_callback.get_values()
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        np.save(out_dir + '/' + file_name, qs)
    return train_scores, test_scores
def experiment(policy, name, folder_name):
    """Run a (Double) DQN experiment on a Gym environment or the Taxi grid.

    Command-line arguments (parsed inside via argparse) select replay-memory
    sizes, network/optimizer settings, exploration schedule, and run length.
    Two modes:

    * ``--load-path`` given: build the MDP and a ``DoubleDQN`` agent around a
      pre-trained model and only evaluate it (stats are printed by
      ``get_stats`` but NOT appended to ``scores``).
    * otherwise: train a ``DoubleDQN`` agent from scratch, evaluating every
      ``evaluation_frequency`` steps and appending each ``get_stats`` result
      to ``scores``.

    Args:
        policy: ensemble policy selector, either ``'boot'`` (``BootPolicy``)
            or ``'weighted'`` (``WeightedPolicy``); anything else raises
            ``ValueError`` (training mode only — the load-path mode always
            uses ``BootPolicy``).
        name: environment id; ``'Taxi'`` selects the local grid file
            ``'../../grid.txt'``, any other value is passed to ``Gym``.
        folder_name: passed to the approximator params in training mode
            (presumably where the network stores its files — confirm against
            ``SimpleNet``).

    Returns:
        list: one ``get_stats`` entry per evaluation epoch (initial policy
        included); empty when ``--load-path`` is used.
    """
    # Seed from OS entropy: every invocation gets a different random stream
    # (deliberately non-reproducible across runs).
    np.random.seed()

    # Argument parser
    parser = argparse.ArgumentParser()

    arg_mdp = parser.add_argument_group('Environment')
    arg_mdp.add_argument("--horizon", type=int)
    arg_mdp.add_argument("--gamma", type=float)

    arg_mem = parser.add_argument_group('Replay Memory')
    arg_mem.add_argument("--initial-replay-size", type=int, default=100,
                         help='Initial size of the replay memory.')
    arg_mem.add_argument("--max-replay-size", type=int, default=5000,
                         help='Max size of the replay memory.')

    arg_net = parser.add_argument_group('Deep Q-Network')
    arg_net.add_argument("--n-features", type=int, default=80)
    arg_net.add_argument(
        "--optimizer",
        choices=['adadelta', 'adam', 'rmsprop', 'rmspropcentered'],
        default='adam',
        help='Name of the optimizer to use to learn.')
    arg_net.add_argument("--learning-rate", type=float, default=.0001,
                         help='Learning rate value of the optimizer. Only used'
                              'in rmspropcentered')
    arg_net.add_argument("--decay", type=float, default=.95,
                         help='Discount factor for the history coming from the'
                              'gradient momentum in rmspropcentered')
    arg_net.add_argument("--epsilon", type=float, default=.01,
                         help='Epsilon term used in rmspropcentered')

    arg_alg = parser.add_argument_group('Algorithm')
    arg_alg.add_argument("--weighted-update", action='store_true')
    arg_alg.add_argument(
        "--n-approximators", type=int, default=10,
        help="Number of approximators used in the ensemble for"
             "Averaged DQN.")
    arg_alg.add_argument("--batch-size", type=int, default=100,
                         help='Batch size for each fit of the network.')
    arg_alg.add_argument("--history-length", type=int, default=1,
                         help='Number of frames composing a state.')
    arg_alg.add_argument("--target-update-frequency", type=int, default=100,
                         help='Number of collected samples before each update'
                              'of the target network.')
    arg_alg.add_argument("--evaluation-frequency", type=int, default=1000,
                         help='Number of learning step before each evaluation.'
                              'This number represents an epoch.')
    arg_alg.add_argument("--train-frequency", type=int, default=1,
                         help='Number of learning steps before each fit of the'
                              'neural network.')
    arg_alg.add_argument("--max-steps", type=int, default=50000,
                         help='Total number of learning steps.')
    arg_alg.add_argument(
        "--final-exploration-frame", type=int, default=1,
        help='Number of steps until the exploration rate stops'
             'decreasing.')
    arg_alg.add_argument("--initial-exploration-rate", type=float, default=0.,
                         help='Initial value of the exploration rate.')
    arg_alg.add_argument("--final-exploration-rate", type=float, default=0.,
                         help='Final value of the exploration rate. When it'
                              'reaches this values, it stays constant.')
    arg_alg.add_argument("--test-exploration-rate", type=float, default=0.,
                         help='Exploration rate used during evaluation.')
    arg_alg.add_argument("--test-samples", type=int, default=1000,
                         help='Number of steps for each evaluation.')
    arg_alg.add_argument("--max-no-op-actions", type=int, default=0,
                         help='Maximum number of no-op action performed at the'
                              'beginning of the episodes. The minimum number is'
                              'history_length.')
    arg_alg.add_argument("--no-op-action-value", type=int, default=0,
                         help='Value of the no-op action.')
    arg_alg.add_argument("--p-mask", type=float, default=1.)

    arg_utils = parser.add_argument_group('Utils')
    arg_utils.add_argument('--load-path', type=str,
                           help='Path of the model to be loaded.')
    arg_utils.add_argument('--save', action='store_true',
                           help='Flag specifying whether to save the model.')
    arg_utils.add_argument('--render', action='store_true',
                           help='Flag specifying whether to render the game.')
    arg_utils.add_argument('--quiet', action='store_true',
                           help='Flag specifying whether to hide the progress'
                                'bar.')
    arg_utils.add_argument('--debug', action='store_true',
                           help='Flag specifying whether the script has to be'
                               'run in debug mode.')

    args = parser.parse_args()

    # Per-epoch evaluation statistics; only filled in the training branch.
    scores = list()

    # Evaluation of the model provided by the user.
    if args.load_path:
        # MDP
        if name != 'Taxi':
            mdp = Gym(name, args.horizon, args.gamma)
            n_states = None
            # Undiscounted evaluation for generic Gym environments.
            gamma_eval = 1.
        else:
            mdp = generate_taxi('../../grid.txt')
            n_states = mdp.info.observation_space.size[0]
            gamma_eval = mdp.info.gamma

        # Policy: greedy up to the configured test exploration rate.
        epsilon_test = Parameter(value=args.test_exploration_rate)
        pi = BootPolicy(args.n_approximators, epsilon=epsilon_test)

        # Approximator: network is restored from --load-path.
        input_shape = mdp.info.observation_space.shape + (
            args.history_length,)
        input_preprocessor = list()
        approximator_params = dict(input_shape=input_shape,
                                   output_shape=(mdp.info.action_space.n,),
                                   n_states=n_states,
                                   n_actions=mdp.info.action_space.n,
                                   n_features=args.n_features,
                                   n_approximators=args.n_approximators,
                                   input_preprocessor=input_preprocessor,
                                   name='test',
                                   load_path=args.load_path,
                                   optimizer={'name': args.optimizer,
                                              'lr': args.learning_rate,
                                              'decay': args.decay,
                                              'epsilon': args.epsilon})

        approximator = SimpleNet

        # Agent: replay/fit sizes zeroed out — no learning happens here.
        algorithm_params = dict(batch_size=0,
                                initial_replay_size=0,
                                max_replay_size=0,
                                history_length=1,
                                clip_reward=False,
                                n_approximators=args.n_approximators,
                                train_frequency=1,
                                target_update_frequency=1,
                                max_no_op_actions=args.max_no_op_actions,
                                no_op_action_value=args.no_op_action_value,
                                p_mask=args.p_mask,
                                weighted_update=args.weighted_update)
        agent = DoubleDQN(approximator, pi, mdp.info,
                          approximator_params=approximator_params,
                          **algorithm_params)

        # Algorithm
        core_test = Core(agent, mdp)

        # Evaluate model (stats are printed by get_stats, not collected).
        pi.set_eval(True)
        dataset = core_test.evaluate(n_steps=args.test_samples,
                                     render=args.render,
                                     quiet=args.quiet)
        get_stats(dataset, gamma_eval)
    else:
        # Settings: small sizes in debug mode for a quick smoke run,
        # otherwise taken from the command line.
        if args.debug:
            initial_replay_size = 50
            max_replay_size = 500
            train_frequency = 5
            target_update_frequency = 10
            test_samples = 20
            evaluation_frequency = 50
            max_steps = 1000
        else:
            initial_replay_size = args.initial_replay_size
            max_replay_size = args.max_replay_size
            train_frequency = args.train_frequency
            target_update_frequency = args.target_update_frequency
            test_samples = args.test_samples
            evaluation_frequency = args.evaluation_frequency
            max_steps = args.max_steps

        # MDP
        if name != 'Taxi':
            mdp = Gym(name, args.horizon, args.gamma)
            n_states = None
            # Undiscounted evaluation for generic Gym environments.
            gamma_eval = 1.
        else:
            mdp = generate_taxi('../../grid.txt')
            n_states = mdp.info.observation_space.size[0]
            gamma_eval = mdp.info.gamma

        # Policy: linearly decaying epsilon for training, a fixed test
        # epsilon for evaluation, and epsilon=1 for random warm-up steps.
        epsilon = LinearDecayParameter(value=args.initial_exploration_rate,
                                       min_value=args.final_exploration_rate,
                                       n=args.final_exploration_frame)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        epsilon_random = Parameter(value=1.)
        if policy == 'boot':
            pi = BootPolicy(args.n_approximators, epsilon=epsilon_random)
        elif policy == 'weighted':
            pi = WeightedPolicy(args.n_approximators, epsilon=epsilon_random)
        else:
            raise ValueError

        # Approximator
        input_shape = mdp.info.observation_space.shape + (
            args.history_length,)
        input_preprocessor = list()
        approximator_params = dict(input_shape=input_shape,
                                   output_shape=(mdp.info.action_space.n,),
                                   n_states=n_states,
                                   n_actions=mdp.info.action_space.n,
                                   n_features=args.n_features,
                                   n_approximators=args.n_approximators,
                                   input_preprocessor=input_preprocessor,
                                   folder_name=folder_name,
                                   optimizer={'name': args.optimizer,
                                              'lr': args.learning_rate,
                                              'decay': args.decay,
                                              'epsilon': args.epsilon})

        approximator = SimpleNet

        # Agent
        algorithm_params = dict(
            batch_size=args.batch_size,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size,
            history_length=args.history_length,
            clip_reward=False,
            n_approximators=args.n_approximators,
            train_frequency=train_frequency,
            target_update_frequency=target_update_frequency,
            max_no_op_actions=args.max_no_op_actions,
            no_op_action_value=args.no_op_action_value,
            p_mask=args.p_mask,
            weighted_update=args.weighted_update)
        agent = DoubleDQN(approximator, pi, mdp.info,
                          approximator_params=approximator_params,
                          **algorithm_params)

        # Algorithm: separate cores for learning and evaluation (they share
        # the same agent and MDP).
        core = Core(agent, mdp)
        core_test = Core(agent, mdp)

        # RUN

        # Fill replay memory with random dataset (pi still uses
        # epsilon_random here, so actions are uniformly random).
        print_epoch(0)
        core.learn(n_steps=initial_replay_size,
                   n_steps_per_fit=initial_replay_size, quiet=args.quiet)

        if args.save:
            agent.approximator.model.save()

        # Evaluate initial policy
        pi.set_eval(True)
        pi.set_epsilon(epsilon_test)
        dataset = core_test.evaluate(n_steps=test_samples,
                                     render=args.render,
                                     quiet=args.quiet)
        scores.append(get_stats(dataset, gamma_eval))
        # Alternate learning and evaluation, one epoch per
        # evaluation_frequency learning steps.
        for n_epoch in range(1, max_steps // evaluation_frequency + 1):
            print_epoch(n_epoch)
            print('- Learning:')
            pi.set_eval(False)
            pi.set_epsilon(epsilon)
            # learning step
            core.learn(n_steps=evaluation_frequency,
                       n_steps_per_fit=train_frequency, quiet=args.quiet)

            if args.save:
                agent.approximator.model.save()

            print('- Evaluation:')
            # evaluation step
            core_test.reset()
            pi.set_eval(True)
            pi.set_epsilon(epsilon_test)
            dataset = core_test.evaluate(n_steps=test_samples,
                                         render=args.render,
                                         quiet=args.quiet)
            scores.append(get_stats(dataset, gamma_eval))

    return scores