def experiment():
    np.random.seed()

    # MDP
    mdp = generate_simple_chain(state_n=5, goal_states=[2], prob=.8, rew=1,
                                gamma=.9)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = QLearning(pi, mdp.info, agent_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1)
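# The snippets in this file assume the old `mushroom` RL library. A minimal
# sketch of the imports the chain Q-learning experiment above relies on; the
# exact module paths are assumptions and may need adjusting to the installed
# mushroom version.
import numpy as np

from mushroom.core.core import Core
from mushroom.utils.parameters import Parameter
from mushroom.algorithms.value import QLearning  # assumed path
from mushroom.policy import EpsGreedy  # assumed path
from mushroom.environments.generators.simple_chain import \
    generate_simple_chain  # assumed path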
def experiment(algorithm_class):
    np.random.seed(20)

    # MDP
    mdp = GridWorldVanHasselt()

    # Policy
    epsilon = ExponentialDecayParameter(value=1, decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1., decay_exp=1.,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = algorithm_class(pi, mdp.info, agent_params)

    # Algorithm
    collect_dataset = CollectDataset()
    start = mdp.convert_to_int(mdp._start, mdp._width)
    collect_max_Q = CollectMaxQ(agent.approximator, start)
    callbacks = [collect_dataset, collect_max_Q]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=2000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get_values()

    return reward, max_Qs
def experiment(algorithm_class, decay_exp):
    np.random.seed(3)

    # MDP
    p = np.load('tests/double_chain/chain_structure/p.npy')
    rew = np.load('tests/double_chain/chain_structure/rew.npy')
    mdp = FiniteMDP(p, rew, gamma=.9)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1., decay_exp=decay_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = algorithm_class(pi, mdp.info, agent_params)

    # Algorithm
    collect_Q = CollectQ(agent.approximator)
    callbacks = [collect_Q]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=2000, n_steps_per_fit=1, quiet=True)

    Qs = collect_Q.get_values()

    return Qs
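# Hypothetical usage of the double-chain experiment above: compare the Q tables
# collected with a fast and a slow learning-rate decay. QLearning is an assumed
# algorithm class; any tabular TD agent with the same constructor works.
if __name__ == '__main__':
    Qs_fast_decay = experiment(QLearning, decay_exp=.51)
    Qs_slow_decay = experiment(QLearning, decay_exp=1.)
    print(Qs_fast_decay[-1])
    print(Qs_slow_decay[-1])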
def experiment2():
    np.random.seed(3)
    print('mushroom :')

    # MDP
    mdp = generate_simple_chain(state_n=5, goal_states=[2], prob=.8, rew=1,
                                gamma=.9)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = QLearning(pi, mdp.info, agent_params)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    dataset = collect_dataset.get()

    return agent.Q.table
def experiment_others(alg, decay_exp):
    np.random.seed()

    # MDP
    grid_map = "simple_gridmap.txt"
    mdp = GridWorldGenerator(grid_map=grid_map)

    # Policy
    epsilon = ExponentialDecayParameter(value=1, decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    alpha = ExponentialDecayParameter(value=1, decay_exp=decay_exp,
                                      size=mdp.info.size)
    algorithm_params = dict(learning_rate=alpha)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = alg(pi, mdp.info, agent_params)

    # Algorithm
    collect_max_Q = CollectMaxQ(agent.Q,
                                mdp.convert_to_int(mdp._start, mdp._width))
    collect_dataset = CollectDataset()
    callbacks = [collect_max_Q, collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get_values()

    return reward, max_Qs
def experiment(boosted):
    np.random.seed(20)

    # MDP
    mdp = CarOnHill()

    # Policy
    epsilon = Parameter(value=1)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    if not boosted:
        approximator_params = dict(
            input_shape=mdp.info.observation_space.shape,
            n_actions=mdp.info.action_space.n,
            n_estimators=50,
            min_samples_split=5,
            min_samples_leaf=2)
    else:
        approximator_params = dict(
            input_shape=mdp.info.observation_space.shape,
            n_actions=mdp.info.action_space.n,
            n_models=3,
            prediction='sum',
            n_estimators=50,
            min_samples_split=5,
            min_samples_leaf=2)
    approximator = ExtraTreesRegressor

    # Agent
    algorithm_params = dict(n_iterations=3, boosted=boosted, quiet=True)
    fit_params = dict()
    agent_params = {'approximator_params': approximator_params,
                    'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = FQI(approximator, pi, mdp.info, agent_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=50, n_episodes_per_fit=50, quiet=True)

    # Test
    test_epsilon = Parameter(0)
    agent.policy.set_epsilon(test_epsilon)

    initial_states = np.zeros((9, 2))
    cont = 0
    for i in range(-8, 9, 8):
        for j in range(-8, 9, 8):
            initial_states[cont, :] = [0.125 * i, 0.375 * j]
            cont += 1

    dataset = core.evaluate(initial_states=initial_states, quiet=True)

    return np.mean(compute_J(dataset, mdp.info.gamma))
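# A minimal, hypothetical driver for the FQI experiment above, comparing the
# plain and the boosted fitted Q-iteration runs (only names defined above are used).
if __name__ == '__main__':
    J_plain = experiment(boosted=False)
    J_boosted = experiment(boosted=True)
    print('Mean discounted return - plain FQI: %f, boosted FQI: %f'
          % (J_plain, J_boosted))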
def experiment(n_iterations, n_runs, ep_per_run, use_tensorflow):
    np.random.seed()

    # MDP
    mdp = ShipSteering()

    # Policy
    if use_tensorflow:
        tensor_list = gaussian_tensor.generate(
            [3, 3, 6, 2],
            [[0., 150.], [0., 150.], [-np.pi, np.pi],
             [-np.pi / 12, np.pi / 12]])
        phi = Features(tensor_list=tensor_list, name='phi',
                       input_dim=mdp.info.observation_space.shape[0])
    else:
        basis = GaussianRBF.generate(
            [3, 3, 6, 2],
            [[0., 150.], [0., 150.], [-np.pi, np.pi],
             [-np.pi / 12, np.pi / 12]])
        phi = Features(basis_list=basis)

    input_shape = (phi.size,)

    approximator_params = dict(input_dim=phi.size)
    approximator = Regressor(LinearApproximator, input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    # sigma = Parameter(value=.05)
    # policy = GaussianPolicy(mu=approximator, sigma=sigma)
    sigma = np.array([[.05]])
    policy = MultivariateGaussianPolicy(mu=approximator, sigma=sigma)

    # Agent
    learning_rate = AdaptiveParameter(value=.01)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = REINFORCE(policy, mdp.info, agent_params, phi)

    # Train
    core = Core(agent, mdp)
    for i in range(n_runs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_steps_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))

    np.save('ship_steering.npy', dataset_eval)
def experiment1(decay_exp, beta_type):
    np.random.seed()

    # MDP
    p = np.load('p.npy')
    rew = np.load('rew.npy')
    mdp = FiniteMDP(p, rew, gamma=.9)

    # Policy
    epsilon = Parameter(value=1)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    alpha = ExponentialDecayParameter(value=1, decay_exp=decay_exp,
                                      size=mdp.info.size)
    if beta_type == 'Win':
        beta = WindowedVarianceIncreasingParameter(value=1, size=mdp.info.size,
                                                   tol=10., window=50)
    else:
        beta = VarianceIncreasingParameter(value=1, size=mdp.info.size,
                                           tol=10.)
    algorithm_params = dict(learning_rate=alpha, beta=beta, off_policy=True)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = RQLearning(pi, mdp.info, agent_params)

    # Algorithm
    collect_q = CollectQ(agent.Q)
    collect_lr_1 = CollectParameters(beta, np.array([0]))
    collect_lr_5 = CollectParameters(beta, np.array([4]))
    callbacks = [collect_q, collect_lr_1, collect_lr_5]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=20000, n_steps_per_fit=1, quiet=True)

    Qs = collect_q.get_values()
    lr_1 = collect_lr_1.get_values()
    lr_5 = collect_lr_5.get_values()

    return Qs, lr_1, lr_5
def experiment(decay_exp, windowed, tol):
    np.random.seed()

    # MDP
    mdp = GridWorldVanHasselt()

    # Policy
    epsilon = ExponentialDecayParameter(value=1, decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    alpha = ExponentialDecayParameter(value=1, decay_exp=decay_exp,
                                      size=mdp.info.size)
    if windowed:
        beta = WindowedVarianceIncreasingParameter(value=1, size=mdp.info.size,
                                                   tol=tol, window=50)
    else:
        beta = VarianceIncreasingParameter(value=1, size=mdp.info.size,
                                           tol=tol)
    algorithm_params = dict(learning_rate=alpha, beta=beta, off_policy=True)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = RQLearning(pi, mdp.info, agent_params)

    # Algorithm
    collect_max_Q = CollectMaxQ(agent.Q,
                                mdp.convert_to_int(mdp._start, mdp._width))
    collect_dataset = CollectDataset()
    callbacks = [collect_max_Q, collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get_values()

    return reward, max_Qs
def experiment(alpha):
    gym.logger.setLevel(0)
    np.random.seed(386)

    # MDP
    mdp = Gym(name='MountainCar-v0', horizon=10000, gamma=1.)
    mdp.seed(201)

    # Policy
    epsilon = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(alpha)
    tilings = Tiles.generate(10, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate, 'lambda': .9}
    fit_params = dict()
    agent_params = {'approximator_params': approximator_params,
                    'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = TrueOnlineSARSALambda(pi, mdp.info, agent_params, features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=2000, n_steps_per_fit=1, quiet=True)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    initial_states = np.array([[0., 0.], [.1, .1]])
    dataset = core.evaluate(initial_states=initial_states, quiet=True)

    return np.mean(compute_J(dataset, 1.))
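# Hypothetical sweep over the learning rate for the True Online SARSA(lambda)
# experiment above; the alpha values are illustrative only.
if __name__ == '__main__':
    for alpha in [.1, .2, .3]:
        print('alpha = %.1f -> mean J = %f' % (alpha, experiment(alpha)))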
def experiment(n_iterations):
    np.random.seed()

    # MDP
    mdp = InvertedPendulum()

    # Policy
    epsilon = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    rbfs = GaussianRBF.generate(10, [10, 10],
                                mdp.info.observation_space.low,
                                mdp.info.observation_space.high)
    features = Features(basis_list=rbfs)

    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = dict(n_iterations=n_iterations)
    fit_params = dict()
    agent_params = {'approximator_params': approximator_params,
                    'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = LSPI(pi, mdp.info, agent_params, features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=1000, n_episodes_per_fit=20)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    dataset = core.evaluate(n_episodes=20)

    return np.mean(compute_J(dataset, 1.))
def experiment(alpha):
    np.random.seed()

    # MDP
    mdp = Gym(name='MountainCar-v0', horizon=np.inf, gamma=1.)

    # Policy
    epsilon = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(alpha)
    tilings = Tiles.generate(10, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate, 'lambda': .9}
    fit_params = dict()
    agent_params = {'approximator_params': approximator_params,
                    'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = TrueOnlineSARSALambda(pi, mdp.info, agent_params, features)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks=callbacks)

    # Train
    core.learn(n_episodes=20, n_steps_per_fit=1, render=False)

    dataset = collect_dataset.get()

    return np.mean(compute_J(dataset, 1.))
def experiment2():
    np.random.seed(3)
    print('mushroom :')

    # MDP
    mdp = GridWorldVanHasselt()

    # Policy
    epsilon = ExponentialDecayParameter(value=1, decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1., decay_exp=1.,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = QLearning(pi, mdp.info, agent_params)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=2000, n_steps_per_fit=1, quiet=True)

    dataset = collect_dataset.get()
    VisualizeControlBlock(dataset)

    return agent.Q.table
def experiment(alg):
    gym.logger.setLevel(0)
    np.random.seed(88)
    tf.set_random_seed(88)

    # DQN settings
    initial_replay_size = 500
    max_replay_size = 1000
    train_frequency = 50
    target_update_frequency = 100
    evaluation_frequency = 200
    max_steps = 2000

    # MDP train
    mdp = Atari('BreakoutDeterministic-v4', 84, 84, ends_at_life=True)

    # Policy
    epsilon = LinearDecayParameter(value=1, min_value=.1, n=10)
    epsilon_test = Parameter(value=.05)
    epsilon_random = Parameter(value=1)
    pi = EpsGreedy(epsilon=epsilon_random)

    # Approximator
    input_shape = (84, 84, 4)
    approximator_params = dict(
        input_shape=input_shape,
        output_shape=(mdp.info.action_space.n,),
        n_actions=mdp.info.action_space.n,
        input_preprocessor=[Scaler(mdp.info.observation_space.high[0, 0])],
        optimizer={'name': 'rmsprop',
                   'lr': .00025,
                   'decay': .95,
                   'epsilon': 1e-10})
    approximator = ConvNet

    # Agent
    algorithm_params = dict(
        batch_size=32,
        initial_replay_size=initial_replay_size,
        n_approximators=2 if alg == 'adqn' else 1,
        max_replay_size=max_replay_size,
        history_length=4,
        train_frequency=train_frequency,
        target_update_frequency=target_update_frequency,
        max_no_op_actions=10,
        no_op_action_value=0)
    fit_params = dict()
    agent_params = {'approximator_params': approximator_params,
                    'algorithm_params': algorithm_params,
                    'fit_params': fit_params}

    if alg == 'dqn':
        agent = DQN(approximator, pi, mdp.info, agent_params)
    elif alg == 'ddqn':
        agent = DoubleDQN(approximator, pi, mdp.info, agent_params)
    elif alg == 'adqn':
        agent = AveragedDQN(approximator, pi, mdp.info, agent_params)

    # Algorithm
    core = Core(agent, mdp)

    # DQN

    # Fill replay memory with a random dataset
    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size, quiet=True)

    # Evaluate initial policy
    pi.set_epsilon(epsilon_test)
    mdp.set_episode_end(ends_at_life=False)
    for n_epoch in range(1, max_steps // evaluation_frequency + 1):
        # Learning step
        pi.set_epsilon(epsilon)
        mdp.set_episode_end(ends_at_life=True)
        core.learn(n_steps=evaluation_frequency,
                   n_steps_per_fit=train_frequency, quiet=True)

        # Evaluation step
        pi.set_epsilon(epsilon_test)
        mdp.set_episode_end(ends_at_life=False)

    w = agent.approximator.model.get_weights(only_trainable=True)

    return w
def experiment():
    np.random.seed()

    # Argument parser
    parser = argparse.ArgumentParser()

    arg_game = parser.add_argument_group('Game')
    arg_game.add_argument("--name", type=str,
                          default='BreakoutDeterministic-v4',
                          help='Gym ID of the Atari game.')
    arg_game.add_argument("--screen-width", type=int, default=84,
                          help='Width of the game screen.')
    arg_game.add_argument("--screen-height", type=int, default=84,
                          help='Height of the game screen.')

    arg_mem = parser.add_argument_group('Replay Memory')
    arg_mem.add_argument("--initial-replay-size", type=int, default=50000,
                         help='Initial size of the replay memory.')
    arg_mem.add_argument("--max-replay-size", type=int, default=500000,
                         help='Max size of the replay memory.')

    arg_net = parser.add_argument_group('Deep Q-Network')
    arg_net.add_argument("--optimizer",
                         choices=['adadelta', 'adam', 'rmsprop',
                                  'rmspropcentered'],
                         default='adam',
                         help='Name of the optimizer to use to learn.')
    arg_net.add_argument("--learning-rate", type=float, default=.00025,
                         help='Learning rate value of the optimizer. Only used '
                              'in rmspropcentered.')
    arg_net.add_argument("--decay", type=float, default=.95,
                         help='Discount factor for the history coming from the '
                              'gradient momentum in rmspropcentered.')
    arg_net.add_argument("--epsilon", type=float, default=.01,
                         help='Epsilon term used in rmspropcentered.')

    arg_alg = parser.add_argument_group('Algorithm')
    arg_alg.add_argument("--algorithm",
                         choices=['dqn', 'ddqn', 'wdqn', 'adqn'],
                         default='dqn',
                         help='Name of the algorithm. dqn stands for standard '
                              'DQN, ddqn stands for Double DQN, wdqn stands '
                              'for Weighted DQN and adqn stands for Averaged '
                              'DQN.')
    arg_alg.add_argument("--n-approximators", type=int, default=1,
                         help='Number of approximators used in the ensemble '
                              'for Weighted DQN and Averaged DQN.')
    arg_alg.add_argument("--batch-size", type=int, default=32,
                         help='Batch size for each fit of the network.')
    arg_alg.add_argument("--history-length", type=int, default=4,
                         help='Number of frames composing a state.')
    arg_alg.add_argument("--target-update-frequency", type=int, default=10000,
                         help='Number of learning steps before each update of '
                              'the target network.')
    arg_alg.add_argument("--evaluation-frequency", type=int, default=250000,
                         help='Number of learning steps before each '
                              'evaluation. This number represents an epoch.')
    arg_alg.add_argument("--train-frequency", type=int, default=4,
                         help='Number of learning steps before each fit of '
                              'the neural network.')
    arg_alg.add_argument("--max-steps", type=int, default=50000000,
                         help='Total number of learning steps.')
    arg_alg.add_argument("--final-exploration-frame", type=int,
                         default=1000000,
                         help='Number of steps until the exploration rate '
                              'stops decreasing.')
    arg_alg.add_argument("--initial-exploration-rate", type=float, default=1.,
                         help='Initial value of the exploration rate.')
    arg_alg.add_argument("--final-exploration-rate", type=float, default=.1,
                         help='Final value of the exploration rate. When it '
                              'reaches this value, it stays constant.')
    arg_alg.add_argument("--test-exploration-rate", type=float, default=.05,
                         help='Exploration rate used during evaluation.')
    arg_alg.add_argument("--test-samples", type=int, default=125000,
                         help='Number of steps for each evaluation.')
    arg_alg.add_argument("--max-no-op-actions", type=int, default=30,
                         help='Maximum number of no-op actions performed at '
                              'the beginning of the episodes. The minimum '
                              'number is history_length.')
    arg_alg.add_argument("--no-op-action-value", type=int, default=0,
                         help='Value of the no-op action.')

    arg_utils = parser.add_argument_group('Utils')
    arg_utils.add_argument('--load-path', type=str,
                           help='Path of the model to be loaded.')
    arg_utils.add_argument('--save', action='store_true',
                           help='Flag specifying whether to save the model.')
    arg_utils.add_argument('--render', action='store_true',
                           help='Flag specifying whether to render the game.')
    arg_utils.add_argument('--quiet', action='store_true',
                           help='Flag specifying whether to hide the progress '
                                'bar.')
    arg_utils.add_argument('--debug', action='store_true',
                           help='Flag specifying whether the script has to be '
                                'run in debug mode.')

    args = parser.parse_args()

    scores = list()

    # Evaluation of the model provided by the user.
    if args.load_path:
        # MDP
        mdp = Atari(args.name, args.screen_width, args.screen_height,
                    ends_at_life=True)

        # Policy
        epsilon_test = Parameter(value=args.test_exploration_rate)
        pi = EpsGreedy(epsilon=epsilon_test)

        # Approximator
        input_shape = (args.screen_height, args.screen_width,
                       args.history_length)
        approximator_params = dict(
            input_shape=input_shape,
            output_shape=(mdp.info.action_space.n,),
            n_actions=mdp.info.action_space.n,
            input_preprocessor=[Scaler(mdp.info.observation_space.high[0, 0])],
            name='test',
            load_path=args.load_path,
            optimizer={'name': args.optimizer,
                       'lr': args.learning_rate,
                       'decay': args.decay,
                       'epsilon': args.epsilon})
        approximator = ConvNet

        # Agent
        algorithm_params = dict(max_replay_size=0,
                                history_length=args.history_length,
                                max_no_op_actions=args.max_no_op_actions,
                                no_op_action_value=args.no_op_action_value)
        fit_params = dict()
        agent_params = {'approximator_params': approximator_params,
                        'algorithm_params': algorithm_params,
                        'fit_params': fit_params}
        agent = DQN(approximator, pi, mdp.info, agent_params)

        # Algorithm
        core_test = Core(agent, mdp)

        # Evaluate model
        pi.set_epsilon(epsilon_test)
        mdp.set_episode_end(ends_at_life=False)
        dataset = core_test.evaluate(n_steps=args.test_samples,
                                     render=args.render, quiet=args.quiet)
        get_stats(dataset)
    else:  # DQN learning run
        # Summary folder
        folder_name = './logs/atari_' + args.algorithm + '_' + args.name + \
            '_' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

        # Settings
        if args.debug:
            initial_replay_size = 50
            max_replay_size = 500
            train_frequency = 5
            target_update_frequency = 10
            test_samples = 20
            evaluation_frequency = 50
            max_steps = 1000
        else:
            initial_replay_size = args.initial_replay_size
            max_replay_size = args.max_replay_size
            train_frequency = args.train_frequency
            target_update_frequency = args.target_update_frequency
            test_samples = args.test_samples
            evaluation_frequency = args.evaluation_frequency
            max_steps = args.max_steps

        # MDP
        mdp = Atari(args.name, args.screen_width, args.screen_height,
                    ends_at_life=True)

        # Policy
        epsilon = LinearDecayParameter(value=args.initial_exploration_rate,
                                       min_value=args.final_exploration_rate,
                                       n=args.final_exploration_frame)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        epsilon_random = Parameter(value=1)
        pi = EpsGreedy(epsilon=epsilon_random)

        # Approximator
        input_shape = (args.screen_height, args.screen_width,
                       args.history_length)
        approximator_params = dict(
            input_shape=input_shape,
            output_shape=(mdp.info.action_space.n,),
            n_actions=mdp.info.action_space.n,
            input_preprocessor=[Scaler(mdp.info.observation_space.high[0, 0])],
            folder_name=folder_name,
            optimizer={'name': args.optimizer,
                       'lr': args.learning_rate,
                       'decay': args.decay,
                       'epsilon': args.epsilon})
        approximator = ConvNet

        # Agent
        algorithm_params = dict(
            batch_size=args.batch_size,
            n_approximators=args.n_approximators,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size,
            history_length=args.history_length,
            train_frequency=train_frequency,
            target_update_frequency=target_update_frequency,
            max_no_op_actions=args.max_no_op_actions,
            no_op_action_value=args.no_op_action_value)
        fit_params = dict()
        agent_params = {'approximator_params': approximator_params,
                        'algorithm_params': algorithm_params,
                        'fit_params': fit_params}

        if args.algorithm == 'dqn':
            agent = DQN(approximator, pi, mdp.info, agent_params)
        elif args.algorithm == 'ddqn':
            agent = DoubleDQN(approximator, pi, mdp.info, agent_params)
        elif args.algorithm == 'wdqn':
            agent = WeightedDQN(approximator, pi, mdp.info, agent_params)
        elif args.algorithm == 'adqn':
            agent = AveragedDQN(approximator, pi, mdp.info, agent_params)

        # Algorithm
        core = Core(agent, mdp)
        core_test = Core(agent, mdp)

        # RUN

        # Fill replay memory with a random dataset
        print_epoch(0)
        core.learn(n_steps=initial_replay_size,
                   n_steps_per_fit=initial_replay_size, quiet=args.quiet)
        if args.save:
            agent.approximator.model.save()

        # Evaluate initial policy
        pi.set_epsilon(epsilon_test)
        mdp.set_episode_end(ends_at_life=False)
        if args.algorithm == 'ddqn':
            agent.policy.set_q(agent.target_approximator)
        dataset = core_test.evaluate(n_steps=test_samples, render=args.render,
                                     quiet=args.quiet)
        scores.append(get_stats(dataset))
        if args.algorithm == 'ddqn':
            agent.policy.set_q(agent.approximator)

        np.save(folder_name + '/scores.npy', scores)
        for n_epoch in range(1, max_steps // evaluation_frequency + 1):
            print_epoch(n_epoch)
            print('- Learning:')
            # Learning step
            pi.set_epsilon(epsilon)
            mdp.set_episode_end(ends_at_life=True)
            core.learn(n_steps=evaluation_frequency,
                       n_steps_per_fit=train_frequency, quiet=args.quiet)
            if args.save:
                agent.approximator.model.save()

            print('- Evaluation:')
            # Evaluation step
            pi.set_epsilon(epsilon_test)
            mdp.set_episode_end(ends_at_life=False)
            core_test.reset()
            if args.algorithm == 'ddqn':
                agent.policy.set_q(agent.target_approximator)
            dataset = core_test.evaluate(n_steps=test_samples,
                                         render=args.render, quiet=args.quiet)
            scores.append(get_stats(dataset))
            if args.algorithm == 'ddqn':
                agent.policy.set_q(agent.approximator)

            np.save(folder_name + '/scores.npy', scores)

    return scores
def experiment(): np.random.seed() # Argument parser parser = argparse.ArgumentParser() arg_game = parser.add_argument_group('Game') arg_game.add_argument("--name", type=str, default='BreakoutNoFrameskip-v4', help='Gym ID of the Atari game.') arg_game.add_argument("--screen-width", type=int, default=84, help='Width of the game screen.') arg_game.add_argument("--screen-height", type=int, default=84, help='Height of the game screen.') arg_mem = parser.add_argument_group('Replay Memory') arg_mem.add_argument("--initial-replay-size", type=int, default=50000, help='Initial size of the replay memory.') arg_mem.add_argument("--max-replay-size", type=int, default=1000000, help='Max size of the replay memory.') arg_net = parser.add_argument_group('Deep Q-Network') arg_net.add_argument( "--optimizer", choices=['adadelta', 'adam', 'rmsprop', 'rmspropcentered'], default='rmsprop', help='Name of the optimizer to use to learn.') arg_net.add_argument("--learning-rate", type=float, default=.00025, help='Learning rate value of the optimizer. Only used' 'in rmspropcentered') arg_net.add_argument("--decay", type=float, default=.95) arg_net.add_argument("--epsilon", type=float, default=1e-10) arg_net.add_argument("--bootInit", action='store_true', help='Initialize weights as in Bootstrapped DQN') arg_alg = parser.add_argument_group('Algorithm') arg_alg.add_argument("--weighted", action='store_true') arg_alg.add_argument("--double", action='store_true') arg_alg.add_argument("--weighted-update", action='store_true') arg_alg.add_argument( "--n-approximators", type=int, default=10, help="Number of approximators used in the ensemble for" "Averaged DQN.") arg_alg.add_argument("--loss", choices=[ 'squared_loss', 'huber_loss', ], default='squared_loss', help="Loss functions used in the approximator") arg_alg.add_argument( "--q-max", type=float, default=10, help='Upper bound for initializing the heads of the network') arg_alg.add_argument( "--q-min", type=float, default=-10, help='Lower bound for initializing the heads of the network') arg_alg.add_argument("--batch-size", type=int, default=32, help='Batch size for each fit of the network.') arg_alg.add_argument("--history-length", type=int, default=4, help='Number of frames composing a state.') arg_alg.add_argument("--target-update-frequency", type=int, default=10000, help='Number of learning step before each update of' 'the target network.') arg_alg.add_argument("--evaluation-frequency", type=int, default=250000, help='Number of learning step before each evaluation.' 'This number represents an epoch.') arg_alg.add_argument("--train-frequency", type=int, default=4, help='Number of learning steps before each fit of the' 'neural network.') arg_alg.add_argument("--max-steps", type=int, default=50000000, help='Total number of learning steps.') arg_alg.add_argument( "--final-exploration-frame", type=int, default=1, help='Number of steps until the exploration rate stops' 'decreasing.') arg_alg.add_argument("--initial-exploration-rate", type=float, default=0., help='Initial value of the exploration rate.') arg_alg.add_argument("--final-exploration-rate", type=float, default=0., help='Final value of the exploration rate. 
When it' 'reaches this values, it stays constant.') arg_alg.add_argument("--test-exploration-rate", type=float, default=.005, help='Exploration rate used during evaluation.') arg_alg.add_argument("--test-samples", type=int, default=125000, help='Number of steps for each evaluation.') arg_alg.add_argument("--max-no-op-actions", type=int, default=8, help='Maximum number of no-op action performed at the' 'beginning of the episodes. The minimum number is' 'history_length.') arg_alg.add_argument("--no-op-action-value", type=int, default=0, help='Value of the no-op action.') arg_alg.add_argument("--p-mask", type=float, default=1.) arg_utils = parser.add_argument_group('Utils') arg_utils.add_argument('--experiment-number', type=int, default=1, help='To differentiate experiment results') arg_utils.add_argument('--load-path', type=str, help='Path of the model to be loaded.') arg_utils.add_argument('--save', action='store_true', help='Flag specifying whether to save the model.') arg_utils.add_argument('--render', action='store_true', help='Flag specifying whether to render the game.') arg_utils.add_argument('--quiet', action='store_true', help='Flag specifying whether to hide the progress' 'bar.') arg_utils.add_argument('--debug', action='store_true', help='Flag specifying whether the script has to be' 'run in debug mode.') args = parser.parse_args() scores = list() # Evaluation of the model provided by the user. if args.load_path: mdp = Atari(args.name, args.screen_width, args.screen_height, ends_at_life=False) print("Evaluation Run") # Policy epsilon_test = Parameter(value=args.test_exploration_rate) pi = VPIPolicy(args.n_approximators, epsilon=epsilon_test) # Approximator input_shape = (args.screen_height, args.screen_width, args.history_length) approximator_params = dict(input_shape=input_shape, output_shape=(mdp.info.action_space.n, ), n_actions=mdp.info.action_space.n, n_approximators=args.n_approximators, name='test', load_path=args.load_path, q_min=args.q_min, q_max=args.q_max, optimizer={ 'name': args.optimizer, 'lr': args.learning_rate, 'decay': args.decay, 'epsilon': args.epsilon }, loss=args.loss) approximator = ConvNet # Agent algorithm_params = dict( batch_size=args.batch_size, initial_replay_size=1, max_replay_size=1, history_length=args.history_length, clip_reward=True, train_frequency=args.train_frequency, n_approximators=args.n_approximators, target_update_frequency=args.target_update_frequency, max_no_op_actions=4, no_op_action_value=args.no_op_action_value, p_mask=args.p_mask, dtype=np.uint8, weighted_update=args.weighted_update) if args.double: agent = DoubleDQN(approximator, pi, mdp.info, approximator_params=approximator_params, **algorithm_params) else: agent = DQN(approximator, pi, mdp.info, approximator_params=approximator_params, **algorithm_params) # Algorithm core_test = Core(agent, mdp) # Evaluate model pi.set_eval(True) dataset = core_test.evaluate(n_steps=args.test_samples, render=args.render, quiet=args.quiet) get_stats(dataset) else: # DQN learning run print("Learning Run") policy_name = 'weighted' if args.weighted else 'vpi' update_rule = 'weighted_update' if args.weighted_update else 'max_mean_update' # Summary folder folder_name = './logs/' + str( args.experiment_number ) + '/' + policy_name + '/' + update_rule + '/' + args.name + "/" + args.loss + "/" + str( args.n_approximators) + "_particles" # Settings if args.debug: initial_replay_size = 50 max_replay_size = 500 train_frequency = 5 target_update_frequency = 10 test_samples = 20 evaluation_frequency = 50 
max_steps = 1000 else: initial_replay_size = args.initial_replay_size max_replay_size = args.max_replay_size train_frequency = args.train_frequency target_update_frequency = args.target_update_frequency test_samples = args.test_samples evaluation_frequency = args.evaluation_frequency max_steps = args.max_steps # MDP mdp = Atari(args.name, args.screen_width, args.screen_height, ends_at_life=True) # Policy epsilon = LinearDecayParameter(value=args.initial_exploration_rate, min_value=args.final_exploration_rate, n=args.final_exploration_frame) epsilon_test = Parameter(value=args.test_exploration_rate) epsilon_random = Parameter(value=1.) if not args.weighted: pi = VPIPolicy(args.n_approximators, epsilon=epsilon_random) else: pi = WeightedPolicy(args.n_approximators, epsilon=epsilon_random) # Approximator input_shape = (args.screen_height, args.screen_width, args.history_length) approximator_params = dict(input_shape=input_shape, output_shape=(mdp.info.action_space.n, ), n_actions=mdp.info.action_space.n, n_approximators=args.n_approximators, folder_name=folder_name, q_min=args.q_min, q_max=args.q_max, loss=args.loss, optimizer={ 'name': args.optimizer, 'lr': args.learning_rate, 'decay': args.decay, 'epsilon': args.epsilon }) approximator = ConvNet # Agent algorithm_params = dict( batch_size=args.batch_size, initial_replay_size=initial_replay_size, max_replay_size=max_replay_size, history_length=args.history_length, clip_reward=True, train_frequency=args.train_frequency, n_approximators=args.n_approximators, target_update_frequency=target_update_frequency, max_no_op_actions=args.max_no_op_actions, no_op_action_value=args.no_op_action_value, p_mask=args.p_mask, dtype=np.uint8, weighted_update=args.weighted_update) if args.double: agent = DoubleDQN(approximator, pi, mdp.info, approximator_params=approximator_params, **algorithm_params) else: agent = DQN(approximator, pi, mdp.info, approximator_params=approximator_params, **algorithm_params) # Algorithm core = Core(agent, mdp) # RUN # Fill replay memory with random dataset print_epoch(0) core.learn(n_steps=initial_replay_size, n_steps_per_fit=initial_replay_size, quiet=args.quiet) if args.save: agent.approximator.model.save() # Evaluate initial policy pi.set_eval(True) pi.set_epsilon(epsilon_test) mdp.set_episode_end(False) dataset = core.evaluate(n_steps=test_samples, render=args.render, quiet=args.quiet) scores.append(get_stats(dataset)) np.save(folder_name + '/scores.npy', scores) for n_epoch in range(1, max_steps // evaluation_frequency + 1): print_epoch(n_epoch) print('- Learning:') # learning step pi.set_eval(False) pi.set_epsilon(epsilon) mdp.set_episode_end(True) core.learn(n_steps=evaluation_frequency, n_steps_per_fit=train_frequency, quiet=args.quiet) if args.save: agent.approximator.model.save() print('- Evaluation:') # evaluation step pi.set_eval(True) pi.set_epsilon(epsilon_test) mdp.set_episode_end(False) dataset = core.evaluate(n_steps=test_samples, render=args.render, quiet=args.quiet) scores.append(get_stats(dataset)) np.save(folder_name + '/scores.npy', scores) return scores
def experiment(policy, name, folder_name): np.random.seed() # Argument parser parser = argparse.ArgumentParser() arg_mdp = parser.add_argument_group('Environment') arg_mdp.add_argument("--horizon", type=int) arg_mdp.add_argument("--gamma", type=float) arg_mem = parser.add_argument_group('Replay Memory') arg_mem.add_argument("--initial-replay-size", type=int, default=100, help='Initial size of the replay memory.') arg_mem.add_argument("--max-replay-size", type=int, default=5000, help='Max size of the replay memory.') arg_net = parser.add_argument_group('Deep Q-Network') arg_net.add_argument("--n-features", type=int, default=80) arg_net.add_argument( "--optimizer", choices=['adadelta', 'adam', 'rmsprop', 'rmspropcentered'], default='adam', help='Name of the optimizer to use to learn.') arg_net.add_argument("--learning-rate", type=float, default=.0001, help='Learning rate value of the optimizer. Only used' 'in rmspropcentered') arg_net.add_argument("--decay", type=float, default=.95, help='Discount factor for the history coming from the' 'gradient momentum in rmspropcentered') arg_net.add_argument("--epsilon", type=float, default=.01, help='Epsilon term used in rmspropcentered') arg_alg = parser.add_argument_group('Algorithm') arg_alg.add_argument("--weighted-update", action='store_true') arg_alg.add_argument( "--n-approximators", type=int, default=10, help="Number of approximators used in the ensemble for" "Averaged DQN.") arg_alg.add_argument("--batch-size", type=int, default=100, help='Batch size for each fit of the network.') arg_alg.add_argument("--history-length", type=int, default=1, help='Number of frames composing a state.') arg_alg.add_argument("--target-update-frequency", type=int, default=100, help='Number of collected samples before each update' 'of the target network.') arg_alg.add_argument("--evaluation-frequency", type=int, default=1000, help='Number of learning step before each evaluation.' 'This number represents an epoch.') arg_alg.add_argument("--train-frequency", type=int, default=1, help='Number of learning steps before each fit of the' 'neural network.') arg_alg.add_argument("--max-steps", type=int, default=50000, help='Total number of learning steps.') arg_alg.add_argument( "--final-exploration-frame", type=int, default=1, help='Number of steps until the exploration rate stops' 'decreasing.') arg_alg.add_argument("--initial-exploration-rate", type=float, default=0., help='Initial value of the exploration rate.') arg_alg.add_argument("--final-exploration-rate", type=float, default=0., help='Final value of the exploration rate. When it' 'reaches this values, it stays constant.') arg_alg.add_argument("--test-exploration-rate", type=float, default=0., help='Exploration rate used during evaluation.') arg_alg.add_argument("--test-samples", type=int, default=1000, help='Number of steps for each evaluation.') arg_alg.add_argument("--max-no-op-actions", type=int, default=0, help='Maximum number of no-op action performed at the' 'beginning of the episodes. The minimum number is' 'history_length.') arg_alg.add_argument("--no-op-action-value", type=int, default=0, help='Value of the no-op action.') arg_alg.add_argument("--p-mask", type=float, default=1.) 
arg_utils = parser.add_argument_group('Utils') arg_utils.add_argument('--load-path', type=str, help='Path of the model to be loaded.') arg_utils.add_argument('--save', action='store_true', help='Flag specifying whether to save the model.') arg_utils.add_argument('--render', action='store_true', help='Flag specifying whether to render the game.') arg_utils.add_argument('--quiet', action='store_true', help='Flag specifying whether to hide the progress' 'bar.') arg_utils.add_argument('--debug', action='store_true', help='Flag specifying whether the script has to be' 'run in debug mode.') args = parser.parse_args() scores = list() # Evaluation of the model provided by the user. if args.load_path: # MDP if name != 'Taxi': mdp = Gym(name, args.horizon, args.gamma) n_states = None gamma_eval = 1. else: mdp = generate_taxi('../../grid.txt') n_states = mdp.info.observation_space.size[0] gamma_eval = mdp.info.gamma # Policy epsilon_test = Parameter(value=args.test_exploration_rate) pi = BootPolicy(args.n_approximators, epsilon=epsilon_test) # Approximator input_shape = mdp.info.observation_space.shape + ( args.history_length, ) input_preprocessor = list() approximator_params = dict(input_shape=input_shape, output_shape=(mdp.info.action_space.n, ), n_states=n_states, n_actions=mdp.info.action_space.n, n_features=args.n_features, n_approximators=args.n_approximators, input_preprocessor=input_preprocessor, name='test', load_path=args.load_path, optimizer={ 'name': args.optimizer, 'lr': args.learning_rate, 'decay': args.decay, 'epsilon': args.epsilon }) approximator = SimpleNet # Agent algorithm_params = dict(batch_size=0, initial_replay_size=0, max_replay_size=0, history_length=1, clip_reward=False, n_approximators=args.n_approximators, train_frequency=1, target_update_frequency=1, max_no_op_actions=args.max_no_op_actions, no_op_action_value=args.no_op_action_value, p_mask=args.p_mask, weighted_update=args.weighted_update) agent = DoubleDQN(approximator, pi, mdp.info, approximator_params=approximator_params, **algorithm_params) # Algorithm core_test = Core(agent, mdp) # Evaluate model pi.set_eval(True) dataset = core_test.evaluate(n_steps=args.test_samples, render=args.render, quiet=args.quiet) get_stats(dataset, gamma_eval) else: # DQN learning run # Settings if args.debug: initial_replay_size = 50 max_replay_size = 500 train_frequency = 5 target_update_frequency = 10 test_samples = 20 evaluation_frequency = 50 max_steps = 1000 else: initial_replay_size = args.initial_replay_size max_replay_size = args.max_replay_size train_frequency = args.train_frequency target_update_frequency = args.target_update_frequency test_samples = args.test_samples evaluation_frequency = args.evaluation_frequency max_steps = args.max_steps # MDP if name != 'Taxi': mdp = Gym(name, args.horizon, args.gamma) n_states = None gamma_eval = 1. else: mdp = generate_taxi('../../grid.txt') n_states = mdp.info.observation_space.size[0] gamma_eval = mdp.info.gamma # Policy epsilon = LinearDecayParameter(value=args.initial_exploration_rate, min_value=args.final_exploration_rate, n=args.final_exploration_frame) epsilon_test = Parameter(value=args.test_exploration_rate) epsilon_random = Parameter(value=1.) 
if policy == 'boot': pi = BootPolicy(args.n_approximators, epsilon=epsilon_random) elif policy == 'weighted': pi = WeightedPolicy(args.n_approximators, epsilon=epsilon_random) else: raise ValueError # Approximator input_shape = mdp.info.observation_space.shape + ( args.history_length, ) input_preprocessor = list() approximator_params = dict(input_shape=input_shape, output_shape=(mdp.info.action_space.n, ), n_states=n_states, n_actions=mdp.info.action_space.n, n_features=args.n_features, n_approximators=args.n_approximators, input_preprocessor=input_preprocessor, folder_name=folder_name, optimizer={ 'name': args.optimizer, 'lr': args.learning_rate, 'decay': args.decay, 'epsilon': args.epsilon }) approximator = SimpleNet # Agent algorithm_params = dict( batch_size=args.batch_size, initial_replay_size=initial_replay_size, max_replay_size=max_replay_size, history_length=args.history_length, clip_reward=False, n_approximators=args.n_approximators, train_frequency=train_frequency, target_update_frequency=target_update_frequency, max_no_op_actions=args.max_no_op_actions, no_op_action_value=args.no_op_action_value, p_mask=args.p_mask, weighted_update=args.weighted_update) agent = DoubleDQN(approximator, pi, mdp.info, approximator_params=approximator_params, **algorithm_params) # Algorithm core = Core(agent, mdp) core_test = Core(agent, mdp) # RUN # Fill replay memory with random dataset print_epoch(0) core.learn(n_steps=initial_replay_size, n_steps_per_fit=initial_replay_size, quiet=args.quiet) if args.save: agent.approximator.model.save() # Evaluate initial policy pi.set_eval(True) pi.set_epsilon(epsilon_test) dataset = core_test.evaluate(n_steps=test_samples, render=args.render, quiet=args.quiet) scores.append(get_stats(dataset, gamma_eval)) for n_epoch in range(1, max_steps // evaluation_frequency + 1): print_epoch(n_epoch) print('- Learning:') pi.set_eval(False) pi.set_epsilon(epsilon) # learning step core.learn(n_steps=evaluation_frequency, n_steps_per_fit=train_frequency, quiet=args.quiet) if args.save: agent.approximator.model.save() print('- Evaluation:') # evaluation step core_test.reset() pi.set_eval(True) pi.set_epsilon(epsilon_test) dataset = core_test.evaluate(n_steps=test_samples, render=args.render, quiet=args.quiet) scores.append(get_stats(dataset, gamma_eval)) return scores
def experiment(args, agent_algorithm): np.random.seed() scores = list() #add timestamp to results ts = str(time.time()) # Evaluation of the model provided by the user. if args.load_path and args.evaluation: # MDP if args.name not in ['Taxi', 'Gridworld']: mdp = Gym(args.name, args.horizon, args.gamma) n_states = None gamma_eval = 1. elif args.name == 'Taxi': mdp = generate_taxi('../../grid.txt') n_states = mdp.info.observation_space.size[0] gamma_eval = mdp.info.gamma else: rew_weights = [args.fast_zone, args.slow_zone, args.goal] grid_size = args.grid_size env = GridWorld(gamma=args.gamma, rew_weights=rew_weights, shape=(grid_size, grid_size), randomized_initial=args.rand_initial, horizon=args.horizon) gamma_eval = args.gamma mdp = env.generate_mdp() n_states = mdp.info.observation_space.size[0] # Policy epsilon_test = Parameter(value=args.test_exploration_rate) pi = BootPolicy(args.n_approximators, epsilon=epsilon_test) # Approximator input_shape = mdp.info.observation_space.shape + (1, ) input_preprocessor = list() approximator_params = dict(input_shape=input_shape, output_shape=(mdp.info.action_space.n, ), n_states=n_states, n_actions=mdp.info.action_space.n, n_features=args.n_features, n_approximators=args.n_approximators, input_preprocessor=input_preprocessor, name='test', load_path=args.load_path, net_type=args.net_type, optimizer={ 'name': args.optimizer, 'lr': args.learning_rate, 'lr_sigma': args.learning_rate, 'decay': args.decay, 'epsilon': args.epsilon }) approximator = SimpleNet # Agent algorithm_params = dict(batch_size=0, initial_replay_size=0, max_replay_size=0, clip_reward=False, target_update_frequency=1) if args.alg == 'boot': algorithm_params['p_mask'] = args.p_mask pi = BootPolicy(args.n_approximators, epsilon=epsilon_test) elif args.alg == 'gaussian': if args.ucb: pi = UCBPolicy(delta=args.delta, q_max=1. / (1. - args.gamma)) else: pi = WeightedGaussianPolicy(epsilon=epsilon_test) elif args.alg == 'dqn': pi = EpsGreedy(epsilon=epsilon_test) elif args.alg == 'particle': if args.ucb: pi = UCBPolicy(delta=args.delta, q_max=1. / (1. 
- args.gamma)) else: pi = WeightedPolicy(args.n_approximators, epsilon=epsilon_test) else: raise ValueError("Algorithm uknown") if args.alg in ['gaussian', 'particle']: algorithm_params['update_type'] = args.update_type algorithm_params['delta'] = args.delta algorithm_params['store_prob'] = args.store_prob if args.clip_target: algorithm_params['max_spread'] = args.q_max - args.q_min approximator_params['q_min'] = args.q_min approximator_params['q_max'] = args.q_max approximator_params['loss'] = args.loss approximator_params['init_type'] = args.init_type approximator_params['sigma_weight'] = args.sigma_weight if args.alg in ['particle', 'boot']: approximator_params['n_approximators'] = args.n_approximators algorithm_params['n_approximators'] = args.n_approximators agent = agent_algorithm(approximator, pi, mdp.info, approximator_params=approximator_params, **algorithm_params) # Algorithm core_test = Core(agent, mdp) # Evaluate model pi.set_eval(True) dataset = core_test.evaluate(n_steps=args.test_samples, render=args.render, quiet=args.quiet) get_stats(dataset) else: # DQN learning run print("Learning Run") # Settings if args.debug: initial_replay_size = 50 max_replay_size = 500 train_frequency = 5 target_update_frequency = 10 test_samples = 20 evaluation_frequency = 50 max_steps = 1000 else: initial_replay_size = args.initial_replay_size max_replay_size = args.max_replay_size train_frequency = args.train_frequency target_update_frequency = args.target_update_frequency test_samples = args.test_samples evaluation_frequency = args.evaluation_frequency max_steps = args.max_steps # MDP if args.name not in ['Taxi', 'Gridworld']: mdp = Gym(args.name, args.horizon, args.gamma) n_states = None gamma_eval = 1. elif args.name == 'Taxi': mdp = generate_taxi('../../grid.txt') n_states = mdp.info.observation_space.size[0] gamma_eval = mdp.info.gamma else: rew_weights = [args.fast_zone, args.slow_zone, args.goal] grid_size = args.grid_size env = GridWorld(gamma=args.gamma, rew_weights=rew_weights, shape=(grid_size, grid_size), randomized_initial=args.rand_initial, horizon=args.horizon) mdp = env.generate_mdp() n_states = mdp.info.observation_space.size[0] print(mdp.info.gamma) gamma_eval = args.gamma # Policy epsilon = LinearDecayParameter(value=args.initial_exploration_rate, min_value=args.final_exploration_rate, n=args.final_exploration_frame) epsilon_test = Parameter(value=args.test_exploration_rate) epsilon_random = Parameter(value=1.) policy_name = 'weighted' update_rule = args.update_type + "_update" if args.alg == 'boot': pi = BootPolicy(args.n_approximators, epsilon=epsilon) policy_name = 'boot' update_rule = 'boot' elif args.alg == 'dqn': pi = EpsGreedy(epsilon=epsilon) policy_name = 'eps_greedy' update_rule = 'td' elif args.alg == 'particle': if args.ucb: policy_name = 'ucb' pi = UCBPolicy(delta=args.delta, q_max=1. / (1. - args.gamma)) else: pi = WeightedPolicy(args.n_approximators) elif args.alg == 'gaussian': if args.ucb: policy_name = 'ucb' pi = UCBPolicy(delta=args.delta, q_max=1. / (1. 
- args.gamma)) else: pi = WeightedGaussianPolicy() else: raise ValueError("Algorithm unknown") # Summary folder folder_name = './logs/' + args.alg + "/" + policy_name + '/' + update_rule + '/' + args.name + "/" + args.loss + "/" + str( args.n_approximators ) + "_particles" + "/" + args.init_type + "_init" + "/" + str( args.learning_rate) + "/" + ts # Approximator input_shape = mdp.info.observation_space.shape input_preprocessor = list() approximator_params = dict(input_shape=input_shape, output_shape=(mdp.info.action_space.n, ), n_states=n_states, n_actions=mdp.info.action_space.n, n_features=args.n_features, n_approximators=args.n_approximators, input_preprocessor=input_preprocessor, folder_name=folder_name, net_type=args.net_type, sigma_weight=args.sigma_weight, optimizer={ 'name': args.optimizer, 'lr': args.learning_rate, 'lr_sigma': args.learning_rate, 'decay': args.decay, 'epsilon': args.epsilon }) if args.load_path: ts = os.path.basename(os.path.normpath(args.load_path)) approximator_params['load_path'] = args.load_path approximator_params['folder_name'] = args.load_path folder_name = args.load_path p = "scores_" + str(ts) + ".npy" scores = np.load(p).tolist() max_steps = max_steps - evaluation_frequency * len(scores) approximator = SimpleNet # Agent algorithm_params = dict( batch_size=args.batch_size, initial_replay_size=initial_replay_size, max_replay_size=max_replay_size, clip_reward=False, target_update_frequency=target_update_frequency // train_frequency, ) if args.alg == 'boot': algorithm_params['p_mask'] = args.p_mask elif args.alg in ['particle', 'gaussian']: algorithm_params['update_type'] = args.update_type algorithm_params['delta'] = args.delta algorithm_params['store_prob'] = args.store_prob if args.clip_target: algorithm_params['max_spread'] = args.q_max - args.q_min approximator_params['q_min'] = args.q_min approximator_params['q_max'] = args.q_max approximator_params['loss'] = args.loss approximator_params['init_type'] = args.init_type if args.alg in ['boot', 'particle']: approximator_params['n_approximators'] = args.n_approximators algorithm_params['n_approximators'] = args.n_approximators agent = agent_algorithm(approximator, pi, mdp.info, approximator_params=approximator_params, **algorithm_params) if args.ucb: q = agent.approximator if args.alg == 'particle': def mu(state): q_list = q.predict(state).squeeze() qs = np.array(q_list) return qs.mean(axis=0) quantiles = [ i * 1. 
/ (args.n_approximators - 1) for i in range(args.n_approximators) ] for p in range(args.n_approximators): if quantiles[p] >= 1 - args.delta: delta_index = p break def quantile_func(state): q_list = q.predict(state).squeeze() qs = np.sort(np.array(q_list), axis=0) return qs[delta_index, :] print("Setting up ucb policy") pi.set_mu(mu) pi.set_quantile_func(quantile_func) if args.alg == 'gaussian': standard_bound = norm.ppf(1 - args.delta, loc=0, scale=1) def mu(state): q_and_sigma = q.predict(state).squeeze() means = q_and_sigma[0] return means def quantile_func(state): q_and_sigma = q.predict(state).squeeze() means = q_and_sigma[0] sigmas = q_and_sigma[1] return sigmas * standard_bound + means print("Setting up ucb policy") pi.set_mu(mu) pi.set_quantile_func(quantile_func) args.count = 100 if args.plot_qs: import matplotlib.pyplot as plt colors = ['red', 'blue', 'green'] labels = ['left', 'nop', 'right'] def plot_probs(qs): args.count += 1 if args.count < 1: return ax.clear() for i in range(qs.shape[-1]): mu = np.mean(qs[..., i], axis=0) sigma = np.std(qs[..., i], axis=0) x = np.linspace(mu - 3 * sigma, mu + 3 * sigma, 20) ax.plot(x, stats.norm.pdf(x, mu, sigma), label=labels[i], color=colors[i]) ax.set_xlabel('Q-value') ax.set_ylabel('Probability') ax.set_title('Q-distributions') #ax.set_ylim(bottom=0, top=1) plt.draw() plt.pause(0.02) #print("Plotted") args.count = 0 #return probs plt.ion() fig, ax = plt.subplots() plot_probs( np.array(agent.approximator.predict(np.array(mdp.reset())))) input() args.count = 100 qs = np.array([ np.linspace(-1000, 0, 10), np.linspace(-2000, -1000, 10), np.linspace(-750, -250, 10) ]) plot_probs(qs.T) # Algorithm core = Core(agent, mdp) core_test = Core(agent, mdp) # RUN # Fill replay memory with random dataset print_epoch(0) core.learn( n_steps=initial_replay_size, n_steps_per_fit=initial_replay_size, quiet=args.quiet, ) if args.save: agent.approximator.model.save() # Evaluate initial policy if hasattr(pi, 'set_eval'): pi.set_eval(True) pi.set_epsilon(epsilon_test) dataset = core_test.evaluate(n_steps=test_samples, render=args.render, quiet=args.quiet) scores.append(get_stats(dataset)) if args.plot_qs: pi.set_plotter(plot_probs) np.save(folder_name + '/scores_' + str(ts) + '.npy', scores) for n_epoch in range(1, max_steps // evaluation_frequency + 1): print_epoch(n_epoch) print('- Learning:') # learning step if hasattr(pi, 'set_eval'): pi.set_eval(False) pi.set_epsilon(epsilon) # learning step if args.plot_qs: pi.set_plotter(None) core.learn( n_steps=evaluation_frequency, n_steps_per_fit=train_frequency, quiet=args.quiet, ) if args.save: agent.approximator.model.save() print('- Evaluation:') # evaluation step if hasattr(pi, 'set_eval'): pi.set_eval(True) pi.set_epsilon(epsilon_test) if args.plot_qs: pi.set_plotter(plot_probs) dataset = core_test.evaluate(n_steps=test_samples, render=args.render, quiet=args.quiet) scores.append(get_stats(dataset)) np.save(folder_name + '/scores_' + str(ts) + '.npy', scores) return scores
def experiment():
    np.random.seed()
    #tf.set_random_seed()

    # Argument parser
    parser = argparse.ArgumentParser()

    arg_game = parser.add_argument_group('Game')
    arg_game.add_argument("--name", type=str, default='BreakoutNoFrameskip-v4',
                          help='Gym ID of the Atari game.')
    arg_game.add_argument("--screen-width", type=int, default=84,
                          help='Width of the game screen.')
    arg_game.add_argument("--screen-height", type=int, default=84,
                          help='Height of the game screen.')

    arg_mem = parser.add_argument_group('Replay Memory')
    arg_mem.add_argument("--initial-replay-size", type=int, default=50000,
                         help='Initial size of the replay memory.')
    arg_mem.add_argument("--max-replay-size", type=int, default=1000000,
                         help='Max size of the replay memory.')

    arg_net = parser.add_argument_group('Deep Q-Network')
    arg_net.add_argument("--optimizer",
                         choices=['adadelta', 'adam', 'rmsprop',
                                  'rmspropcentered'],
                         default='rmsprop',
                         help='Name of the optimizer to use to learn.')
    arg_net.add_argument("--learning-rate", type=float, default=.00025,
                         help='Learning rate value of the optimizer. Only used'
                              ' in rmspropcentered.')
    arg_net.add_argument("--lr-sigma", type=float, default=.1e-6,
                         help='Learning rate value of the optimizer for sigma.'
                              ' Only used in GaussianDQN.')
    arg_net.add_argument("--decay", type=float, default=.95)
    arg_net.add_argument("--epsilon", type=float, default=1e-8)

    arg_alg = parser.add_argument_group('Algorithm')
    arg_alg.add_argument("--alg",
                         choices=['boot', 'particle', 'gaussian', 'dqn'],
                         default='particle',
                         help='Algorithm to use.')
    arg_alg.add_argument("--weighted", action='store_true')
    arg_alg.add_argument("--ucb", action='store_true')
    arg_alg.add_argument("--boot", action='store_true',
                         help='Flag to use BootstrappedDQN.')
    arg_alg.add_argument("--gaussian", action='store_true',
                         help='Flag to use GaussianDQN.')
    arg_alg.add_argument("--double", action='store_true',
                         help='Flag to use the DoubleDQN version of the'
                              ' algorithm.')
    arg_alg.add_argument("--update-type",
                         choices=['mean', 'weighted', 'optimistic'],
                         default='mean',
                         help='Kind of update to perform (only WQL'
                              ' algorithms).')
    arg_alg.add_argument("--multiple-nets", action='store_true',
                         help='Whether to use separate nets for every'
                              ' environment.')
    arg_alg.add_argument("--n-approximators", type=int, default=10,
                         help='Number of approximators used in the ensemble'
                              ' for Averaged DQN.')
    arg_alg.add_argument("--loss",
                         choices=['squared_loss', 'huber_loss', 'triple_loss'],
                         default='huber_loss',
                         help='Loss function used in the approximator.')
    arg_alg.add_argument("--delta", type=float, default=0.1,
                         help='Parameter of the UCB policy.')
    arg_alg.add_argument("--q-max", type=float, default=100,
                         help='Upper bound for initializing the heads of the'
                              ' network.')
    arg_alg.add_argument("--q-min", type=float, default=0,
                         help='Lower bound for initializing the heads of the'
                              ' network.')
    arg_alg.add_argument("--sigma-weight", type=float, default=1.0,
                         help='Used in Gaussian learning to explore more.')
    arg_alg.add_argument("--init-type", choices=['boot', 'linspace'],
                         default='linspace',
                         help='Type of initialization for the network.')
    arg_alg.add_argument("--batch-size", type=int, default=32,
                         help='Batch size for each fit of the network.')
    arg_alg.add_argument("--history-length", type=int, default=4,
                         help='Number of frames composing a state.')
    arg_alg.add_argument("--target-update-frequency", type=int, default=10000,
                         help='Number of learning steps before each update of'
                              ' the target network.')
    arg_alg.add_argument("--evaluation-frequency", type=int, default=250000,
                         help='Number of learning steps before each'
                              ' evaluation. This number represents an epoch.')
    arg_alg.add_argument("--train-frequency", type=int, default=4,
                         help='Number of learning steps before each fit of'
                              ' the neural network.')
    arg_alg.add_argument("--max-steps", type=int, default=50000000,
                         help='Total number of learning steps.')
    arg_alg.add_argument("--final-exploration-frame", type=int,
                         default=1000000,
                         help='Number of steps until the exploration rate'
                              ' stops decreasing.')
    arg_alg.add_argument("--initial-exploration-rate", type=float, default=1.,
                         help='Initial value of the exploration rate.')
    arg_alg.add_argument("--final-exploration-rate", type=float, default=0.05,
                         help='Final value of the exploration rate. When it'
                              ' reaches this value, it stays constant.')
    arg_alg.add_argument("--test-exploration-rate", type=float, default=0.05,
                         help='Exploration rate used during evaluation.')
    arg_alg.add_argument("--test-samples", type=int, default=125000,
                         help='Number of steps for each evaluation.')
    arg_alg.add_argument("--max-no-op-actions", type=int, default=30,
                         help='Maximum number of no-op actions performed at'
                              ' the beginning of the episodes. The minimum'
                              ' number is history_length.')
    arg_alg.add_argument("--p-mask", type=float, default=1.)

    arg_utils = parser.add_argument_group('Utils')
    arg_utils.add_argument('--load-path', type=str,
                           help='Path of the model to be loaded.')
    arg_utils.add_argument('--save', action='store_true',
                           help='Flag specifying whether to save the model.')
    arg_utils.add_argument('--evaluation', action='store_true',
                           help='Flag specifying whether the model loaded will'
                                ' be evaluated.')
    arg_utils.add_argument('--render', action='store_true',
                           help='Flag specifying whether to render the game.')
    arg_utils.add_argument('--quiet', action='store_true',
                           help='Flag specifying whether to hide the progress'
                                ' bar.')
    arg_utils.add_argument('--debug', action='store_true',
                           help='Flag specifying whether the script has to be'
                                ' run in debug mode.')
    arg_utils.add_argument("--device", type=int, default=0,
                           help='Index of the GPU.')

    args = parser.parse_args()

    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # see issue #152
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.device)

    from particle_dqn import ParticleDQN, ParticleDoubleDQN
    from bootstrapped_dqn import BootstrappedDoubleDQN, BootstrappedDQN
    from gaussian_dqn import GaussianDQN
    from dqn import DoubleDQN, DQN
    from mushroom.core.core import Core
    from mushroom.environments import Atari
    from mushroom.utils.dataset import compute_scores
    from mushroom.utils.parameters import LinearDecayParameter, Parameter
    from policy import BootPolicy, WeightedPolicy, WeightedGaussianPolicy, \
        EpsGreedy, UCBPolicy

    if args.alg == 'boot':
        from boot_net import ConvNet
        if args.double:
            agent_algorithm = BootstrappedDoubleDQN
        else:
            agent_algorithm = BootstrappedDQN
    elif args.alg == 'gaussian':
        from gaussian_net import GaussianNet as ConvNet
        agent_algorithm = GaussianDQN
    elif args.alg == 'dqn':
        from dqn_net import ConvNet
        if args.double:
            agent_algorithm = DoubleDQN
        else:
            agent_algorithm = DQN
    else:
        if args.multiple_nets:
            from net_multiple import ConvNet
            print("Using Multiple Nets")
        else:
            from net import ConvNet
        if args.double:
            agent_algorithm = ParticleDoubleDQN
        else:
            agent_algorithm = ParticleDQN

    def get_stats(dataset):
        score = compute_scores(dataset)
        print('min_reward: %f, max_reward: %f, mean_reward: %f,'
              ' games_completed: %d' % score)

        return score

    scores = list()

    # Add a timestamp to the results
    ts = str(time.time())

    # Evaluation of the model provided by the user.
    if args.load_path and args.evaluation:
        # MDP
        mdp = Atari(args.name, args.screen_width, args.screen_height,
                    ends_at_life=False,
                    history_length=args.history_length,
                    max_no_op_actions=args.max_no_op_actions)

        print("Evaluation Run")

        # Policy
        epsilon_test = Parameter(value=args.test_exploration_rate)

        # Approximator
        input_shape = (args.screen_height, args.screen_width,
                       args.history_length)
        approximator_params = dict(
            input_shape=input_shape,
            output_shape=(mdp.info.action_space.n,),
            n_actions=mdp.info.action_space.n,
            name='test',
            load_path=args.load_path,
            optimizer={'name': args.optimizer,
                       'lr': args.learning_rate,
                       'lr_sigma': args.lr_sigma,
                       'decay': args.decay,
                       'epsilon': args.epsilon})

        approximator = ConvNet

        # Agent
        algorithm_params = dict(
            batch_size=args.batch_size,
            initial_replay_size=1,
            max_replay_size=1,
            clip_reward=True,
            target_update_frequency=args.target_update_frequency //
            args.train_frequency)

        if args.alg == 'boot':
            algorithm_params['p_mask'] = args.p_mask
            pi = BootPolicy(args.n_approximators, epsilon=epsilon_test)
        elif args.alg == 'gaussian':
            if args.ucb:
                pi = UCBPolicy(delta=args.delta)
            else:
                pi = WeightedGaussianPolicy(epsilon=epsilon_test)
        elif args.alg == 'dqn':
            pi = EpsGreedy(epsilon=epsilon_test)
        elif args.alg == 'particle':
            # As in the learning run below: UCBPolicy takes delta,
            # WeightedPolicy the number of approximators.
            if args.ucb:
                pi = UCBPolicy(delta=args.delta)
            else:
                pi = WeightedPolicy(args.n_approximators)
        else:
            raise ValueError("Algorithm unknown")

        if args.alg in ['gaussian', 'particle']:
            algorithm_params['update_type'] = args.update_type
            algorithm_params['delta'] = args.delta
            approximator_params['q_min'] = args.q_min
            approximator_params['q_max'] = args.q_max
            approximator_params['loss'] = args.loss
            approximator_params['init_type'] = args.init_type
            approximator_params['sigma_weight'] = args.sigma_weight
        if args.alg in ['particle', 'boot']:
            approximator_params['n_approximators'] = args.n_approximators
            algorithm_params['n_approximators'] = args.n_approximators

        agent = agent_algorithm(approximator, pi, mdp.info,
                                approximator_params=approximator_params,
                                **algorithm_params)
        #print(agent)

        # Algorithm
        core_test = Core(agent, mdp)

        # Evaluate model
        pi.set_eval(True)
        dataset = core_test.evaluate(n_steps=args.test_samples,
                                     render=args.render,
                                     quiet=args.quiet)
        get_stats(dataset)
    else:
        # DQN learning run
        print("Learning Run")

        # Settings
        if args.debug:
            initial_replay_size = 50
            max_replay_size = 500
            train_frequency = 5
            target_update_frequency = 10
            test_samples = 20
            evaluation_frequency = 50
            max_steps = 1000
        else:
            initial_replay_size = args.initial_replay_size
            max_replay_size = args.max_replay_size
            train_frequency = args.train_frequency
            target_update_frequency = args.target_update_frequency
            test_samples = args.test_samples
            evaluation_frequency = args.evaluation_frequency
            max_steps = args.max_steps

        # MDP
        mdp = Atari(args.name, args.screen_width, args.screen_height,
                    ends_at_life=True)

        # Policy
        epsilon = LinearDecayParameter(value=args.initial_exploration_rate,
                                       min_value=args.final_exploration_rate,
                                       n=args.final_exploration_frame)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        epsilon_random = Parameter(value=1.)
        policy_name = 'weighted'
        update_rule = args.update_type + "_update"
        if args.alg == 'boot':
            pi = BootPolicy(args.n_approximators, epsilon=epsilon)
            policy_name = 'boot'
            update_rule = 'boot'
        elif args.alg == 'dqn':
            pi = EpsGreedy(epsilon=epsilon)
            policy_name = 'eps_greedy'
            update_rule = 'td'
        elif args.alg == 'particle':
            if args.ucb:
                pi = UCBPolicy(delta=args.delta)
            else:
                pi = WeightedPolicy(args.n_approximators)
        elif args.alg == 'gaussian':
            if args.ucb:
                pi = UCBPolicy(delta=args.delta)
            else:
                pi = WeightedGaussianPolicy()
        else:
            raise ValueError("Algorithm unknown")

        # Summary folder
        folder_name = './logs/' + args.alg + "/" + policy_name + '/' + \
            update_rule + '/' + args.name + "/" + args.loss + "/" + \
            str(args.n_approximators) + "_particles" + "/" + \
            args.init_type + "_init" + "/" + str(args.learning_rate) + \
            "/" + ts

        # Approximator
        input_shape = (args.screen_height, args.screen_width,
                       args.history_length)
        approximator_params = dict(
            input_shape=input_shape,
            output_shape=(mdp.info.action_space.n,),
            n_actions=mdp.info.action_space.n,
            folder_name=folder_name,
            sigma_weight=args.sigma_weight,
            optimizer={'name': args.optimizer,
                       'lr': args.learning_rate,
                       'decay': args.decay,
                       'epsilon': args.epsilon,
                       'lr_sigma': args.lr_sigma})

        if args.load_path:
            ts = os.path.basename(os.path.normpath(args.load_path))
            approximator_params['load_path'] = args.load_path
            approximator_params['folder_name'] = args.load_path
            folder_name = args.load_path
            p = "scores.npy"
            scores = np.load(p).tolist()
            max_steps = max_steps - evaluation_frequency * len(scores)

        approximator = ConvNet

        # Agent
        algorithm_params = dict(
            batch_size=args.batch_size,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size,
            clip_reward=True,
            target_update_frequency=target_update_frequency //
            args.train_frequency)

        if args.alg == 'boot':
            algorithm_params['p_mask'] = args.p_mask
        elif args.alg in ['particle', 'gaussian']:
            algorithm_params['update_type'] = args.update_type
            algorithm_params['delta'] = args.delta
            approximator_params['q_min'] = args.q_min
            approximator_params['q_max'] = args.q_max
            approximator_params['loss'] = args.loss
            approximator_params['init_type'] = args.init_type

        if args.alg in ['boot', 'particle']:
            approximator_params['n_approximators'] = args.n_approximators
            algorithm_params['n_approximators'] = args.n_approximators

        agent = agent_algorithm(approximator, pi, mdp.info,
                                approximator_params=approximator_params,
                                **algorithm_params)

        if args.ucb:
            q = agent.approximator
            if args.alg == 'particle':
                def mu(state):
                    q_list = q.predict(state).squeeze()
                    qs = np.array(q_list)
                    return qs.mean(axis=0)

                quantiles = [i * 1. / (args.n_approximators - 1)
                             for i in range(args.n_approximators)]
                for p in range(args.n_approximators):
                    if quantiles[p] >= 1 - args.delta:
                        delta_index = p
                        break

                def quantile_func(state):
                    q_list = q.predict(state).squeeze()
                    qs = np.array(q_list)
                    return qs[delta_index, :]

                pi.set_mu(mu)
                pi.set_quantile_func(quantile_func)

            if args.alg == 'gaussian':
                raise ValueError("Not implemented")

        # Algorithm
        core = Core(agent, mdp)

        # RUN

        # Fill replay memory with random dataset
        print_epoch(0)
        core.learn(n_steps=initial_replay_size,
                   n_steps_per_fit=initial_replay_size,
                   quiet=args.quiet)

        if args.save:
            agent.approximator.model.save()

        # Evaluate initial policy
        if hasattr(pi, 'set_eval'):
            pi.set_eval(True)
        pi.set_epsilon(epsilon_test)
        mdp.set_episode_end(False)
        dataset = core.evaluate(n_steps=test_samples,
                                render=args.render,
                                quiet=args.quiet)
        scores.append(get_stats(dataset))
        np.save(folder_name + '/scores.npy', scores)

        for n_epoch in range(1, max_steps // evaluation_frequency + 1):
            print_epoch(n_epoch)
            print('- Learning:')
            # Learning step
            if hasattr(pi, 'set_eval'):
                pi.set_eval(False)
            pi.set_epsilon(epsilon)
            mdp.set_episode_end(True)
            core.learn(n_steps=evaluation_frequency,
                       n_steps_per_fit=train_frequency,
                       quiet=args.quiet)

            if args.save:
                agent.approximator.model.save()

            print('- Evaluation:')
            # Evaluation step
            if hasattr(pi, 'set_eval'):
                pi.set_eval(True)
            pi.set_epsilon(epsilon_test)
            mdp.set_episode_end(False)
            dataset = core.evaluate(n_steps=test_samples,
                                    render=args.render,
                                    quiet=args.quiet)
            scores.append(get_stats(dataset))
            np.save(folder_name + '/scores.npy', scores)

    return scores
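The listing stops at the end of experiment(); a minimal entry point for launching it from the command line could look as follows. This is a hypothetical addition, not shown in the original script.

# Hypothetical entry point; the original listing does not show how
# experiment() is invoked.
if __name__ == '__main__':
    experiment()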