Example #1
def experiment(algorithm_class, exp):
    np.random.seed()

    # MDP
    mdp = GridWorldVanHasselt()

    # Policy
    epsilon = ExponentialParameter(value=1,
                                   exp=.5,
                                   size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialParameter(value=1, exp=exp, size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = algorithm_class(mdp.info, pi, **algorithm_params)

    # Algorithm
    start = mdp.convert_to_int(mdp._start, mdp._width)
    collect_max_Q = CollectMaxQ(agent.Q, start)
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset, collect_max_Q]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get()

    return reward, max_Qs
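The helper above returns the per-step rewards and the max-Q trace for the start state, so a small driver can average them over independent runs. A minimal sketch, assuming QLearning is imported from mushroom_rl.algorithms.value and that .51 is just an illustrative exponent:

import numpy as np
from mushroom_rl.algorithms.value import QLearning

if __name__ == '__main__':
    n_runs = 10
    # Run the experiment several times and stack the collected traces.
    rewards, max_Qs = zip(*[experiment(QLearning, .51) for _ in range(n_runs)])
    print(np.mean(np.array(max_Qs), axis=0))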
Example #2
def learn(alg, alg_params):
    mdp = CarOnHill()
    np.random.seed(1)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                               n_actions=mdp.info.action_space.n,
                               n_estimators=50,
                               min_samples_split=5,
                               min_samples_leaf=2)
    approximator = ExtraTreesRegressor

    # Agent
    agent = alg(mdp.info,
                pi,
                approximator,
                approximator_params=approximator_params,
                **alg_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=5, n_episodes_per_fit=5)

    test_epsilon = Parameter(0.75)
    agent.policy.set_epsilon(test_epsilon)
    dataset = core.evaluate(n_episodes=2)

    return agent, np.mean(compute_J(dataset, mdp.info.gamma))
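As a usage sketch (not part of the original snippet), the helper can be driven with FQI and its n_iterations parameter, mirroring the FQI setup that appears later in this collection:

from mushroom_rl.algorithms.value import FQI

# Illustrative call: the algorithm choice and its parameters are assumptions.
agent, J = learn(FQI, dict(n_iterations=5))
print('Mean discounted return:', J)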
Example #3
    def default(cls,
                lr=.0001,
                network=DQNNetwork,
                initial_replay_size=50000,
                max_replay_size=1000000,
                batch_size=32,
                target_update_frequency=2500,
                n_steps_per_fit=1,
                use_cuda=False):
        policy = EpsGreedy(epsilon=Parameter(value=1.))

        approximator_params = dict(network=network,
                                   optimizer={
                                       'class': optim.Adam,
                                       'params': {
                                           'lr': lr
                                       }
                                   },
                                   loss=F.smooth_l1_loss,
                                   use_cuda=use_cuda)

        alg_params = dict(initial_replay_size=initial_replay_size,
                          max_replay_size=max_replay_size,
                          batch_size=batch_size,
                          target_update_frequency=target_update_frequency)

        return cls(policy, TorchApproximator, approximator_params, alg_params,
                   n_steps_per_fit)
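This fragment is presumably a @classmethod of an agent-builder class (the decorator and the class itself lie outside the excerpt). A hedged usage sketch, where DQNBuilder is a hypothetical name standing in for that class:

# DQNBuilder is a placeholder; only the keyword arguments come from the
# signature above.
builder = DQNBuilder.default(lr=5e-5,
                             initial_replay_size=1000,
                             max_replay_size=100000,
                             use_cuda=False)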
Example #4
def test_collect_Q():
    np.random.seed(88)
    mdp = GridWorld(3, 3, (2, 2))

    eps = Parameter(0.1)
    pi = EpsGreedy(eps)
    alpha = Parameter(0.1)
    agent = SARSA(mdp.info, pi, alpha)

    callback_q = CollectQ(agent.Q)
    callback_max_q = CollectMaxQ(agent.Q, np.array([2]))

    core = Core(agent, mdp, callbacks=[callback_q, callback_max_q])

    core.learn(n_steps=1000, n_steps_per_fit=1, quiet=True)

    V_test = np.array([2.4477574, 0.02246188, 1.6210059, 6.01867052])
    V = callback_q.get()[-1]

    assert np.allclose(V[0, :], V_test)

    V_max = np.array([np.max(x[2, :], axis=-1) for x in callback_q.get()])
    max_q = np.array(callback_max_q.get())

    assert np.allclose(V_max, max_q)
Example #5
def experiment(algorithm_class, exp):
    np.random.seed()

    # MDP
    p = np.load('chain_structure/p.npy')
    rew = np.load('chain_structure/rew.npy')
    mdp = FiniteMDP(p, rew, gamma=.9)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialParameter(value=1., exp=exp, size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = algorithm_class(mdp.info, pi, **algorithm_params)

    # Algorithm
    collect_Q = CollectQ(agent.approximator)
    callbacks = [collect_Q]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=20000, n_steps_per_fit=1, quiet=True)

    Qs = collect_Q.get()

    return Qs
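The chain_structure/*.npy files are not included here. A FiniteMDP can also be built from hand-made arrays; a minimal sketch of a two-state, two-action chain, assuming p[s, a, s'] holds transition probabilities and rew[s, a, s'] the corresponding rewards:

import numpy as np
from mushroom_rl.environments import FiniteMDP

# Toy transition model: action 0 stays, action 1 moves to the other state.
p = np.array([[[1., 0.], [0., 1.]],
              [[0., 1.], [1., 0.]]])
rew = np.zeros((2, 2, 2))
rew[:, :, 1] = 1.  # reward only for landing in state 1
toy_mdp = FiniteMDP(p, rew, gamma=.9)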
Example #6
def main(argv):
    #env = retro.make(game='MegaMan-Nes', obs_type=retro.Observations.RAM)
    env = RetroEnvironment('MegaMan-Nes', 5000, 0.9, obs_shape=256**3,
                           obs_type=retro.Observations.RAM,
                           use_restricted_actions=retro.Actions.DISCRETE)

    epsilon = Parameter(value=1.)
    learning_rate = Parameter(value=0.3)

    policy = EpsGreedy(epsilon=epsilon)
    agent = SARSA(env.info, policy, learning_rate)
    #agent = CustomSARSA(env.info, policy, learning_rate)

    core = Core(agent, env)
    core.learn(n_steps=10000, n_steps_per_fit=1)
    core.evaluate(n_episodes=10, render=True)
    # print(agent.Q.shape, file=sys.stderr)
    # shape = agent.Q.shape
    # q = np.zeros(shape)
    # for i in range(shape[0]):
    #     for j in range(shape[1]):
    #         state = np.array([i])
    #         action = np.array([j])
    #         q[i, j] = agent.Q.predict(state, action)
    # print(q)

    return 0
Example #7
def main(argv):
    #env = retro.make(game='MegaMan-Nes', obs_type=retro.Observations.RAM)
    SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
    retro.data.Integrations.add_custom_path(
        os.path.join(SCRIPT_DIR, "integrations"))
    print("megamanmain" in retro.data.list_games(
        inttype=retro.data.Integrations.ALL))
    env = RetroEnvironment('megamanmain', 5000, 0.9, obs_shape=256**3,
                           obs_type=retro.Observations.RAM,
                           use_restricted_actions=retro.Actions.DISCRETE,
                           inttype=retro.data.Integrations.ALL)

    epsilon = Parameter(value=1.)
    learning_rate = Parameter(value=0.3)

    policy = EpsGreedy(epsilon=epsilon)

    agent = QLearning(env.info, policy, learning_rate)
    #agent = CustomSARSA(env.info, policy, learning_rate)

    core = Core(agent, env)
    core.learn(n_episodes=50, n_steps_per_fit=1)
    core.evaluate(n_episodes=2, render=True)

    # print(agent.Q.shape, file=sys.stderr)
    # shape = agent.Q.shape
    # q = np.zeros(shape)
    # for i in range(shape[0]):
    # 	for j in range(shape[1]):
    # 		state = np.array([i])
    # 		action = np.array([j])
    # 		q[i, j] = agent.Q.predict(state, action)
    # print(q)

    return 0
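Both Retro scripts define main(argv) but the excerpts stop before the entry point; a conventional one (with the sys import assumed) would be:

import sys

if __name__ == '__main__':
    sys.exit(main(sys.argv))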
Example #8
def experiment():
    np.random.seed()

    # MDP
    mdp = generate_simple_chain(state_n=5, goal_states=[2], prob=.8, rew=1,
                                gamma=.9)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = QLearning(mdp.info, pi, **algorithm_params)

    # Core
    core = Core(agent, mdp)

    # Initial policy Evaluation
    dataset = core.evaluate(n_steps=1000)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    print('J start:', J)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1)

    # Final Policy Evaluation
    dataset = core.evaluate(n_steps=1000)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    print('J final:', J)
Example #9
def learn_lspi():
    np.random.seed(1)

    # MDP
    mdp = CartPole()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    basis = [PolynomialBasis()]
    features = Features(basis_list=basis)

    fit_params = dict()
    approximator_params = dict(input_shape=(features.size, ),
                               output_shape=(mdp.info.action_space.n, ),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(mdp.info,
                 pi,
                 approximator_params=approximator_params,
                 fit_params=fit_params,
                 features=features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=10, n_episodes_per_fit=10)

    return agent
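A short usage sketch for the helper above: the fitted LSPI agent exposes its linear weights through the approximator, which is exactly what the test in the next example checks.

# Not part of the original snippet: run the helper and inspect the weights.
agent = learn_lspi()
print(agent.approximator.get_weights())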
Example #10
def test_lspi():
    np.random.seed(1)

    # MDP
    mdp = CartPole()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    basis = [PolynomialBasis()]
    features = Features(basis_list=basis)

    fit_params = dict()
    approximator_params = dict(input_shape=(features.size, ),
                               output_shape=(mdp.info.action_space.n, ),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(mdp.info,
                 pi,
                 approximator_params=approximator_params,
                 fit_params=fit_params,
                 features=features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=10, n_episodes_per_fit=10)

    w = agent.approximator.get_weights()
    w_test = np.array([-1.00749128, -1.13444655, -0.96620322])

    assert np.allclose(w, w_test)
Example #11
def learn(alg, alg_params):
    # MDP
    mdp = CartPole()
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Policy
    epsilon_random = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon_random)

    # Approximator
    input_shape = mdp.info.observation_space.shape
    approximator_params = dict(
        network=Network if alg is not CategoricalDQN else FeatureNetwork,
        optimizer={
            'class': optim.Adam,
            'params': {
                'lr': .001
            }
        },
        loss=F.smooth_l1_loss,
        input_shape=input_shape,
        output_shape=mdp.info.action_space.size,
        n_actions=mdp.info.action_space.n,
        n_features=2,
        use_cuda=False)

    # Agent
    if alg not in [DuelingDQN, CategoricalDQN]:
        agent = alg(mdp.info,
                    pi,
                    TorchApproximator,
                    approximator_params=approximator_params,
                    **alg_params)
    elif alg is CategoricalDQN:
        agent = alg(mdp.info,
                    pi,
                    approximator_params=approximator_params,
                    n_atoms=2,
                    v_min=-1,
                    v_max=1,
                    **alg_params)
    else:
        agent = alg(mdp.info,
                    pi,
                    approximator_params=approximator_params,
                    **alg_params)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=500, n_steps_per_fit=5)

    return agent
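A possible call to the helper above, passing DQN together with the replay and target-network parameters it expects; the concrete numbers are placeholders, not values from the original code:

from mushroom_rl.algorithms.value import DQN

agent = learn(DQN,
              dict(batch_size=50,
                   initial_replay_size=50,
                   max_replay_size=500,
                   target_update_frequency=50))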
Example #12
def test_dataset_utils():
    np.random.seed(88)

    mdp = GridWorld(3, 3, (2, 2))
    epsilon = Parameter(value=0.)
    alpha = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    agent = SARSA(mdp.info, pi, alpha)
    core = Core(agent, mdp)

    dataset = core.evaluate(n_episodes=10)

    J = compute_J(dataset, mdp.info.gamma)
    J_test = np.array([
        1.16106307e-03, 2.78128389e-01, 1.66771817e+00, 3.09031544e-01,
        1.19725152e-01, 9.84770902e-01, 1.06111661e-02, 2.05891132e+00,
        2.28767925e+00, 4.23911583e-01
    ])
    assert np.allclose(J, J_test)

    L = episodes_length(dataset)
    L_test = np.array([87, 35, 18, 34, 43, 23, 66, 16, 15, 31])
    assert np.array_equal(L, L_test)

    dataset_ep = select_first_episodes(dataset, 3)
    J = compute_J(dataset_ep, mdp.info.gamma)
    assert np.allclose(J, J_test[:3])

    L = episodes_length(dataset_ep)
    assert np.allclose(L, L_test[:3])

    samples = select_random_samples(dataset, 2)
    s, a, r, ss, ab, last = parse_dataset(samples)
    s_test = np.array([[6.], [1.]])
    a_test = np.array([[0.], [1.]])
    r_test = np.zeros(2)
    ss_test = np.array([[3], [4]])
    ab_test = np.zeros(2)
    last_test = np.zeros(2)
    assert np.array_equal(s, s_test)
    assert np.array_equal(a, a_test)
    assert np.array_equal(r, r_test)
    assert np.array_equal(ss, ss_test)
    assert np.array_equal(ab, ab_test)
    assert np.array_equal(last, last_test)

    index = np.sum(L_test[:2]) + L_test[2] // 2
    min_J, max_J, mean_J, n_episodes = compute_metrics(dataset[:index],
                                                       mdp.info.gamma)
    assert min_J == 0.0011610630703530948
    assert max_J == 0.2781283894436937
    assert mean_J == 0.1396447262570234
    assert n_episodes == 2
Example #13
def experiment():
    np.random.seed()

    # MDP
    mdp = CarOnHill()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                               n_actions=mdp.info.action_space.n,
                               n_estimators=50,
                               min_samples_split=5,
                               min_samples_leaf=2)
    approximator = ExtraTreesRegressor

    # Agent
    algorithm_params = dict(n_iterations=20)
    agent = FQI(mdp.info,
                pi,
                approximator,
                approximator_params=approximator_params,
                **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Render
    core.evaluate(n_episodes=1, render=True)

    # Train
    core.learn(n_episodes=1000, n_episodes_per_fit=1000)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    initial_states = np.zeros((289, 2))
    cont = 0
    for i in range(-8, 9):
        for j in range(-8, 9):
            initial_states[cont, :] = [0.125 * i, 0.375 * j]
            cont += 1

    dataset = core.evaluate(initial_states=initial_states)

    # Render
    core.evaluate(n_episodes=3, render=True)

    return np.mean(compute_J(dataset, mdp.info.gamma))
Example #14
def main(argv):
    mdp = RetroFullEnvironment(
        'MegaMan-Nes',
        5000,
        0.9,
        #obs_type=retro.Observations.RAM,
        use_restricted_actions=retro.Actions.DISCRETE)

    epsilon = Parameter(value=1.)
    learning_rate = Parameter(value=0.3)

    train_frequency = 4
    evaluation_frequency = 250000
    target_update_frequency = 10000
    initial_replay_size = 50000
    #initial_replay_size = 500
    max_replay_size = 500000
    test_samples = 125000
    max_steps = 50000000

    policy = EpsGreedy(epsilon=epsilon)

    optimizer = {'class': Adam, 'params': dict(lr=0.00025)}

    approximator = KerasApproximator
    approximator_params = dict(network=model,
                               input_shape=mdp.info.observation_space.shape,
                               output_shape=(mdp.info.action_space.n, ),
                               n_actions=mdp.info.action_space.n,
                               n_features=2048,
                               optimizer=optimizer,
                               loss=mean_squared_error,
                               print_summary=True)

    algorithm_params = dict(batch_size=32,
                            target_update_frequency=target_update_frequency //
                            train_frequency,
                            replay_memory=None,
                            initial_replay_size=initial_replay_size,
                            max_replay_size=max_replay_size)
    agent = DQN(mdp.info,
                policy,
                approximator,
                approximator_params=approximator_params,
                **algorithm_params)

    core = Core(agent, mdp)
    #core.learn(n_steps=1000000, n_steps_per_fit=1)
    core.learn(n_steps=100000, n_steps_per_fit=1)
    core.evaluate(n_episodes=10, render=True)

    return 0
Example #15
def experiment():
    np.random.seed()

    # MDP
    mdp = CartPole()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    basis = [PolynomialBasis()]

    s1 = np.array([-np.pi, 0, np.pi]) * .25
    s2 = np.array([-1, 0, 1])
    for i in s1:
        for j in s2:
            basis.append(GaussianRBF(np.array([i, j]), np.array([1.])))
    features = Features(basis_list=basis)

    fit_params = dict()
    approximator_params = dict(input_shape=(features.size, ),
                               output_shape=(mdp.info.action_space.n, ),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(mdp.info,
                 pi,
                 approximator_params=approximator_params,
                 fit_params=fit_params,
                 features=features)

    # Algorithm
    core = Core(agent, mdp)
    core.evaluate(n_episodes=3, render=True)

    # Train
    core.learn(n_episodes=100, n_episodes_per_fit=100)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    dataset = core.evaluate(n_episodes=1, quiet=True)

    core.evaluate(n_steps=100, render=True)

    return np.mean(episodes_length(dataset))
Example #16
def experiment(alpha):
    np.random.seed()

    # MDP
    mdp = Gym(name='Acrobot-v1', horizon=np.inf, gamma=1.)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    n_tilings = 10
    tilings = Tiles.generate(n_tilings, [10, 10, 10, 10, 10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    learning_rate = Parameter(alpha / n_tilings)

    approximator_params = dict(input_shape=(features.size, ),
                               output_shape=(mdp.info.action_space.n, ),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate, 'lambda_coeff': .9}

    agent = TrueOnlineSARSALambda(mdp.info,
                                  pi,
                                  approximator_params=approximator_params,
                                  features=features,
                                  **algorithm_params)

    #shape = agent.approximator.Q
    #print(agent.Q)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=10, n_steps_per_fit=1, render=True)
    dataset = core.evaluate(n_episodes=1, render=False)
    #print(dataset)
    print(episodes_length(dataset))

    return np.mean(compute_J(dataset, .96))
Example #17
def experiment():
    np.random.seed()

    logger = Logger(QLearning.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + QLearning.__name__)

    # MDP
    mdp = generate_simple_chain(state_n=5,
                                goal_states=[2],
                                prob=.8,
                                rew=1,
                                gamma=.9)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = QLearning(mdp.info, pi, **algorithm_params)

    # Core
    core = Core(agent, mdp)

    # Initial policy Evaluation
    dataset = core.evaluate(n_steps=1000)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    logger.info(f'J start: {J}')

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1)

    # Final Policy Evaluation
    dataset = core.evaluate(n_steps=1000)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    logger.info(f'J final: {J}')
Example #18
def test_collect_dataset():
    np.random.seed(88)
    callback = CollectDataset()

    mdp = GridWorld(4, 4, (2, 2))

    eps = Parameter(0.1)
    pi = EpsGreedy(eps)
    alpha = Parameter(0.2)
    agent = SARSA(mdp.info, pi, alpha)

    core = Core(agent, mdp, callbacks=[callback])

    core.learn(n_steps=10, n_steps_per_fit=1, quiet=True)

    dataset = callback.get()
    assert len(dataset) == 10
    core.learn(n_steps=5, n_steps_per_fit=1, quiet=True)
    assert len(dataset) == 15

    callback.clean()
    dataset = callback.get()
    assert len(dataset) == 0
Example #19
    def default(cls,
                lr=.0001,
                network=DQNFeatureNetwork,
                initial_replay_size=50000,
                max_replay_size=1000000,
                batch_size=32,
                target_update_frequency=2500,
                n_features=512,
                n_steps_per_fit=1,
                v_min=-10,
                v_max=10,
                n_atoms=51,
                use_cuda=False):

        policy = EpsGreedy(epsilon=Parameter(value=1.))

        approximator_params = dict(network=network,
                                   n_features=n_features,
                                   optimizer={
                                       'class': optim.Adam,
                                       'params': {
                                           'lr': lr
                                       }
                                   },
                                   loss=CategoricalDQNBuilder.categorical_loss,
                                   use_cuda=use_cuda)

        alg_params = dict(initial_replay_size=initial_replay_size,
                          max_replay_size=max_replay_size,
                          batch_size=batch_size,
                          n_atoms=n_atoms,
                          v_min=v_min,
                          v_max=v_max,
                          target_update_frequency=target_update_frequency)

        return cls(policy, TorchApproximator, approximator_params, alg_params,
                   n_steps_per_fit)
Example #20
def test_collect_parameter():
    np.random.seed(88)
    mdp = GridWorld(3, 3, (2, 2))

    eps = ExponentialParameter(value=1,
                               exp=.5,
                               size=mdp.info.observation_space.size)
    pi = EpsGreedy(eps)
    alpha = Parameter(0.1)
    agent = SARSA(mdp.info, pi, alpha)

    callback_eps = CollectParameters(eps, 1)

    core = Core(agent, mdp, callbacks=[callback_eps])

    core.learn(n_steps=10, n_steps_per_fit=1, quiet=True)

    eps_test = np.array([
        1., 0.70710678, 0.70710678, 0.57735027, 0.57735027, 0.57735027,
        0.57735027, 0.57735027, 0.57735027, 0.57735027
    ])
    eps = callback_eps.get()

    assert np.allclose(eps, eps_test)
Example #21
def experiment():
    np.random.seed()

    # Argument parser
    parser = argparse.ArgumentParser()

    arg_game = parser.add_argument_group('Game')
    arg_game.add_argument("--name",
                          type=str,
                          default='BreakoutDeterministic-v4',
                          help='Gym ID of the Atari game.')
    arg_game.add_argument("--screen-width",
                          type=int,
                          default=84,
                          help='Width of the game screen.')
    arg_game.add_argument("--screen-height",
                          type=int,
                          default=84,
                          help='Height of the game screen.')

    arg_mem = parser.add_argument_group('Replay Memory')
    arg_mem.add_argument("--initial-replay-size",
                         type=int,
                         default=50000,
                         help='Initial size of the replay memory.')
    arg_mem.add_argument("--max-replay-size",
                         type=int,
                         default=500000,
                         help='Max size of the replay memory.')
    arg_mem.add_argument("--prioritized",
                         action='store_true',
                         help='Whether to use prioritized memory or not.')

    arg_net = parser.add_argument_group('Deep Q-Network')
    arg_net.add_argument(
        "--optimizer",
        choices=['adadelta', 'adam', 'rmsprop', 'rmspropcentered'],
        default='rmsprop',
        help='Name of the optimizer to use.')
    arg_net.add_argument("--learning-rate",
                         type=float,
                         default=.00025,
                         help='Learning rate value of the optimizer.')
    arg_net.add_argument("--decay",
                         type=float,
                         default=.95,
                         help='Discount factor for the history coming from the'
                         ' gradient momentum in rmspropcentered and'
                         ' rmsprop')
    arg_net.add_argument("--epsilon",
                         type=float,
                         default=1e-8,
                         help='Epsilon term used in rmspropcentered and'
                         ' rmsprop')

    arg_alg = parser.add_argument_group('Algorithm')
    arg_alg.add_argument("--algorithm",
                         choices=['dqn', 'ddqn', 'adqn', 'mmdqn', 'cdqn'],
                         default='dqn',
                         help='Name of the algorithm. dqn is for standard'
                         ' DQN, ddqn is for Double DQN and adqn is for'
                         ' Averaged DQN.')
    arg_alg.add_argument(
        "--n-approximators",
        type=int,
        default=1,
        help="Number of approximators used in the ensemble for"
        "AveragedDQN or MaxminDQN.")
    arg_alg.add_argument("--batch-size",
                         type=int,
                         default=32,
                         help='Batch size for each fit of the network.')
    arg_alg.add_argument("--history-length",
                         type=int,
                         default=4,
                         help='Number of frames composing a state.')
    arg_alg.add_argument("--target-update-frequency",
                         type=int,
                         default=10000,
                         help='Number of collected samples before each update'
                         ' of the target network.')
    arg_alg.add_argument("--evaluation-frequency",
                         type=int,
                         default=250000,
                         help='Number of collected samples before each'
                         ' evaluation. An epoch ends after this number of'
                         ' steps')
    arg_alg.add_argument("--train-frequency",
                         type=int,
                         default=4,
                         help='Number of collected samples before each fit of'
                         ' the neural network.')
    arg_alg.add_argument("--max-steps",
                         type=int,
                         default=50000000,
                         help='Total number of collected samples.')
    arg_alg.add_argument(
        "--final-exploration-frame",
        type=int,
        default=1000000,
        help='Number of collected samples until the exploration'
        ' rate stops decreasing.')
    arg_alg.add_argument("--initial-exploration-rate",
                         type=float,
                         default=1.,
                         help='Initial value of the exploration rate.')
    arg_alg.add_argument("--final-exploration-rate",
                         type=float,
                         default=.1,
                         help='Final value of the exploration rate. When it'
                         ' reaches this value, it stays constant.')
    arg_alg.add_argument("--test-exploration-rate",
                         type=float,
                         default=.05,
                         help='Exploration rate used during evaluation.')
    arg_alg.add_argument("--test-samples",
                         type=int,
                         default=125000,
                         help='Number of collected samples for each'
                         ' evaluation.')
    arg_alg.add_argument(
        "--max-no-op-actions",
        type=int,
        default=30,
        help='Maximum number of no-op actions performed at the'
        ' beginning of the episodes.')
    arg_alg.add_argument("--n-atoms",
                         type=int,
                         default=51,
                         help='Number of atoms for Categorical DQN.')
    arg_alg.add_argument("--v-min",
                         type=int,
                         default=-10,
                         help='Minimum action-value for Categorical DQN.')
    arg_alg.add_argument("--v-max",
                         type=int,
                         default=10,
                         help='Maximum action-value for Categorical DQN.')

    arg_utils = parser.add_argument_group('Utils')
    arg_utils.add_argument('--use-cuda',
                           action='store_true',
                           help='Flag specifying whether to use the GPU.')
    arg_utils.add_argument('--save',
                           action='store_true',
                           help='Flag specifying whether to save the model.')
    arg_utils.add_argument('--load-path',
                           type=str,
                           help='Path of the model to be loaded.')
    arg_utils.add_argument('--render',
                           action='store_true',
                           help='Flag specifying whether to render the game.')
    arg_utils.add_argument('--quiet',
                           action='store_true',
                           help='Flag specifying whether to hide the progress'
                           ' bar.')
    arg_utils.add_argument('--debug',
                           action='store_true',
                           help='Flag specifying whether the script has to be'
                           ' run in debug mode.')

    args = parser.parse_args()

    scores = list()

    optimizer = dict()
    if args.optimizer == 'adam':
        optimizer['class'] = optim.Adam
        optimizer['params'] = dict(lr=args.learning_rate, eps=args.epsilon)
    elif args.optimizer == 'adadelta':
        optimizer['class'] = optim.Adadelta
        optimizer['params'] = dict(lr=args.learning_rate, eps=args.epsilon)
    elif args.optimizer == 'rmsprop':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate,
                                   alpha=args.decay,
                                   eps=args.epsilon)
    elif args.optimizer == 'rmspropcentered':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate,
                                   alpha=args.decay,
                                   eps=args.epsilon,
                                   centered=True)
    else:
        raise ValueError

    # Summary folder
    folder_name = './logs/atari_' + args.algorithm + '_' + args.name +\
        '_' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    pathlib.Path(folder_name).mkdir(parents=True)

    # Settings
    if args.debug:
        initial_replay_size = 50
        max_replay_size = 500
        train_frequency = 5
        target_update_frequency = 10
        test_samples = 20
        evaluation_frequency = 50
        max_steps = 1000
    else:
        initial_replay_size = args.initial_replay_size
        max_replay_size = args.max_replay_size
        train_frequency = args.train_frequency
        target_update_frequency = args.target_update_frequency
        test_samples = args.test_samples
        evaluation_frequency = args.evaluation_frequency
        max_steps = args.max_steps

    # MDP
    mdp = Atari(args.name,
                args.screen_width,
                args.screen_height,
                ends_at_life=True,
                history_length=args.history_length,
                max_no_op_actions=args.max_no_op_actions)

    if args.load_path:
        # Agent
        agent = DQN.load(args.load_path)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        agent.policy.set_epsilon(epsilon_test)

        # Algorithm
        core_test = Core(agent, mdp)

        # Evaluate model
        dataset = core_test.evaluate(n_steps=args.test_samples,
                                     render=args.render,
                                     quiet=args.quiet)
        get_stats(dataset)

    else:
        # Policy
        epsilon = LinearParameter(value=args.initial_exploration_rate,
                                  threshold_value=args.final_exploration_rate,
                                  n=args.final_exploration_frame)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        epsilon_random = Parameter(value=1)
        pi = EpsGreedy(epsilon=epsilon_random)

        class CategoricalLoss(nn.Module):
            def forward(self, input, target):
                input = input.clamp(1e-5)

                return -torch.sum(target * torch.log(input))

        # Approximator
        approximator_params = dict(
            network=Network if args.algorithm != 'cdqn' else FeatureNetwork,
            input_shape=mdp.info.observation_space.shape,
            output_shape=(mdp.info.action_space.n, ),
            n_actions=mdp.info.action_space.n,
            n_features=Network.n_features,
            optimizer=optimizer,
            loss=F.smooth_l1_loss
            if args.algorithm != 'cdqn' else CategoricalLoss(),
            use_cuda=args.use_cuda)

        approximator = TorchApproximator

        if args.prioritized:
            replay_memory = PrioritizedReplayMemory(
                initial_replay_size,
                max_replay_size,
                alpha=.6,
                beta=LinearParameter(.4,
                                     threshold_value=1,
                                     n=max_steps // train_frequency))
        else:
            replay_memory = None

        # Agent
        algorithm_params = dict(
            batch_size=args.batch_size,
            target_update_frequency=target_update_frequency // train_frequency,
            replay_memory=replay_memory,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size)

        if args.algorithm == 'dqn':
            agent = DQN(mdp.info,
                        pi,
                        approximator,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'ddqn':
            agent = DoubleDQN(mdp.info,
                              pi,
                              approximator,
                              approximator_params=approximator_params,
                              **algorithm_params)
        elif args.algorithm == 'adqn':
            agent = AveragedDQN(mdp.info,
                                pi,
                                approximator,
                                approximator_params=approximator_params,
                                n_approximators=args.n_approximators,
                                **algorithm_params)
        elif args.algorithm == 'mmdqn':
            agent = MaxminDQN(mdp.info,
                              pi,
                              approximator,
                              approximator_params=approximator_params,
                              n_approximators=args.n_approximators,
                              **algorithm_params)
        elif args.algorithm == 'cdqn':
            agent = CategoricalDQN(mdp.info,
                                   pi,
                                   approximator_params=approximator_params,
                                   n_atoms=args.n_atoms,
                                   v_min=args.v_min,
                                   v_max=args.v_max,
                                   **algorithm_params)

        # Algorithm
        core = Core(agent, mdp)

        # RUN

        # Fill replay memory with random dataset
        print_epoch(0)
        core.learn(n_steps=initial_replay_size,
                   n_steps_per_fit=initial_replay_size,
                   quiet=args.quiet)

        if args.save:
            agent.save(folder_name + '/agent_0.msh')

        # Evaluate initial policy
        pi.set_epsilon(epsilon_test)
        mdp.set_episode_end(False)
        dataset = core.evaluate(n_steps=test_samples,
                                render=args.render,
                                quiet=args.quiet)
        scores.append(get_stats(dataset))

        np.save(folder_name + '/scores.npy', scores)
        for n_epoch in range(1, max_steps // evaluation_frequency + 1):
            print_epoch(n_epoch)
            print('- Learning:')
            # learning step
            pi.set_epsilon(epsilon)
            mdp.set_episode_end(True)
            core.learn(n_steps=evaluation_frequency,
                       n_steps_per_fit=train_frequency,
                       quiet=args.quiet)

            if args.save:
                agent.save(folder_name + '/agent_' + str(n_epoch) + '.msh')

            print('- Evaluation:')
            # evaluation step
            pi.set_epsilon(epsilon_test)
            mdp.set_episode_end(False)
            dataset = core.evaluate(n_steps=test_samples,
                                    render=args.render,
                                    quiet=args.quiet)
            scores.append(get_stats(dataset))

            np.save(folder_name + '/scores.npy', scores)

    return scores
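After a run, the per-epoch statistics saved above can be reloaded for inspection. A hypothetical post-processing step, assuming the scores.npy written by the script (the run folder name embeds algorithm, game and timestamp):

import numpy as np

scores = np.load('logs/<run_folder>/scores.npy', allow_pickle=True)
print(len(scores), 'evaluation epochs recorded')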
Example #22
max_replay_size = 50000
test_samples = 100
max_steps = 200000
test_episodes = 50


# PO-MDP
mdp = ShowdownEnvironment()

# Policy
epsilon = LinearParameter(value=1.,
                          threshold_value=.1,
                          n=1000000)
epsilon_test = Parameter(value=.05)
epsilon_random = Parameter(value=1)
pi = EpsGreedy(epsilon=epsilon_random)

# Approximator
approximator_params = dict(
    network=Network,
    input_shape=(110,),
    output_shape=(13,),
    n_actions=13,
    optimizer=optimizer,
    loss=F.mse_loss
)

approximator = TorchApproximator

# Agent
algorithm_params = dict(
Example #23
# Logging learning process
from mushroom_rl.core import Core, Logger
from mushroom_rl.environments.generators import generate_simple_chain
from mushroom_rl.policy import EpsGreedy
from mushroom_rl.algorithms.value import QLearning
from mushroom_rl.utils.parameters import Parameter
from mushroom_rl.utils.dataset import compute_J
from tqdm import trange
from time import sleep
import numpy as np


# Setup simple learning environment
mdp = generate_simple_chain(state_n=5, goal_states=[2], prob=.8, rew=1, gamma=.9)
epsilon = Parameter(value=.15)
pi = EpsGreedy(epsilon=epsilon)
agent = QLearning(mdp.info, pi, learning_rate=Parameter(value=.2))
core = Core(agent, mdp)
epochs = 10

# Console logger used by the logger.* calls below (name is illustrative)
logger = Logger(QLearning.__name__, results_dir=None)

# Initial policy Evaluation
logger.info('Experiment started')
logger.strong_line()

dataset = core.evaluate(n_steps=100)
J = np.mean(compute_J(dataset, mdp.info.gamma))  # Discounted returns
R = np.mean(compute_J(dataset))  # Undiscounted returns

logger.epoch_info(0, J=J, R=R, any_label='any value')

for i in trange(epochs):
    # The original snippet is truncated here; this body mirrors the
    # evaluation pattern above, and the step counts are assumptions.
    core.learn(n_steps=100, n_steps_per_fit=1)
    dataset = core.evaluate(n_steps=100)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    R = np.mean(compute_J(dataset))
    logger.epoch_info(i + 1, J=J, R=R)
Example #24
def experiment(mdp, params, prob=None):
    # Argument parser
    # parser = argparse.ArgumentParser()
    #
    # args = parser.parse_args()

    scores = list()

    optimizer = dict()
    if params['optimizer'] == 'adam':
        optimizer['class'] = optim.Adam
        optimizer['params'] = dict(lr=params['learning_rate'])
    elif params['optimizer'] == 'adadelta':
        optimizer['class'] = optim.Adadelta
        optimizer['params'] = dict(lr=params['learning_rate'])
    elif params['optimizer'] == 'rmsprop':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=params['learning_rate'],
                                   alpha=params['decay'])
    elif params['optimizer'] == 'rmspropcentered':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=params['learning_rate'],
                                   alpha=params['decay'],
                                   centered=True)
    else:
        raise ValueError

    # DQN learning run

    # Summary folder
    folder_name = os.path.join(PROJECT_DIR, 'logs', params['name'])
    if params['save']:
        pathlib.Path(folder_name).mkdir(parents=True)

    # Policy
    epsilon = ExponentialParameter(value=params['initial_exploration_rate'],
                                   exp=params['exploration_rate'],
                                   min_value=params['final_exploration_rate'],
                                   size=(1, ))

    epsilon_random = Parameter(value=1)
    epsilon_test = Parameter(value=0.01)
    pi = EpsGreedy(epsilon=epsilon_random)

    class CategoricalLoss(nn.Module):
        def forward(self, input, target):
            input = input.clamp(1e-5)

            return -torch.sum(target * torch.log(input))

    # Approximator
    input_shape = mdp.observation.shape

    resources = [[
        (mdp.north - mdp.devices[device][0]) / (mdp.north - mdp.south),
        (mdp.east - mdp.devices[device][1]) / (mdp.east - mdp.west)
    ] for device in mdp.device_ordering]
    edges = [[
        (mdp.north - mdp.graph.nodes[e[0]]['y']) / (mdp.north - mdp.south),
        (mdp.east - mdp.graph.nodes[e[0]]['x']) / (mdp.east - mdp.west),
        (mdp.north - mdp.graph.nodes[e[1]]['y']) / (mdp.north - mdp.south),
        (mdp.east - mdp.graph.nodes[e[1]]['x']) / (mdp.east - mdp.west)
    ] for e in mdp.graph.edges]

    N = {
        'SimpleResourceNetwork': SimpleResourceNetwork,
        'GraphConvolutionResourceNetwork': GraphConvolutionResourceNetwork,
    }[params['network']]
    N.n_features = params['hidden']

    approximator_params = dict(
        network=N,
        input_shape=input_shape,
        edges=edges,
        resources=resources,
        graph=mdp.graph,
        allow_wait=params['allow_wait'],
        long_term_q=params['long_term_q'],
        resource_embeddings=params['resource_embeddings'],
        edge_ordering=mdp.edge_ordering,
        device_ordering=mdp.device_ordering,
        resource_edges=mdp.resource_edges,
        output_shape=(mdp.info.action_space.n, ),
        n_actions=mdp.info.action_space.n,
        n_features=params['hidden'],
        optimizer=optimizer,
        loss=F.smooth_l1_loss,
        nn_scaling=params['nn_scaling'],
        # quiet=False,
        use_cuda=params['cuda'],
        load_path=params.get('load_path', None))

    approximator = TorchApproximator

    replay_memory = PrioritizedReplayMemory(
        params['initial_replay_size'],
        params['max_replay_size'],
        alpha=.6,
        beta=LinearParameter(.4,
                             threshold_value=1,
                             n=params['max_steps'] //
                             params['train_frequency']))

    # Agent
    algorithm_params = dict(
        batch_size=params['batch_size'],
        n_approximators=1,
        target_update_frequency=params['target_update_frequency'] //
        params['train_frequency'],
        replay_memory=replay_memory,
        initial_replay_size=params['initial_replay_size'],
        max_replay_size=params['max_replay_size'])

    clz = DoubleDQN if mdp.info.gamma >= 1 else SMDPDQN
    agent = clz(mdp.info,
                pi,
                approximator,
                approximator_params=approximator_params,
                **algorithm_params)

    lr_scheduler = torch.optim.lr_scheduler.StepLR(
        agent.approximator._impl.model._optimizer,
        step_size=1,
        gamma=params['lr_decay'],
        last_epoch=-1)  # params['max_steps'] // params['train_frequency']

    # Algorithm
    core = Core(agent, mdp)

    if 'weights' in params:
        best_weights = np.load(params['weights'])
        agent.approximator.set_weights(best_weights)
        agent.target_approximator.set_weights(best_weights)
    else:
        best_weights = agent.approximator.get_weights()

    # RUN
    pi.set_epsilon(epsilon_test)
    eval_days = [i for i in range(1, 356) if i % 13 == 1]
    ds = core.evaluate(initial_states=eval_days,
                       quiet=tuning,
                       render=params['save'])
    test_result = np.mean(compute_J(ds))
    test_result_discounted = np.mean(compute_J(ds, params['gamma']))
    print("discounted validation result", test_result_discounted)
    print("validation result", test_result)
    results = [(0, 0, test_result_discounted, test_result)]
    if params['save']:
        mdp.save_rendered(folder_name + "/epoch_init.mp4")

    # Fill replay memory with random dataset
    print_epoch(0)
    start = time()
    core.learn(n_steps=params['initial_replay_size'],
               n_steps_per_fit=params['initial_replay_size'],
               quiet=tuning)

    runtime = time() - start
    steps = 0

    if params['save']:
        with open(folder_name + "/params.json", "w") as f:
            json.dump(params, f, indent=4)
        if isinstance(agent, DQN):
            np.save(folder_name + '/weights-exp-0-0.npy',
                    agent.approximator.get_weights())

    best_score = -np.inf
    no_improvement = 0
    patience = 6

    if params['save']:
        np.save(folder_name + '/scores.npy', scores)
    for n_epoch in range(
            1, int(params['max_steps'] // params['evaluation_frequency'] + 1)):
        print_epoch(n_epoch)
        print('- Learning:')
        # learning step
        pi.set_epsilon(epsilon)
        # mdp.set_episode_end(True)
        start = time()
        core.learn(n_steps=params['evaluation_frequency'],
                   n_steps_per_fit=params['train_frequency'],
                   quiet=tuning)
        runtime += time() - start
        steps += params['evaluation_frequency']
        lr_scheduler.step()

        if params['save']:
            if isinstance(agent, DQN):
                np.save(
                    folder_name + '/weights-exp-0-' + str(n_epoch) + '.npy',
                    agent.approximator.get_weights())

        print('- Evaluation:')
        # evaluation step
        pi.set_epsilon(epsilon_test)
        ds = core.evaluate(initial_states=eval_days,
                           render=params['save'],
                           quiet=tuning)
        test_result_discounted = np.mean(compute_J(ds, params['gamma']))
        test_result = np.mean(compute_J(ds))
        print("discounted validation result", test_result_discounted)
        print("validation result", test_result)

        if params['save']:
            mdp.save_rendered(folder_name + ("/epoch%04d.mp4" % n_epoch))
        results.append((runtime, steps, test_result_discounted, test_result))

        if params['save']:
            np.savetxt(folder_name + '/scores.csv',
                       np.asarray(results),
                       delimiter=',')

        if test_result > best_score:
            no_improvement = 0
            best_score = test_result
            best_weights = agent.approximator.get_weights().copy()

            with open(folder_name + "/best_val.txt", "w") as f:
                f.write("%f" % test_result)
        else:
            no_improvement += 1
            if no_improvement >= patience:
                break

    print('---------- FINAL EVALUATION ---------')
    agent.approximator.set_weights(best_weights)
    agent.target_approximator.set_weights(best_weights)
    pi.set_epsilon(epsilon_test)
    eval_days = [i for i in range(1, 356) if i % 13 == 0]
    ds = core.evaluate(initial_states=eval_days,
                       render=params['save'],
                       quiet=tuning)
    test_result_discounted = np.mean(compute_J(ds, params['gamma']))
    test_result = np.mean(compute_J(ds))
    print("discounted test result", test_result_discounted)
    print("test result", test_result)

    with open(folder_name + "/test_result.txt", "w") as f:
        f.write("%f" % test_result)

    if params['save']:
        mdp.save_rendered(folder_name + "/epoch_test.mp4", 10000)

    return scores
Example #25
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor

from mushroom_rl.algorithms.value import FQI
from mushroom_rl.core import Core
from mushroom_rl.environments import CarOnHill
from mushroom_rl.policy import EpsGreedy
from mushroom_rl.utils.dataset import compute_J
from mushroom_rl.utils.parameters import Parameter

mdp = CarOnHill()

# Policy
epsilon = Parameter(value=1.)
pi = EpsGreedy(epsilon=epsilon)

# Approximator
approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                           n_actions=mdp.info.action_space.n,
                           n_estimators=50,
                           min_samples_split=5,
                           min_samples_leaf=2)
approximator = ExtraTreesRegressor

# Agent
agent = FQI(mdp.info,
            pi,
            approximator,
            n_iterations=20,
            approximator_params=approximator_params)
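The snippet stops after building the agent; training and evaluation would follow the same Core pattern used in the CarOnHill FQI example above. A sketch with illustrative episode counts:

core = Core(agent, mdp)
core.learn(n_episodes=50, n_episodes_per_fit=50)

agent.policy.set_epsilon(Parameter(0.))
dataset = core.evaluate(n_episodes=5)
print(np.mean(compute_J(dataset, mdp.info.gamma)))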
Example #26
from mushroom_rl.policy import EpsGreedy
from mushroom_rl.utils.parameters import Parameter
from mushroom_rl.environments import GridWorld
from mushroom_rl.algorithms.value import QLearning
from mushroom_rl.core.core import Core
import numpy as np
from mushroom_rl.environments import Gym

mdp = Gym(name='FrozenLake-v0', horizon=np.inf, gamma=1.)

epsilon = Parameter(value=1.)
policy = EpsGreedy(epsilon=epsilon)

learning_rate = Parameter(value=.6)
agent = QLearning(mdp.info, policy, learning_rate)

core = Core(agent, mdp)
core.learn(n_steps=10000, n_steps_per_fit=1)

shape = agent.approximator.shape
q = np.zeros(shape)
print(q)
for i in range(shape[0]):
    for j in range(shape[1]):
        state = np.array([i])
        action = np.array([j])
        q[i, j] = agent.approximator.predict(state, action)
print(q)
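From the tabulated Q values the greedy policy can be read off directly; a small sketch, assuming the default 4x4 FrozenLake map (16 states, 4 actions):

greedy_actions = np.argmax(q, axis=1)
print(greedy_actions.reshape(4, 4))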
Example #27
0
def experiment(n_epochs, n_steps, n_steps_test):
    np.random.seed()

    # MDP
    horizon = 1000
    gamma = 0.99
    gamma_eval = 1.
    mdp = Gym('Acrobot-v1', horizon, gamma)

    # Policy
    epsilon = LinearParameter(value=1., threshold_value=.01, n=5000)
    epsilon_test = Parameter(value=0.)
    epsilon_random = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon_random)

    # Settings
    initial_replay_size = 500
    max_replay_size = 5000
    target_update_frequency = 100
    batch_size = 200
    n_features = 80
    train_frequency = 1

    # Approximator
    input_shape = mdp.info.observation_space.shape
    approximator_params = dict(network=Network,
                               optimizer={
                                   'class': optim.Adam,
                                   'params': {
                                       'lr': .001
                                   }
                               },
                               loss=F.smooth_l1_loss,
                               n_features=n_features,
                               input_shape=input_shape,
                               output_shape=mdp.info.action_space.size,
                               n_actions=mdp.info.action_space.n)

    # Agent
    agent = DQN(mdp.info,
                pi,
                TorchApproximator,
                approximator_params=approximator_params,
                batch_size=batch_size,
                n_approximators=1,
                initial_replay_size=initial_replay_size,
                max_replay_size=max_replay_size,
                target_update_frequency=target_update_frequency)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    # RUN
    pi.set_epsilon(epsilon_test)
    dataset = core.evaluate(n_steps=n_steps_test, render=False)
    J = compute_J(dataset, gamma_eval)
    print('J: ', np.mean(J))

    for n in trange(n_epochs):
        tqdm.write('Epoch: ' + str(n))
        pi.set_epsilon(epsilon)
        core.learn(n_steps=n_steps, n_steps_per_fit=train_frequency)
        pi.set_epsilon(epsilon_test)
        dataset = core.evaluate(n_steps=n_steps_test, render=False)
        J = compute_J(dataset, gamma_eval)
        tqdm.write('J: ' + str(np.mean(J)))

    print('Press a button to visualize acrobot')
    input()
    core.evaluate(n_episodes=5, render=True)
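A hedged driver for the function above; the epoch and step counts are placeholders:

if __name__ == '__main__':
    experiment(n_epochs=10, n_steps=1000, n_steps_test=1000)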
Example #28
0
def test_normalizing_preprocessor(tmpdir):
    np.random.seed(88)

    class Network(nn.Module):
        def __init__(self, input_shape, output_shape, **kwargs):
            super().__init__()

            n_input = input_shape[-1]
            n_output = output_shape[0]

            self._h1 = nn.Linear(n_input, n_output)

            nn.init.xavier_uniform_(self._h1.weight,
                                    gain=nn.init.calculate_gain('relu'))

        def forward(self, state, action=None):
            q = F.relu(self._h1(torch.squeeze(state, 1).float()))
            if action is None:
                return q
            else:
                action = action.long()
                q_acted = torch.squeeze(q.gather(1, action))
                return q_acted

    mdp = Gym('CartPole-v0', horizon=500, gamma=.99)

    # Policy
    epsilon_random = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon_random)

    # Approximator
    input_shape = mdp.info.observation_space.shape

    approximator_params = dict(network=Network,
                               optimizer={'class':  optim.Adam,
                                          'params': {'lr': .001}},
                               loss=F.smooth_l1_loss,
                               input_shape=input_shape,
                               output_shape=mdp.info.action_space.size,
                               n_actions=mdp.info.action_space.n,
                               n_features=2, use_cuda=False)

    alg_params = dict(batch_size=5, initial_replay_size=10,
                      max_replay_size=500, target_update_frequency=50)

    agent = DQN(mdp.info, pi, TorchApproximator,
                approximator_params=approximator_params, **alg_params)

    norm_box = MinMaxPreprocessor(mdp_info=mdp.info,
                                  clip_obs=5.0, alpha=0.001)

    core = Core(agent, mdp, preprocessors=[norm_box])

    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    # training correctly
    assert (core._state.min() >= -norm_box._clip_obs
            and core._state.max() <= norm_box._clip_obs)

    # loading and setting data correctly
    state_dict1 = norm_box.get_state()
    norm_box.save(tmpdir / 'norm_box.msh')

    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    norm_box = MinMaxPreprocessor.load(tmpdir / 'norm_box.msh')
    state_dict2 = norm_box.get_state()

    assert ((state_dict1["mean"] == state_dict2["mean"]).all()
            and (state_dict1["var"] == state_dict2["var"]).all()
            and state_dict1["count"] == state_dict2["count"])

    core = Core(agent, mdp, preprocessors=[norm_box])
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)
Example #29
0
def experiment():
    np.random.seed()

    # Argument parser
    parser = argparse.ArgumentParser()

    arg_mem = parser.add_argument_group('Replay Memory')
    arg_mem.add_argument("--initial-replay-size",
                         type=int,
                         default=50000,
                         help='Initial size of the replay memory.')
    arg_mem.add_argument("--max-replay-size",
                         type=int,
                         default=500000,
                         help='Max size of the replay memory.')
    arg_mem.add_argument("--prioritized",
                         action='store_true',
                         help='Whether to use prioritized memory or not.')

    arg_net = parser.add_argument_group('Deep Q-Network')
    arg_net.add_argument(
        "--optimizer",
        choices=['adadelta', 'adam', 'rmsprop', 'rmspropcentered'],
        default='adam',
        help='Name of the optimizer to use.')
    arg_net.add_argument("--learning-rate",
                         type=float,
                         default=.0001,
                         help='Learning rate value of the optimizer.')
    arg_net.add_argument("--decay",
                         type=float,
                         default=.95,
                         help='Discount factor for the history coming from the'
                         ' gradient momentum in rmspropcentered and'
                         ' rmsprop')
    arg_net.add_argument("--epsilon",
                         type=float,
                         default=1e-8,
                         help='Epsilon term used in rmspropcentered and'
                         ' rmsprop')

    arg_alg = parser.add_argument_group('Algorithm')
    arg_alg.add_argument("--algorithm",
                         choices=[
                             'dqn', 'ddqn', 'adqn', 'mmdqn', 'cdqn', 'dueldqn',
                             'ndqn', 'rainbow'
                         ],
                         default='dqn',
                         help='Name of the algorithm. dqn is for standard'
                         ' DQN, ddqn is for Double DQN and adqn is for'
                         ' Averaged DQN.')
    arg_alg.add_argument(
        "--n-approximators",
        type=int,
        default=1,
        help="Number of approximators used in the ensemble for "
        "AveragedDQN or MaxminDQN.")
    arg_alg.add_argument("--batch-size",
                         type=int,
                         default=32,
                         help='Batch size for each fit of the network.')
    arg_alg.add_argument("--history-length",
                         type=int,
                         default=4,
                         help='Number of frames composing a state.')
    arg_alg.add_argument("--target-update-frequency",
                         type=int,
                         default=10000,
                         help='Number of collected samples before each update '
                         'of the target network.')
    arg_alg.add_argument("--evaluation-frequency",
                         type=int,
                         default=250000,
                         help='Number of collected samples before each '
                         'evaluation. An epoch ends after this number of '
                         'steps.')
    arg_alg.add_argument("--train-frequency",
                         type=int,
                         default=4,
                         help='Number of collected samples before each fit of '
                         'the neural network.')
    arg_alg.add_argument("--max-steps",
                         type=int,
                         default=5000000,
                         help='Total number of collected samples.')
    arg_alg.add_argument(
        "--final-exploration-frame",
        type=int,
        default=10000000,
        help='Number of collected samples until the exploration '
        'rate stops decreasing.')
    arg_alg.add_argument("--initial-exploration-rate",
                         type=float,
                         default=1.,
                         help='Initial value of the exploration rate.')
    arg_alg.add_argument("--final-exploration-rate",
                         type=float,
                         default=.1,
                         help='Final value of the exploration rate. When it '
                         'reaches this value, it stays constant.')
    arg_alg.add_argument("--test-exploration-rate",
                         type=float,
                         default=.05,
                         help='Exploration rate used during evaluation.')
    arg_alg.add_argument("--test-episodes",
                         type=int,
                         default=5,
                         help='Number of episodes for each evaluation.')
    arg_alg.add_argument(
        "--alpha-coeff",
        type=float,
        default=.6,
        help='Prioritization exponent for prioritized experience replay.')
    arg_alg.add_argument("--n-atoms",
                         type=int,
                         default=51,
                         help='Number of atoms for Categorical DQN.')
    arg_alg.add_argument("--v-min",
                         type=int,
                         default=-10,
                         help='Minimum action-value for Categorical DQN.')
    arg_alg.add_argument("--v-max",
                         type=int,
                         default=10,
                         help='Maximum action-value for Categorical DQN.')
    arg_alg.add_argument("--n-steps-return",
                         type=int,
                         default=3,
                         help='Number of steps for n-step return for Rainbow.')
    arg_alg.add_argument("--sigma-coeff",
                         type=float,
                         default=.5,
                         help='Sigma0 coefficient for noise initialization in '
                         'NoisyDQN and Rainbow.')

    arg_utils = parser.add_argument_group('Utils')
    arg_utils.add_argument('--use-cuda',
                           action='store_true',
                           help='Flag specifying whether to use the GPU.')
    arg_utils.add_argument('--save',
                           action='store_true',
                           help='Flag specifying whether to save the model.')
    arg_utils.add_argument('--load-path',
                           type=str,
                           help='Path of the model to be loaded.')
    arg_utils.add_argument('--render',
                           action='store_true',
                           help='Flag specifying whether to render the environment.')
    arg_utils.add_argument('--quiet',
                           action='store_true',
                           help='Flag specifying whether to hide the progress '
                           'bar.')
    arg_utils.add_argument('--debug',
                           action='store_true',
                           help='Flag specifying whether the script has to be '
                           'run in debug mode.')

    args = parser.parse_args()

    scores = list()

    optimizer = dict()
    if args.optimizer == 'adam':
        optimizer['class'] = optim.Adam
        optimizer['params'] = dict(lr=args.learning_rate, eps=args.epsilon)
    elif args.optimizer == 'adadelta':
        optimizer['class'] = optim.Adadelta
        optimizer['params'] = dict(lr=args.learning_rate, eps=args.epsilon)
    elif args.optimizer == 'rmsprop':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate,
                                   alpha=args.decay,
                                   eps=args.epsilon)
    elif args.optimizer == 'rmspropcentered':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate,
                                   alpha=args.decay,
                                   eps=args.epsilon,
                                   centered=True)
    else:
        raise ValueError('Unknown optimizer: ' + args.optimizer)

    # Summary folder
    folder_name = './logs/habitat_nav_' + args.algorithm +\
        '_' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    pathlib.Path(folder_name).mkdir(parents=True)

    # Settings
    if args.debug:
        initial_replay_size = 50
        max_replay_size = 500
        train_frequency = 5
        target_update_frequency = 10
        test_episodes = 20
        evaluation_frequency = 50
        max_steps = 1000
    else:
        initial_replay_size = args.initial_replay_size
        max_replay_size = args.max_replay_size
        train_frequency = args.train_frequency
        target_update_frequency = args.target_update_frequency
        test_episodes = args.test_episodes
        evaluation_frequency = args.evaluation_frequency
        max_steps = args.max_steps

    # MDP
    config_file = os.path.join(
        pathlib.Path(__file__).parent.resolve(),
        'pointnav_apartment-0.yaml')  # Custom task for Replica scenes
    wrapper = 'HabitatNavigationWrapper'
    mdp = Habitat(wrapper, config_file)
    opt_return = mdp.env.get_optimal_policy_return()

    if args.load_path:
        logger = Logger(DQN.__name__, results_dir=None)
        logger.strong_line()
        logger.info('Optimal Policy Undiscounted Return: ' + str(opt_return))
        logger.info('Experiment Algorithm: ' + DQN.__name__)

        # Agent
        agent = DQN.load(args.load_path)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        agent.policy.set_epsilon(epsilon_test)

        # Algorithm
        core_test = Core(agent, mdp)

        # Evaluate model
        dataset = core_test.evaluate(n_episodes=args.test_episodes,
                                     render=args.render,
                                     quiet=args.quiet)
        get_stats(dataset, logger)
    else:
        # Policy
        epsilon = LinearParameter(value=args.initial_exploration_rate,
                                  threshold_value=args.final_exploration_rate,
                                  n=args.final_exploration_frame)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        epsilon_random = Parameter(value=1)
        pi = EpsGreedy(epsilon=epsilon_random)
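        # The policy starts fully random (epsilon=1) while the replay memory is
        # filled; the linear schedule and the test epsilon are set later via
        # set_epsilon().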

        # Approximator
        approximator_params = dict(
            network=Network if args.algorithm
            not in ['dueldqn', 'cdqn', 'ndqn', 'rainbow'] else FeatureNetwork,
            input_shape=mdp.info.observation_space.shape,
            output_shape=(mdp.info.action_space.n, ),
            n_actions=mdp.info.action_space.n,
            n_features=Network.n_features,
            optimizer=optimizer,
            use_cuda=args.use_cuda)
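        # The Huber loss is only set for the non-distributional variants;
        # cdqn and rainbow handle their loss internally.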
        if args.algorithm not in ['cdqn', 'rainbow']:
            approximator_params['loss'] = F.smooth_l1_loss

        approximator = TorchApproximator

        if args.prioritized:
            replay_memory = PrioritizedReplayMemory(
                initial_replay_size,
                max_replay_size,
                alpha=args.alpha_coeff,
                beta=LinearParameter(.4,
                                     threshold_value=1,
                                     n=max_steps // train_frequency))
        else:
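            # With no explicit replay memory, the agent falls back to its
            # default uniform replay buffer.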
            replay_memory = None

        # Agent
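        # The CLI value of target_update_frequency counts collected samples;
        # dividing by train_frequency converts it to the number of fits.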
        algorithm_params = dict(
            batch_size=args.batch_size,
            target_update_frequency=target_update_frequency // train_frequency,
            replay_memory=replay_memory,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size)

        if args.algorithm == 'dqn':
            alg = DQN
            agent = alg(mdp.info,
                        pi,
                        approximator,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'ddqn':
            alg = DoubleDQN
            agent = alg(mdp.info,
                        pi,
                        approximator,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'adqn':
            alg = AveragedDQN
            agent = alg(mdp.info,
                        pi,
                        approximator,
                        approximator_params=approximator_params,
                        n_approximators=args.n_approximators,
                        **algorithm_params)
        elif args.algorithm == 'mmdqn':
            alg = MaxminDQN
            agent = alg(mdp.info,
                        pi,
                        approximator,
                        approximator_params=approximator_params,
                        n_approximators=args.n_approximators,
                        **algorithm_params)
        elif args.algorithm == 'dueldqn':
            alg = DuelingDQN
            agent = alg(mdp.info,
                        pi,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'cdqn':
            alg = CategoricalDQN
            agent = alg(mdp.info,
                        pi,
                        approximator_params=approximator_params,
                        n_atoms=args.n_atoms,
                        v_min=args.v_min,
                        v_max=args.v_max,
                        **algorithm_params)
        elif args.algorithm == 'ndqn':
            alg = NoisyDQN
            agent = alg(mdp.info,
                        pi,
                        approximator_params=approximator_params,
                        sigma_coeff=args.sigma_coeff,
                        **algorithm_params)
        elif args.algorithm == 'rainbow':
            alg = Rainbow
            beta = LinearParameter(.4,
                                   threshold_value=1,
                                   n=max_steps // train_frequency)
            agent = alg(mdp.info,
                        pi,
                        approximator_params=approximator_params,
                        n_atoms=args.n_atoms,
                        v_min=args.v_min,
                        v_max=args.v_max,
                        n_steps_return=args.n_steps_return,
                        alpha_coeff=args.alpha_coeff,
                        beta=beta,
                        sigma_coeff=args.sigma_coeff,
                        **algorithm_params)

        logger = Logger(alg.__name__, results_dir=None)
        logger.strong_line()
        logger.info('Optimal Policy Undiscounted Return: ' + str(opt_return))
        logger.info('Experiment Algorithm: ' + alg.__name__)

        # Algorithm
        core = Core(agent, mdp)

        # RUN

        # Fill replay memory with random dataset
        print_epoch(0, logger)
        core.learn(n_steps=initial_replay_size,
                   n_steps_per_fit=initial_replay_size,
                   quiet=args.quiet)

        if args.save:
            agent.save(folder_name + '/agent_0.msh')

        # Evaluate initial policy
        pi.set_epsilon(epsilon_test)
        dataset = core.evaluate(n_episodes=test_episodes,
                                render=args.render,
                                quiet=args.quiet)
        scores.append(get_stats(dataset, logger))

        np.save(folder_name + '/scores.npy', scores)
        for n_epoch in range(1, max_steps // evaluation_frequency + 1):
            print_epoch(n_epoch, logger)
            logger.info('- Learning:')
            # learning step
            pi.set_epsilon(epsilon)
            core.learn(n_steps=evaluation_frequency,
                       n_steps_per_fit=train_frequency,
                       quiet=args.quiet)

            if args.save:
                agent.save(folder_name + '/agent_' + str(n_epoch) + '.msh')

            logger.info('- Evaluation:')
            # evaluation step
            pi.set_epsilon(epsilon_test)
            dataset = core.evaluate(n_episodes=test_episodes,
                                    render=args.render,
                                    quiet=args.quiet)
            scores.append(get_stats(dataset, logger))

            np.save(folder_name + '/scores.npy', scores)

    return scores
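
# A hypothetical command line for this experiment (the script name is
# illustrative; the flags are the ones defined by the parser above):
#   python habitat_nav_dqn.py --algorithm ddqn --prioritized --use-cuda --save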