Example #1
def experiment(n_epochs, n_episodes):
    np.random.seed()

    logger = Logger(COPDAC_Q.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + COPDAC_Q.__name__)

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 10
    alpha_theta = Parameter(5e-3 / n_tilings)
    alpha_omega = Parameter(0.5 / n_tilings)
    alpha_v = Parameter(0.5 / n_tilings)
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    input_shape = (phi.size, )

    mu = Regressor(LinearApproximator,
                   input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-1 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(mdp.info,
                     policy,
                     mu,
                     alpha_theta,
                     alpha_omega,
                     alpha_v,
                     value_function_features=phi,
                     policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
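    # Display is assumed to be a custom visualization callback defined
    # elsewhere in the original script (it is not part of the MushroomRL API).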
    visualization_callback = Display(agent._V, mu,
                                     mdp.info.observation_space.low,
                                     mdp.info.observation_space.high, phi, phi)
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=n_episodes, n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.0)
        dataset_callback.clean()
        visualization_callback()
        logger.epoch_info(i + 1, R_mean=np.sum(J) / n_steps / n_episodes)

    logger.info('Press a button to visualize the pendulum...')
    input()
    sigma = 1e-8 * np.eye(1)
    policy.set_sigma(sigma)
    core.evaluate(n_steps=n_steps, render=True)
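
A minimal driver for this snippet might look as follows; the epoch and episode
counts are illustrative assumptions, not values taken from the original script.

if __name__ == '__main__':
    experiment(n_epochs=25, n_episodes=10)  # illustrative values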
Example #2
def experiment(alg, params, n_epochs, n_episodes, n_ep_per_fit):
    np.random.seed()
    print('============ start experiment ============')
    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = GraspEnv()
    print('============ mdp ============')

    # Policy
    n_weights = 6
    mu = np.array([-0.5, 0.0, 0.91, m.pi, 0, 0])
    sigma = np.asarray([0.05, 0.05, 0.05, 0.1, 0.1, 0.1])
    # alternative scale: np.asarray([0.15, 0.15, 0.15, 0.4, 0.4, 0.4])
    policy = Own_policy()
    dist = GaussianDiagonalDistribution(mu, sigma)
    agent = alg(mdp.info, dist, policy, **params)

    # Train
    dataset_callback = CollectDataset()
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in range(n_epochs):
        print('================ core learn ================')
        core.learn(n_episodes=n_episodes, n_episodes_per_fit=n_ep_per_fit)

        J = compute_J(dataset_callback.get(), gamma=mdp.info.gamma)
        print('J:', J)
        print('============================')
        dataset_callback.clean()

        p = dist.get_parameters()
        print('p:', p)
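        # mu_0 ... mu_5, avg_sigma and list_J are assumed to be lists defined
        # at module level in the original script; they store the learning curves.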
        mu_0.append(p[0])
        mu_1.append(p[1])
        mu_2.append(p[2])
        mu_3.append(p[3])
        mu_4.append(p[4])
        mu_5.append(p[5])

        avg_sigma.append(np.mean(p[n_weights:]))

        # record learning curve of cumulative rewards
        logger.epoch_info(i + 1,
                          J=np.mean(J),
                          mu=p[:n_weights],
                          sigma=p[n_weights:])
        list_J.append(np.mean(J))
Example #3
def experiment(alg, env_id, horizon, gamma, n_epochs, n_steps, n_steps_per_fit,
               n_step_test, alg_params, policy_params):
    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    mdp = Gym(env_id, horizon, gamma)

    critic_params = dict(network=Network,
                         optimizer={
                             'class': optim.RMSprop,
                             'params': {
                                 'lr': 7e-4,
                                 'eps': 1e-5
                             }
                         },
                         loss=F.mse_loss,
                         n_features=64,
                         batch_size=64,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1, ))

    alg_params['critic_params'] = critic_params

    policy = GaussianTorchPolicy(Network, mdp.info.observation_space.shape,
                                 mdp.info.action_space.shape, **policy_params)

    agent = alg(mdp.info, policy, **alg_params)

    core = Core(agent, mdp)

    dataset = core.evaluate(n_steps=n_step_test, render=False)

    J = np.mean(compute_J(dataset, mdp.info.gamma))
    R = np.mean(compute_J(dataset))
    E = agent.policy.entropy()

    logger.epoch_info(0, J=J, R=R, entropy=E)

    for it in trange(n_epochs):
        core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit)
        dataset = core.evaluate(n_steps=n_step_test, render=False)

        J = np.mean(compute_J(dataset, mdp.info.gamma))
        R = np.mean(compute_J(dataset))
        E = agent.policy.entropy()

        logger.epoch_info(it + 1, J=J, R=R, entropy=E)

    logger.info('Press a button to visualize')
    input()
    core.evaluate(n_episodes=5, render=True)
Example #4
def experiment(alg, params, n_epochs, fit_per_epoch, ep_per_fit):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = ShipSteering()

    # Policy
    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 6]
    low = np.array(low, dtype=float)
    high = np.array(high, dtype=float)
    n_tilings = 1

    tilings = Tiles.generate(n_tilings=n_tilings,
                             n_tiles=n_tiles,
                             low=low,
                             high=high)

    phi = Features(tilings=tilings)
    input_shape = (phi.size, )

    approximator = Regressor(LinearApproximator,
                             input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 4e-1 * np.ones(policy.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    # Agent
    agent = alg(mdp.info, distribution, policy, features=phi, **params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_fit)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    logger.epoch_info(0, J=np.mean(J))

    for i in range(n_epochs):
        core.learn(n_episodes=fit_per_epoch * ep_per_fit,
                   n_episodes_per_fit=ep_per_fit)
        dataset_eval = core.evaluate(n_episodes=ep_per_fit)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        logger.epoch_info(i + 1, J=np.mean(J))
Example #5
def experiment(alg, params, n_epochs, n_episodes, n_ep_per_fit):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = Segway()

    # Policy
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    n_weights = approximator.weights_size
    mu = np.zeros(n_weights)
    sigma = 2e-0 * np.ones(n_weights)
    policy = DeterministicPolicy(approximator)
    dist = GaussianDiagonalDistribution(mu, sigma)

    agent = alg(mdp.info, dist, policy, **params)

    # Train
    dataset_callback = CollectDataset()
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=n_episodes,
                   n_episodes_per_fit=n_ep_per_fit,
                   render=False)
        J = compute_J(dataset_callback.get(), gamma=mdp.info.gamma)
        dataset_callback.clean()

        p = dist.get_parameters()

        logger.epoch_info(i + 1,
                          J=np.mean(J),
                          mu=p[:n_weights],
                          sigma=p[n_weights:])

    logger.info('Press a button to visualize the segway...')
    input()
    core.evaluate(n_episodes=3, render=True)
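
A possible entry point for this snippet; the algorithm class, its hyperparameter
and the counts below are illustrative assumptions rather than the original values.

if __name__ == '__main__':
    # Assumed import path; any MushroomRL black-box optimization algorithm
    # with a compatible constructor could be used here.
    from mushroom_rl.algorithms.policy_search import REPS

    experiment(alg=REPS, params=dict(eps=0.05),
               n_epochs=25, n_episodes=100, n_ep_per_fit=25)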
Example #6
def experiment(alg, n_epochs, n_iterations, ep_per_run):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = LQR.generate(dimensions=2, max_action=1., max_pos=1.)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape)

    sigma_weights = 0.25 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    optimizer = AdaptiveOptimizer(eps=1e-2)
    algorithm_params = dict(optimizer=optimizer)
    agent = alg(mdp.info, policy, **algorithm_params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    logger.epoch_info(0,
                      J=np.mean(J),
                      policy_weights=policy.get_weights().tolist())

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        logger.epoch_info(i + 1,
                          J=np.mean(J),
                          policy_weights=policy.get_weights().tolist())
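
The function expects a policy-gradient algorithm class; one way it might be
invoked (the algorithm choice and counts are illustrative assumptions):

if __name__ == '__main__':
    # Assumed import path for MushroomRL's policy-gradient algorithms.
    from mushroom_rl.algorithms.policy_search import REINFORCE

    experiment(alg=REINFORCE, n_epochs=10, n_iterations=10, ep_per_run=100)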
Example #7
def experiment(goal, use_muscles, n_epochs, n_steps, n_episodes_test):
    np.random.seed(1)

    logger = Logger('SAC', results_dir=None)
    logger.strong_line()
    logger.info('Humanoid Experiment, Algorithm: SAC')

    # MDP
    gamma = 0.99
    horizon = 2000
    mdp = create_mdp(gamma, horizon, goal, use_muscles=use_muscles)

    # Agent
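    # create_mdp (above) and create_SAC_agent are assumed to be helper functions
    # defined elsewhere in the original script; they are not MushroomRL API calls.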
    agent = create_SAC_agent(mdp)

    # normalization callback
    normalizer = MinMaxPreprocessor(mdp_info=mdp.info)

    # plotting callback
    plotter = PlotDataset(mdp.info)

    # Algorithm(with normalization and plotting)
    core = Core(agent, mdp, callback_step=plotter, preprocessors=[normalizer])
    dataset = core.evaluate(n_episodes=n_episodes_test, render=True)

    J = np.mean(compute_J(dataset, gamma))
    L = int(np.round(np.mean(episodes_length(dataset))))

    logger.epoch_info(0, J=J, episode_length=L)

    # training loop
    for n in trange(n_epochs, leave=False):
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset = core.evaluate(n_episodes=n_episodes_test, render=True)

        J = np.mean(compute_J(dataset, gamma))
        L = int(np.round(np.mean(episodes_length(dataset))))

        logger.epoch_info(n + 1, J=J, episode_length=L)

    logger.info('Press a button to visualize humanoid')
    input()
    core.evaluate(n_episodes=10, render=True)
Example #8
def experiment(alg, params, n_epochs, fit_per_epoch, ep_per_fit):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = LQR.generate(dimensions=1)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(mu=approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 1e-3 * np.eye(policy.weights_size)
    distribution = GaussianCholeskyDistribution(mu, sigma)

    # Agent
    agent = alg(mdp.info, distribution, policy, **params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_fit)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    logger.epoch_info(0,
                      J=np.mean(J),
                      distribution_parameters=distribution.get_parameters())

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=fit_per_epoch * ep_per_fit,
                   n_episodes_per_fit=ep_per_fit)
        dataset_eval = core.evaluate(n_episodes=ep_per_fit)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        logger.epoch_info(
            i + 1,
            J=np.mean(J),
            distribution_parameters=distribution.get_parameters())
Example #9
def experiment(n_epochs, n_iterations, ep_per_run, save_states_to_disk):
    np.random.seed()

    logger = Logger('plot_and_norm_example', results_dir=None)
    logger.strong_line()
    logger.info('Plotting and normalization example')

    # MDP
    mdp = LQR.generate(dimensions=2, max_pos=10., max_action=5., episodic=True)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    optimizer = AdaptiveOptimizer(eps=.01)
    algorithm_params = dict(optimizer=optimizer)
    agent = REINFORCE(mdp.info, policy, **algorithm_params)

    # normalization callback
    prepro = MinMaxPreprocessor(mdp_info=mdp.info)

    # plotting callback
    plotter = PlotDataset(mdp.info, obs_normalized=True)

    # Train
    core = Core(agent, mdp, callback_step=plotter, preprocessors=[prepro])

    # training loop
    for n in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset = core.evaluate(n_episodes=ep_per_run, render=False)
        J = np.mean(compute_J(dataset, mdp.info.gamma))
        logger.epoch_info(n + 1, J=J)

    if save_states_to_disk:
        # save normalization / plot states to disk path
        logger.info('Saving plotting and normalization data')
        os.makedirs("./logs/plot_and_norm", exist_ok=True)
        prepro.save("./logs/plot_and_norm/preprocessor.msh")
        plotter.save_state("./logs/plot_and_norm/plotting_state")

        # load states from disk path
        logger.info('Loading preprocessor and plotter')
        prepro = MinMaxPreprocessor.load(
            "./logs/plot_and_norm/preprocessor.msh")
        plotter.load_state("./logs/plot_and_norm/plotting_state")
Example #10
def experiment():
    np.random.seed()

    logger = Logger(QLearning.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + QLearning.__name__)

    # MDP
    mdp = generate_simple_chain(state_n=5,
                                goal_states=[2],
                                prob=.8,
                                rew=1,
                                gamma=.9)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = QLearning(mdp.info, pi, **algorithm_params)

    # Core
    core = Core(agent, mdp)

    # Initial policy Evaluation
    dataset = core.evaluate(n_steps=1000)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    logger.info(f'J start: {J}')

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1)

    # Final Policy Evaluation
    dataset = core.evaluate(n_steps=1000)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    logger.info(f'J final: {J}')
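
Since this experiment takes no arguments, a driver is straightforward:

if __name__ == '__main__':
    experiment()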
Example #11
    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    initial_states = np.zeros((289, 2))
    cont = 0
    for i in range(-8, 9):
        for j in range(-8, 9):
            initial_states[cont, :] = [0.125 * i, 0.375 * j]
            cont += 1

    dataset = core.evaluate(initial_states=initial_states)

    # Render
    core.evaluate(n_episodes=3, render=True)

    return np.mean(compute_J(dataset, mdp.info.gamma))


if __name__ == '__main__':
    n_experiment = 1

    logger = Logger(FQI.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + FQI.__name__)

    Js = Parallel(n_jobs=-1)(delayed(experiment)()
                             for _ in range(n_experiment))
    logger.info(np.mean(Js))
Example #12
    args = vars(parser.parse_args())
    return args.values()


if __name__ == '__main__':
    env_ids, exec_type, test, demo = get_args()
    cfg_dir = Path(__file__).parent / 'cfg'
    env_cfg_dir = cfg_dir / 'env'

    param_path = 'suite.yaml'
    plots_path = 'plots.yaml'

    logger = Logger(results_dir=None)

    logger.info('Starting benchmarking script')

    if 'all' in env_ids:
        logger.info('Running benchmark on all available environments')
        assert len(env_ids) == 1
        env_ids = list()
        for env_id in env_cfg_dir.iterdir():
            if env_id.suffix == '.yaml':
                env_ids.append(env_id.stem)

    logger.info('Execution type: ' + exec_type)
    logger.info('Running FULL: ' + str(not demo))
    logger.strong_line()

    with open(cfg_dir / param_path, 'r') as param_file:
        suite_params = yaml.safe_load(param_file)['suite_params']
Example #13
def experiment(alg, n_epochs, n_steps, n_steps_test):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    use_cuda = torch.cuda.is_available()

    # MDP
    horizon = 200
    gamma = 0.99
    mdp = Gym('Pendulum-v1', horizon, gamma)

    # Policy
    policy_class = OrnsteinUhlenbeckPolicy
    policy_params = dict(sigma=np.ones(1) * .2, theta=.15, dt=1e-2)

    # Settings
    initial_replay_size = 500
    max_replay_size = 5000
    batch_size = 200
    n_features = 80
    tau = .001

    # Approximator
    actor_input_shape = mdp.info.observation_space.shape
    actor_params = dict(network=ActorNetwork,
                        n_features=n_features,
                        input_shape=actor_input_shape,
                        output_shape=mdp.info.action_space.shape,
                        use_cuda=use_cuda)

    actor_optimizer = {'class': optim.Adam,
                       'params': {'lr': .001}}

    critic_input_shape = (actor_input_shape[0] + mdp.info.action_space.shape[0],)
    critic_params = dict(network=CriticNetwork,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': .001}},
                         loss=F.mse_loss,
                         n_features=n_features,
                         input_shape=critic_input_shape,
                         output_shape=(1,),
                         use_cuda=use_cuda)

    # Agent
    agent = alg(mdp.info, policy_class, policy_params,
                actor_params, actor_optimizer, critic_params, batch_size,
                initial_replay_size, max_replay_size, tau)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=initial_replay_size, n_steps_per_fit=initial_replay_size)

    # RUN
    dataset = core.evaluate(n_steps=n_steps_test, render=False)
    J = np.mean(compute_J(dataset, gamma))
    R = np.mean(compute_J(dataset))

    logger.epoch_info(0, J=J, R=R)

    for n in trange(n_epochs, leave=False):
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset = core.evaluate(n_steps=n_steps_test, render=False)
        J = np.mean(compute_J(dataset, gamma))
        R = np.mean(compute_J(dataset))

        logger.epoch_info(n+1, J=J, R=R)

    logger.info('Press a button to visualize pendulum')
    input()
    core.evaluate(n_episodes=5, render=True)
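
A sketch of a driver for this snippet; the algorithm class and the counts are
illustrative assumptions (the original script may use different values).

if __name__ == '__main__':
    # Assumed import path; the agent constructor must accept the positional
    # arguments used above (e.g. DDPG-style algorithms).
    from mushroom_rl.algorithms.actor_critic import DDPG

    experiment(alg=DDPG, n_epochs=40, n_steps=1000, n_steps_test=2000)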
Example #14
def experiment():
    np.random.seed()

    logger = Logger(DQN.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + DQN.__name__)

    # Argument parser
    parser = argparse.ArgumentParser()

    arg_game = parser.add_argument_group('Game')
    arg_game.add_argument("--name",
                          type=str,
                          default='BreakoutDeterministic-v4',
                          help='Gym ID of the Atari game.')
    arg_game.add_argument("--screen-width", type=int, default=84,
                          help='Width of the game screen.')
    arg_game.add_argument("--screen-height", type=int, default=84,
                          help='Height of the game screen.')

    arg_mem = parser.add_argument_group('Replay Memory')
    arg_mem.add_argument("--initial-replay-size", type=int, default=50000,
                         help='Initial size of the replay memory.')
    arg_mem.add_argument("--max-replay-size", type=int, default=500000,
                         help='Max size of the replay memory.')
    arg_mem.add_argument("--prioritized", action='store_true',
                         help='Whether to use prioritized memory or not.')

    arg_net = parser.add_argument_group('Deep Q-Network')
    arg_net.add_argument("--optimizer",
                         choices=['adadelta',
                                  'adam',
                                  'rmsprop',
                                  'rmspropcentered'],
                         default='rmsprop',
                         help='Name of the optimizer to use.')
    arg_net.add_argument("--learning-rate", type=float, default=.00025,
                         help='Learning rate value of the optimizer.')
    arg_net.add_argument("--decay", type=float, default=.95,
                         help='Discount factor for the history coming from the'
                              'gradient momentum in rmspropcentered and'
                              'rmsprop')
    arg_net.add_argument("--epsilon", type=float, default=1e-8,
                         help='Epsilon term used in rmspropcentered and'
                              'rmsprop')

    arg_alg = parser.add_argument_group('Algorithm')
    arg_alg.add_argument("--algorithm", choices=['dqn', 'ddqn', 'adqn', 'mmdqn',
                                                 'cdqn', 'dueldqn'],
                         default='dqn',
                         help='Name of the algorithm. dqn is for standard '
                              'DQN, ddqn is for Double DQN and adqn is for '
                              'Averaged DQN.')
    arg_alg.add_argument("--n-approximators", type=int, default=1,
                         help="Number of approximators used in the ensemble for "
                              "AveragedDQN or MaxminDQN.")
    arg_alg.add_argument("--batch-size", type=int, default=32,
                         help='Batch size for each fit of the network.')
    arg_alg.add_argument("--history-length", type=int, default=4,
                         help='Number of frames composing a state.')
    arg_alg.add_argument("--target-update-frequency", type=int, default=10000,
                         help='Number of collected samples before each update '
                              'of the target network.')
    arg_alg.add_argument("--evaluation-frequency", type=int, default=250000,
                         help='Number of collected samples before each '
                              'evaluation. An epoch ends after this number of '
                              'steps.')
    arg_alg.add_argument("--train-frequency", type=int, default=4,
                         help='Number of collected samples before each fit of '
                              'the neural network.')
    arg_alg.add_argument("--max-steps", type=int, default=50000000,
                         help='Total number of collected samples.')
    arg_alg.add_argument("--final-exploration-frame", type=int, default=1000000,
                         help='Number of collected samples until the exploration '
                              'rate stops decreasing.')
    arg_alg.add_argument("--initial-exploration-rate", type=float, default=1.,
                         help='Initial value of the exploration rate.')
    arg_alg.add_argument("--final-exploration-rate", type=float, default=.1,
                         help='Final value of the exploration rate. When it '
                              'reaches this value, it stays constant.')
    arg_alg.add_argument("--test-exploration-rate", type=float, default=.05,
                         help='Exploration rate used during evaluation.')
    arg_alg.add_argument("--test-samples", type=int, default=125000,
                         help='Number of collected samples for each '
                              'evaluation.')
    arg_alg.add_argument("--max-no-op-actions", type=int, default=30,
                         help='Maximum number of no-op actions performed at the '
                              'beginning of the episodes.')
    arg_alg.add_argument("--n-atoms", type=int, default=51,
                         help='Number of atoms for Categorical DQN.')
    arg_alg.add_argument("--v-min", type=int, default=-10,
                         help='Minimum action-value for Categorical DQN.')
    arg_alg.add_argument("--v-max", type=int, default=10,
                         help='Maximum action-value for Categorical DQN.')

    arg_utils = parser.add_argument_group('Utils')
    arg_utils.add_argument('--use-cuda', action='store_true',
                           help='Flag specifying whether to use the GPU.')
    arg_utils.add_argument('--save', action='store_true',
                           help='Flag specifying whether to save the model.')
    arg_utils.add_argument('--load-path', type=str,
                           help='Path of the model to be loaded.')
    arg_utils.add_argument('--render', action='store_true',
                           help='Flag specifying whether to render the game.')
    arg_utils.add_argument('--quiet', action='store_true',
                           help='Flag specifying whether to hide the progress '
                                'bar.')
    arg_utils.add_argument('--debug', action='store_true',
                           help='Flag specifying whether the script has to be '
                                'run in debug mode.')

    args = parser.parse_args()

    scores = list()

    optimizer = dict()
    if args.optimizer == 'adam':
        optimizer['class'] = optim.Adam
        optimizer['params'] = dict(lr=args.learning_rate,
                                   eps=args.epsilon)
    elif args.optimizer == 'adadelta':
        optimizer['class'] = optim.Adadelta
        optimizer['params'] = dict(lr=args.learning_rate,
                                   eps=args.epsilon)
    elif args.optimizer == 'rmsprop':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate,
                                   alpha=args.decay,
                                   eps=args.epsilon)
    elif args.optimizer == 'rmspropcentered':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate,
                                   alpha=args.decay,
                                   eps=args.epsilon,
                                   centered=True)
    else:
        raise ValueError

    # Summary folder
    folder_name = './logs/atari_' + args.algorithm + '_' + args.name +\
        '_' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    pathlib.Path(folder_name).mkdir(parents=True)

    # Settings
    if args.debug:
        initial_replay_size = 50
        max_replay_size = 500
        train_frequency = 5
        target_update_frequency = 10
        test_samples = 20
        evaluation_frequency = 50
        max_steps = 1000
    else:
        initial_replay_size = args.initial_replay_size
        max_replay_size = args.max_replay_size
        train_frequency = args.train_frequency
        target_update_frequency = args.target_update_frequency
        test_samples = args.test_samples
        evaluation_frequency = args.evaluation_frequency
        max_steps = args.max_steps

    # MDP
    mdp = Atari(args.name, args.screen_width, args.screen_height,
                ends_at_life=True, history_length=args.history_length,
                max_no_op_actions=args.max_no_op_actions)

    if args.load_path:
        # Agent
        agent = DQN.load(args.load_path)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        agent.policy.set_epsilon(epsilon_test)

        # Algorithm
        core_test = Core(agent, mdp)

        # Evaluate model
        dataset = core_test.evaluate(n_steps=args.test_samples,
                                     render=args.render,
                                     quiet=args.quiet)
        get_stats(dataset)

    else:
        # Policy
        epsilon = LinearParameter(value=args.initial_exploration_rate,
                                  threshold_value=args.final_exploration_rate,
                                  n=args.final_exploration_frame)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        epsilon_random = Parameter(value=1)
        pi = EpsGreedy(epsilon=epsilon_random)

        class CategoricalLoss(nn.Module):
            def forward(self, input, target):
                input = input.clamp(1e-5)

                return -torch.sum(target * torch.log(input))

        # Approximator
        approximator_params = dict(
            network=Network if args.algorithm not in ['dueldqn', 'cdqn'] else FeatureNetwork,
            input_shape=mdp.info.observation_space.shape,
            output_shape=(mdp.info.action_space.n,),
            n_actions=mdp.info.action_space.n,
            n_features=Network.n_features,
            optimizer=optimizer,
            loss=F.smooth_l1_loss if args.algorithm != 'cdqn' else CategoricalLoss(),
            use_cuda=args.use_cuda
        )

        approximator = TorchApproximator

        if args.prioritized:
            replay_memory = PrioritizedReplayMemory(
                initial_replay_size, max_replay_size, alpha=.6,
                beta=LinearParameter(.4, threshold_value=1,
                                     n=max_steps // train_frequency)
            )
        else:
            replay_memory = None

        # Agent
        algorithm_params = dict(
            batch_size=args.batch_size,
            target_update_frequency=target_update_frequency // train_frequency,
            replay_memory=replay_memory,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size
        )

        if args.algorithm == 'dqn':
            agent = DQN(mdp.info, pi, approximator,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'ddqn':
            agent = DoubleDQN(mdp.info, pi, approximator,
                              approximator_params=approximator_params,
                              **algorithm_params)
        elif args.algorithm == 'adqn':
            agent = AveragedDQN(mdp.info, pi, approximator,
                                approximator_params=approximator_params,
                                n_approximators=args.n_approximators,
                                **algorithm_params)
        elif args.algorithm == 'mmdqn':
            agent = MaxminDQN(mdp.info, pi, approximator,
                              approximator_params=approximator_params,
                              n_approximators=args.n_approximators,
                              **algorithm_params)
        elif args.algorithm == 'dueldqn':
            agent = DuelingDQN(mdp.info, pi,
                               approximator_params=approximator_params,
                               **algorithm_params)
        elif args.algorithm == 'cdqn':
            agent = CategoricalDQN(mdp.info, pi,
                                   approximator_params=approximator_params,
                                   n_atoms=args.n_atoms, v_min=args.v_min,
                                   v_max=args.v_max, **algorithm_params)

        # Algorithm
        core = Core(agent, mdp)

        # RUN

        # Fill replay memory with random dataset
        print_epoch(0, logger)
        core.learn(n_steps=initial_replay_size,
                   n_steps_per_fit=initial_replay_size, quiet=args.quiet)

        if args.save:
            agent.save(folder_name + '/agent_0.msh')

        # Evaluate initial policy
        pi.set_epsilon(epsilon_test)
        mdp.set_episode_end(False)
        dataset = core.evaluate(n_steps=test_samples, render=args.render,
                                quiet=args.quiet)
        scores.append(get_stats(dataset, logger))

        np.save(folder_name + '/scores.npy', scores)
        for n_epoch in range(1, max_steps // evaluation_frequency + 1):
            print_epoch(n_epoch, logger)
            logger.info('- Learning:')
            # learning step
            pi.set_epsilon(epsilon)
            mdp.set_episode_end(True)
            core.learn(n_steps=evaluation_frequency,
                       n_steps_per_fit=train_frequency, quiet=args.quiet)

            if args.save:
                agent.save(folder_name + '/agent_' + str(n_epoch) + '.msh')

            logger.info('- Evaluation:')
            # evaluation step
            pi.set_epsilon(epsilon_test)
            mdp.set_episode_end(False)
            dataset = core.evaluate(n_steps=test_samples, render=args.render,
                                    quiet=args.quiet)
            scores.append(get_stats(dataset, logger))

            np.save(folder_name + '/scores.npy', scores)

    return scores
Example #15
    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)
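    # collect_dataset and collect_max_Q are assumed to be CollectDataset and
    # CollectMaxQ callbacks created in the part of the script omitted here.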

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get()

    return reward, max_Qs


if __name__ == '__main__':
    n_experiment = 10000

    logger = Logger(QLearning.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + QLearning.__name__)

    names = {
        1: '1',
        .8: '08',
        QLearning: 'Q',
        DoubleQLearning: 'DQ',
        WeightedQLearning: 'WQ',
        SpeedyQLearning: 'SPQ',
        SARSA: 'SARSA'
    }

    for e in [1, .8]:
        logger.info(f'Exp: {e}')
        fig = plt.figure()
        plt.suptitle(names[e])
Example #16
def experiment():
    np.random.seed()

    # Argument parser
    parser = argparse.ArgumentParser()

    arg_mem = parser.add_argument_group('Replay Memory')
    arg_mem.add_argument("--initial-replay-size",
                         type=int,
                         default=50000,
                         help='Initial size of the replay memory.')
    arg_mem.add_argument("--max-replay-size",
                         type=int,
                         default=500000,
                         help='Max size of the replay memory.')
    arg_mem.add_argument("--prioritized",
                         action='store_true',
                         help='Whether to use prioritized memory or not.')

    arg_net = parser.add_argument_group('Deep Q-Network')
    arg_net.add_argument(
        "--optimizer",
        choices=['adadelta', 'adam', 'rmsprop', 'rmspropcentered'],
        default='adam',
        help='Name of the optimizer to use.')
    arg_net.add_argument("--learning-rate",
                         type=float,
                         default=.0001,
                         help='Learning rate value of the optimizer.')
    arg_net.add_argument("--decay",
                         type=float,
                         default=.95,
                         help='Discount factor for the history coming from the '
                         'gradient momentum in rmspropcentered and '
                         'rmsprop.')
    arg_net.add_argument("--epsilon",
                         type=float,
                         default=1e-8,
                         help='Epsilon term used in rmspropcentered and '
                         'rmsprop.')

    arg_alg = parser.add_argument_group('Algorithm')
    arg_alg.add_argument("--algorithm",
                         choices=[
                             'dqn', 'ddqn', 'adqn', 'mmdqn', 'cdqn', 'dueldqn',
                             'ndqn', 'rainbow'
                         ],
                         default='dqn',
                         help='Name of the algorithm. dqn is for standard '
                         'DQN, ddqn is for Double DQN and adqn is for '
                         'Averaged DQN.')
    arg_alg.add_argument(
        "--n-approximators",
        type=int,
        default=1,
        help="Number of approximators used in the ensemble for"
        "AveragedDQN or MaxminDQN.")
    arg_alg.add_argument("--batch-size",
                         type=int,
                         default=32,
                         help='Batch size for each fit of the network.')
    arg_alg.add_argument("--history-length",
                         type=int,
                         default=4,
                         help='Number of frames composing a state.')
    arg_alg.add_argument("--target-update-frequency",
                         type=int,
                         default=10000,
                         help='Number of collected samples before each update '
                         'of the target network.')
    arg_alg.add_argument("--evaluation-frequency",
                         type=int,
                         default=250000,
                         help='Number of collected samples before each '
                         'evaluation. An epoch ends after this number of '
                         'steps.')
    arg_alg.add_argument("--train-frequency",
                         type=int,
                         default=4,
                         help='Number of collected samples before each fit of '
                         'the neural network.')
    arg_alg.add_argument("--max-steps",
                         type=int,
                         default=5000000,
                         help='Total number of collected samples.')
    arg_alg.add_argument(
        "--final-exploration-frame",
        type=int,
        default=10000000,
        help='Number of collected samples until the exploration '
        'rate stops decreasing.')
    arg_alg.add_argument("--initial-exploration-rate",
                         type=float,
                         default=1.,
                         help='Initial value of the exploration rate.')
    arg_alg.add_argument("--final-exploration-rate",
                         type=float,
                         default=.1,
                         help='Final value of the exploration rate. When it '
                         'reaches this value, it stays constant.')
    arg_alg.add_argument("--test-exploration-rate",
                         type=float,
                         default=.05,
                         help='Exploration rate used during evaluation.')
    arg_alg.add_argument("--test-episodes",
                         type=int,
                         default=5,
                         help='Number of episodes for each evaluation.')
    arg_alg.add_argument(
        "--alpha-coeff",
        type=float,
        default=.6,
        help='Prioritization exponent for prioritized experience replay.')
    arg_alg.add_argument("--n-atoms",
                         type=int,
                         default=51,
                         help='Number of atoms for Categorical DQN.')
    arg_alg.add_argument("--v-min",
                         type=int,
                         default=-10,
                         help='Minimum action-value for Categorical DQN.')
    arg_alg.add_argument("--v-max",
                         type=int,
                         default=10,
                         help='Maximum action-value for Categorical DQN.')
    arg_alg.add_argument("--n-steps-return",
                         type=int,
                         default=3,
                         help='Number of steps for n-step return for Rainbow.')
    arg_alg.add_argument("--sigma-coeff",
                         type=float,
                         default=.5,
                         help='Sigma0 coefficient for noise initialization in '
                         'NoisyDQN and Rainbow.')

    arg_utils = parser.add_argument_group('Utils')
    arg_utils.add_argument('--use-cuda',
                           action='store_true',
                           help='Flag specifying whether to use the GPU.')
    arg_utils.add_argument('--save',
                           action='store_true',
                           help='Flag specifying whether to save the model.')
    arg_utils.add_argument('--load-path',
                           type=str,
                           help='Path of the model to be loaded.')
    arg_utils.add_argument('--render',
                           action='store_true',
                           help='Flag specifying whether to render the grid.')
    arg_utils.add_argument('--quiet',
                           action='store_true',
                           help='Flag specifying whether to hide the progress '
                           'bar.')
    arg_utils.add_argument('--debug',
                           action='store_true',
                           help='Flag specifying whether the script has to be '
                           'run in debug mode.')

    args = parser.parse_args()

    scores = list()

    optimizer = dict()
    if args.optimizer == 'adam':
        optimizer['class'] = optim.Adam
        optimizer['params'] = dict(lr=args.learning_rate, eps=args.epsilon)
    elif args.optimizer == 'adadelta':
        optimizer['class'] = optim.Adadelta
        optimizer['params'] = dict(lr=args.learning_rate, eps=args.epsilon)
    elif args.optimizer == 'rmsprop':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate,
                                   alpha=args.decay,
                                   eps=args.epsilon)
    elif args.optimizer == 'rmspropcentered':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate,
                                   alpha=args.decay,
                                   eps=args.epsilon,
                                   centered=True)
    else:
        raise ValueError

    # Summary folder
    folder_name = './logs/habitat_nav_' + args.algorithm +\
        '_' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    pathlib.Path(folder_name).mkdir(parents=True)

    # Settings
    if args.debug:
        initial_replay_size = 50
        max_replay_size = 500
        train_frequency = 5
        target_update_frequency = 10
        test_episodes = 20
        evaluation_frequency = 50
        max_steps = 1000
    else:
        initial_replay_size = args.initial_replay_size
        max_replay_size = args.max_replay_size
        train_frequency = args.train_frequency
        target_update_frequency = args.target_update_frequency
        test_episodes = args.test_episodes
        evaluation_frequency = args.evaluation_frequency
        max_steps = args.max_steps

    # MDP
    config_file = os.path.join(
        pathlib.Path(__file__).parent.resolve(),
        'pointnav_apartment-0.yaml')  # Custom task for Replica scenes
    wrapper = 'HabitatNavigationWrapper'
    mdp = Habitat(wrapper, config_file)
    opt_return = mdp.env.get_optimal_policy_return()

    if args.load_path:
        logger = Logger(DQN.__name__, results_dir=None)
        logger.strong_line()
        logger.info('Optimal Policy Undiscounted Return: ' + str(opt_return))
        logger.info('Experiment Algorithm: ' + DQN.__name__)

        # Agent
        agent = DQN.load(args.load_path)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        agent.policy.set_epsilon(epsilon_test)

        # Algorithm
        core_test = Core(agent, mdp)

        # Evaluate model
        dataset = core_test.evaluate(n_episodes=args.test_episodes,
                                     render=args.render,
                                     quiet=args.quiet)
        get_stats(dataset, logger)
    else:
        # Policy
        epsilon = LinearParameter(value=args.initial_exploration_rate,
                                  threshold_value=args.final_exploration_rate,
                                  n=args.final_exploration_frame)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        epsilon_random = Parameter(value=1)
        pi = EpsGreedy(epsilon=epsilon_random)

        # Approximator
        approximator_params = dict(
            network=Network if args.algorithm
            not in ['dueldqn', 'cdqn', 'ndqn', 'rainbow'] else FeatureNetwork,
            input_shape=mdp.info.observation_space.shape,
            output_shape=(mdp.info.action_space.n, ),
            n_actions=mdp.info.action_space.n,
            n_features=Network.n_features,
            optimizer=optimizer,
            use_cuda=args.use_cuda)
        if args.algorithm not in ['cdqn', 'rainbow']:
            approximator_params['loss'] = F.smooth_l1_loss

        approximator = TorchApproximator

        if args.prioritized:
            replay_memory = PrioritizedReplayMemory(
                initial_replay_size,
                max_replay_size,
                alpha=args.alpha_coeff,
                beta=LinearParameter(.4,
                                     threshold_value=1,
                                     n=max_steps // train_frequency))
        else:
            replay_memory = None

        # Agent
        algorithm_params = dict(
            batch_size=args.batch_size,
            target_update_frequency=target_update_frequency // train_frequency,
            replay_memory=replay_memory,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size)

        if args.algorithm == 'dqn':
            alg = DQN
            agent = alg(mdp.info,
                        pi,
                        approximator,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'ddqn':
            alg = DoubleDQN
            agent = alg(mdp.info,
                        pi,
                        approximator,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'adqn':
            alg = AveragedDQN
            agent = alg(mdp.info,
                        pi,
                        approximator,
                        approximator_params=approximator_params,
                        n_approximators=args.n_approximators,
                        **algorithm_params)
        elif args.algorithm == 'mmdqn':
            alg = MaxminDQN
            agent = alg(mdp.info,
                        pi,
                        approximator,
                        approximator_params=approximator_params,
                        n_approximators=args.n_approximators,
                        **algorithm_params)
        elif args.algorithm == 'dueldqn':
            alg = DuelingDQN
            agent = alg(mdp.info,
                        pi,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'cdqn':
            alg = CategoricalDQN
            agent = alg(mdp.info,
                        pi,
                        approximator_params=approximator_params,
                        n_atoms=args.n_atoms,
                        v_min=args.v_min,
                        v_max=args.v_max,
                        **algorithm_params)
        elif args.algorithm == 'ndqn':
            alg = NoisyDQN
            agent = alg(mdp.info,
                        pi,
                        approximator_params=approximator_params,
                        sigma_coeff=args.sigma_coeff,
                        **algorithm_params)
        elif args.algorithm == 'rainbow':
            alg = Rainbow
            beta = LinearParameter(.4,
                                   threshold_value=1,
                                   n=max_steps // train_frequency)
            agent = alg(mdp.info,
                        pi,
                        approximator_params=approximator_params,
                        n_atoms=args.n_atoms,
                        v_min=args.v_min,
                        v_max=args.v_max,
                        n_steps_return=args.n_steps_return,
                        alpha_coeff=args.alpha_coeff,
                        beta=beta,
                        sigma_coeff=args.sigma_coeff,
                        **algorithm_params)

        logger = Logger(alg.__name__, results_dir=None)
        logger.strong_line()
        logger.info('Optimal Policy Undiscounted Return: ' + str(opt_return))
        logger.info('Experiment Algorithm: ' + alg.__name__)

        # Algorithm
        core = Core(agent, mdp)

        # RUN

        # Fill replay memory with random dataset
        print_epoch(0, logger)
        core.learn(n_steps=initial_replay_size,
                   n_steps_per_fit=initial_replay_size,
                   quiet=args.quiet)

        if args.save:
            agent.save(folder_name + '/agent_0.msh')

        # Evaluate initial policy
        pi.set_epsilon(epsilon_test)
        dataset = core.evaluate(n_episodes=test_episodes,
                                render=args.render,
                                quiet=args.quiet)
        scores.append(get_stats(dataset, logger))

        np.save(folder_name + '/scores.npy', scores)
        for n_epoch in range(1, max_steps // evaluation_frequency + 1):
            print_epoch(n_epoch, logger)
            logger.info('- Learning:')
            # learning step
            pi.set_epsilon(epsilon)
            core.learn(n_steps=evaluation_frequency,
                       n_steps_per_fit=train_frequency,
                       quiet=args.quiet)

            if args.save:
                agent.save(folder_name + '/agent_' + str(n_epoch) + '.msh')

            logger.info('- Evaluation:')
            # evaluation step
            pi.set_epsilon(epsilon_test)
            dataset = core.evaluate(n_episodes=test_episodes,
                                    render=args.render,
                                    quiet=args.quiet)
            scores.append(get_stats(dataset, logger))

            np.save(folder_name + '/scores.npy', scores)

    return scores
Example #17
# Create a logger object, creating a log folder
logger = Logger('tutorial', results_dir='/tmp/logs',
                log_console=True)

# Create a logger object, without creating the log folder
logger_no_folder = Logger('tutorial_no_folder', results_dir=None)

# Write a line of hashtags, to be used as a separator
logger.strong_line()

# Print a debug message
logger.debug('This is a debug message')

# Print an info message
logger.info('This is an info message')

# Print a warning
logger.warning('This is a warning message')

# Print an error
logger.error('This is an error message')

# Print a critical error message
logger.critical('This is a critical error')

# Print a line of dashes, to be used as a (weak) separator
logger.weak_line()

# Exception logging
try:
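    # The original snippet is truncated here; a plausible completion of the
    # exception-logging block, assuming the Logger.exception method, would be:
    raise RuntimeError('A dummy exception raised for demonstration purposes')
except RuntimeError as e:
    logger.exception(e)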
Example #18
    agent = LSPI(mdp.info, pi, approximator_params=approximator_params,
                 fit_params=fit_params, features=features)

    # Algorithm
    core = Core(agent, mdp)
    core.evaluate(n_episodes=3, render=True)

    # Train
    core.learn(n_episodes=500, n_episodes_per_fit=500)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    dataset = core.evaluate(n_episodes=1, quiet=True)

    core.evaluate(n_steps=100, render=True)

    return np.mean(episodes_length(dataset))


if __name__ == '__main__':
    n_experiment = 1

    logger = Logger(LSPI.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + LSPI.__name__)

    steps = experiment()
    logger.info('Final episode length: %d' % steps)
Example #19
def experiment(n_epochs, n_steps, n_steps_per_fit, n_step_test):
    np.random.seed()

    logger = Logger(A2C.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + A2C.__name__)

    # MDP
    horizon = 1000
    gamma = 0.99
    gamma_eval = 1.
    mdp = Gym('Acrobot-v1', horizon, gamma)

    # Policy
    policy_params = dict(
        n_features=32,
        use_cuda=False
    )

    beta = Parameter(1e0)
    pi = BoltzmannTorchPolicy(Network,
                              mdp.info.observation_space.shape,
                              (mdp.info.action_space.n,),
                              beta=beta,
                              **policy_params)

    # Agent
    critic_params = dict(network=Network,
                         optimizer={'class': optim.RMSprop,
                                    'params': {'lr': 1e-3,
                                               'eps': 1e-5}},
                         loss=F.mse_loss,
                         n_features=32,
                         batch_size=64,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1,))

    alg_params = dict(actor_optimizer={'class': optim.RMSprop,
                                       'params': {'lr': 1e-3,
                                                  'eps': 3e-3}},
                      critic_params=critic_params,
                      ent_coeff=0.01
                      )

    agent = A2C(mdp.info, pi, **alg_params)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit)

    # RUN
    dataset = core.evaluate(n_steps=n_step_test, render=False)
    J = compute_J(dataset, gamma_eval)
    logger.epoch_info(0, J=np.mean(J))

    for n in trange(n_epochs):
        core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit)
        dataset = core.evaluate(n_steps=n_step_test, render=False)
        J = compute_J(dataset, gamma_eval)
        logger.epoch_info(n+1, J=np.mean(J))

    logger.info('Press a button to visualize acrobot')
    input()
    core.evaluate(n_episodes=5, render=True)
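
A possible entry point; the counts below are illustrative assumptions:

if __name__ == '__main__':
    experiment(n_epochs=40, n_steps=1000, n_steps_per_fit=5, n_step_test=2000)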
Example #20
def experiment(alg, n_epochs, n_steps, n_steps_test):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    horizon = 200
    gamma = 0.99
    mdp = Gym('Pendulum-v1', horizon, gamma)

    # Settings
    initial_replay_size = 64
    max_replay_size = 50000
    batch_size = 64
    n_features = 64
    warmup_transitions = 100
    tau = 0.005
    lr_alpha = 3e-4

    use_cuda = torch.cuda.is_available()

    # Approximator
    actor_input_shape = mdp.info.observation_space.shape
    actor_mu_params = dict(network=ActorNetwork,
                           n_features=n_features,
                           input_shape=actor_input_shape,
                           output_shape=mdp.info.action_space.shape,
                           use_cuda=use_cuda)
    actor_sigma_params = dict(network=ActorNetwork,
                              n_features=n_features,
                              input_shape=actor_input_shape,
                              output_shape=mdp.info.action_space.shape,
                              use_cuda=use_cuda)

    actor_optimizer = {'class': optim.Adam,
                       'params': {'lr': 3e-4}}

    critic_input_shape = (actor_input_shape[0] + mdp.info.action_space.shape[0],)
    critic_params = dict(network=CriticNetwork,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': 3e-4}},
                         loss=F.mse_loss,
                         n_features=n_features,
                         input_shape=critic_input_shape,
                         output_shape=(1,),
                         use_cuda=use_cuda)

    # Agent
    agent = alg(mdp.info, actor_mu_params, actor_sigma_params,
                actor_optimizer, critic_params, batch_size, initial_replay_size,
                max_replay_size, warmup_transitions, tau, lr_alpha,
                critic_fit_params=None)

    # Algorithm
    core = Core(agent, mdp)

    # RUN
    dataset = core.evaluate(n_steps=n_steps_test, render=False)
    s, *_ = parse_dataset(dataset)

    J = np.mean(compute_J(dataset, mdp.info.gamma))
    R = np.mean(compute_J(dataset))
    E = agent.policy.entropy(s)

    logger.epoch_info(0, J=J, R=R, entropy=E)

    core.learn(n_steps=initial_replay_size, n_steps_per_fit=initial_replay_size)

    for n in trange(n_epochs, leave=False):
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset = core.evaluate(n_steps=n_steps_test, render=False)
        s, *_ = parse_dataset(dataset)

        J = np.mean(compute_J(dataset, mdp.info.gamma))
        R = np.mean(compute_J(dataset))
        E = agent.policy.entropy(s)

        logger.epoch_info(n+1, J=J, R=R, entropy=E)

    logger.info('Press a button to visualize pendulum')
    input()
    core.evaluate(n_episodes=5, render=True)
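Example #20 expects alg to be a SAC-style constructor taking separate mu/sigma actor parameter dictionaries, with ActorNetwork and CriticNetwork defined elsewhere as torch modules. A hypothetical invocation, with an assumed import path and illustrative counts:

if __name__ == '__main__':
    from mushroom_rl.algorithms.actor_critic import SAC  # assumed import path

    # Illustrative schedule: 40 epochs of 1000 environment steps each.
    experiment(alg=SAC, n_epochs=40, n_steps=1000, n_steps_test=2000)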
Example #21
0
def experiment(n_epochs, n_steps, n_steps_test):
    np.random.seed()

    logger = Logger(DQN.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + DQN.__name__)

    # MDP
    horizon = 1000
    gamma = 0.99
    gamma_eval = 1.
    mdp = Gym('Acrobot-v1', horizon, gamma)

    # Policy
    epsilon = LinearParameter(value=1., threshold_value=.01, n=5000)
    epsilon_test = Parameter(value=0.)
    epsilon_random = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon_random)

    # Settings
    initial_replay_size = 500
    max_replay_size = 5000
    target_update_frequency = 100
    batch_size = 200
    n_features = 80
    train_frequency = 1

    # Approximator
    input_shape = mdp.info.observation_space.shape
    approximator_params = dict(network=Network,
                               optimizer={
                                   'class': optim.Adam,
                                   'params': {
                                       'lr': .001
                                   }
                               },
                               loss=F.smooth_l1_loss,
                               n_features=n_features,
                               input_shape=input_shape,
                               output_shape=mdp.info.action_space.size,
                               n_actions=mdp.info.action_space.n)

    # Agent
    agent = DQN(mdp.info,
                pi,
                TorchApproximator,
                approximator_params=approximator_params,
                batch_size=batch_size,
                initial_replay_size=initial_replay_size,
                max_replay_size=max_replay_size,
                target_update_frequency=target_update_frequency)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    # RUN
    pi.set_epsilon(epsilon_test)
    dataset = core.evaluate(n_steps=n_steps_test, render=False)
    J = compute_J(dataset, gamma_eval)
    logger.epoch_info(0, J=np.mean(J))

    for n in trange(n_epochs):
        pi.set_epsilon(epsilon)
        core.learn(n_steps=n_steps, n_steps_per_fit=train_frequency)
        pi.set_epsilon(epsilon_test)
        dataset = core.evaluate(n_steps=n_steps_test, render=False)
        J = compute_J(dataset, gamma_eval)
        logger.epoch_info(n + 1, J=np.mean(J))

    logger.info('Press a button to visualize acrobot')
    input()
    core.evaluate(n_episodes=5, render=True)
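Example #21 likewise ships without a driver; a hypothetical one is sketched below. TorchApproximator is assumed to come from mushroom_rl.approximators.parametric, and the step counts are illustrative.

if __name__ == '__main__':
    # Illustrative values; the Acrobot task and DQN settings above are unchanged.
    experiment(n_epochs=20, n_steps=1000, n_steps_test=2000)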
Example #22
0
    algorithm_params = {'learning_rate': learning_rate, 'lambda_coeff': .9}

    agent = TrueOnlineSARSALambda(mdp.info,
                                  pi,
                                  approximator_params=approximator_params,
                                  features=features,
                                  **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=40, n_steps_per_fit=1, render=False)
    dataset = core.evaluate(n_episodes=1, render=True)

    return np.mean(compute_J(dataset, 1.))


if __name__ == '__main__':
    n_experiment = 1

    logger = Logger(TrueOnlineSARSALambda.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + TrueOnlineSARSALambda.__name__)

    alpha = .1
    Js = Parallel(n_jobs=-1)(delayed(experiment)(alpha)
                             for _ in range(n_experiment))

    logger.info('J: %f' % np.mean(Js))
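The Example #22 fragment starts mid-function: mdp, pi, features, approximator_params and learning_rate are created in a preamble that is not included above. A minimal sketch of a tile-coding setup that would make the fragment runnable (the environment choice and every value below are assumptions, not the original code):

def experiment(alpha):
    np.random.seed()

    # Assumed episodic task; the original environment is not shown.
    mdp = Gym('MountainCar-v0', horizon=10000, gamma=1.)

    # Greedy policy; exploration is not the focus of this sketch.
    pi = EpsGreedy(epsilon=Parameter(value=0.))

    # Tile-coding features over the 2D observation space.
    n_tilings = 10
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    learning_rate = Parameter(alpha / n_tilings)
    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    # ...the fragment above continues from 'algorithm_params' onwards.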
Example #23
0
def experiment(n_epochs, n_episodes):
    np.random.seed()

    logger = Logger(StochasticAC_AVG.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + StochasticAC_AVG.__name__)

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 11
    alpha_r = Parameter(.0001)
    alpha_theta = Parameter(.001 / n_tilings)
    alpha_v = Parameter(.1 / n_tilings)
    tilings = Tiles.generate(n_tilings - 1, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    tilings_v = tilings + Tiles.generate(
        1, [1, 1], mdp.info.observation_space.low,
        mdp.info.observation_space.high + 1e-3)
    psi = Features(tilings=tilings_v)

    input_shape = (phi.size, )

    mu = Regressor(LinearApproximator,
                   input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    std = Regressor(LinearApproximator,
                    input_shape=input_shape,
                    output_shape=mdp.info.action_space.shape)

    std_0 = np.sqrt(1.)
    std.set_weights(np.log(std_0) / n_tilings * np.ones(std.weights_size))

    policy = StateLogStdGaussianPolicy(mu, std)

    agent = StochasticAC_AVG(mdp.info,
                             policy,
                             alpha_theta,
                             alpha_v,
                             alpha_r,
                             lambda_par=.5,
                             value_function_features=psi,
                             policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    display_callback = Display(agent._V, mu, std,
                               mdp.info.observation_space.low,
                               mdp.info.observation_space.high, phi, psi)
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=n_episodes, n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.)
        dataset_callback.clean()
        display_callback()
        logger.epoch_info(i + 1, R_mean=np.sum(J) / n_steps / n_episodes)

    logger.info('Press a button to visualize the pendulum...')
    input()
    core.evaluate(n_steps=n_steps, render=True)
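Example #23's driver is also omitted, and Display is a user-defined visualization callback rather than part of mushroom_rl. A hypothetical call, with illustrative counts:

if __name__ == '__main__':
    experiment(n_epochs=25, n_episodes=10)  # illustrative counts only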
Example #24
0
def experiment(alg, n_epochs, n_steps, n_episodes_test):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    gamma = 0.99
    habitat_root_path = Habitat.root_path()
    config_file = os.path.join(
        habitat_root_path, 'habitat_baselines/config/rearrange/rl_pick.yaml')
    base_config_file = os.path.join(habitat_root_path,
                                    'configs/tasks/rearrange/pick.yaml')
    wrapper = 'HabitatRearrangeWrapper'
    mdp = Habitat(wrapper, config_file, base_config_file, gamma=gamma)

    # Settings
    initial_replay_size = 64
    max_replay_size = 50000
    batch_size = 64
    n_features = 64
    warmup_transitions = 100
    tau = 0.005
    lr_alpha = 3e-4

    use_cuda = torch.cuda.is_available()

    # Approximator
    actor_input_shape = mdp.info.observation_space.shape
    actor_mu_params = dict(network=ActorNetwork,
                           n_features=n_features,
                           input_shape=actor_input_shape,
                           output_shape=mdp.info.action_space.shape,
                           use_cuda=use_cuda)
    actor_sigma_params = dict(network=ActorNetwork,
                              n_features=n_features,
                              input_shape=actor_input_shape,
                              output_shape=mdp.info.action_space.shape,
                              use_cuda=use_cuda)

    actor_optimizer = {'class': optim.Adam, 'params': {'lr': 3e-4}}

    critic_input_shape = actor_input_shape + mdp.info.action_space.shape
    critic_params = dict(network=CriticNetwork,
                         optimizer={
                             'class': optim.Adam,
                             'params': {
                                 'lr': 3e-4
                             }
                         },
                         loss=F.mse_loss,
                         n_features=n_features,
                         input_shape=critic_input_shape,
                         output_shape=(1, ),
                         use_cuda=use_cuda)

    # Agent
    agent = alg(mdp.info,
                actor_mu_params,
                actor_sigma_params,
                actor_optimizer,
                critic_params,
                batch_size,
                initial_replay_size,
                max_replay_size,
                warmup_transitions,
                tau,
                lr_alpha,
                critic_fit_params=None)

    # Algorithm
    core = Core(agent, mdp)

    # RUN
    dataset = core.evaluate(n_episodes=n_episodes_test, render=False)
    s, *_ = parse_dataset(dataset)

    J = np.mean(compute_J(dataset, mdp.info.gamma))
    R = np.mean(compute_J(dataset))
    E = agent.policy.entropy(s)

    logger.epoch_info(0, J=J, R=R, entropy=E)

    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    for n in trange(n_epochs, leave=False):
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset = core.evaluate(n_episodes=n_episodes_test, render=False)
        s, *_ = parse_dataset(dataset)

        J = np.mean(compute_J(dataset, mdp.info.gamma))
        R = np.mean(compute_J(dataset))
        E = agent.policy.entropy(s)

        logger.epoch_info(n + 1, J=J, R=R, entropy=E)

    logger.info('Press a button to visualize the robot')
    input()
    core.evaluate(n_episodes=5, render=True)
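Example #24 additionally requires habitat-lab and habitat-baselines to be installed, and imports os together with the Habitat wrapper (assumed to live in mushroom_rl.environments.habitat_env). A hypothetical driver with an assumed SAC import and illustrative counts:

if __name__ == '__main__':
    from mushroom_rl.algorithms.actor_critic import SAC  # assumed import path

    # Habitat rollouts are expensive, so the test budget is kept small.
    experiment(alg=SAC, n_epochs=50, n_steps=1000, n_episodes_test=5)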
Example #25
0
    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get()

    return reward, max_Qs


if __name__ == '__main__':
    n_experiment = 10000

    logger = Logger(QLearning.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + QLearning.__name__)

    names = {
        1: '1',
        .8: '08',
        QLearning: 'Q',
        DoubleQLearning: 'DQ',
        WeightedQLearning: 'WQ',
        SpeedyQLearning: 'SPQ',
        SARSA: 'SARSA'
    }

    for e in [1, .8]:
        logger.info('Exp: %s' % e)
        fig = plt.figure()
        plt.suptitle(names[e])