Example #1
def experiment():
    np.random.seed()

    # Argument parser
    parser = argparse.ArgumentParser()

    arg_game = parser.add_argument_group('Game')
    arg_game.add_argument("--name",
                          type=str,
                          default='BreakoutDeterministic-v4',
                          help='Gym ID of the Atari game.')
    arg_game.add_argument("--screen-width",
                          type=int,
                          default=84,
                          help='Width of the game screen.')
    arg_game.add_argument("--screen-height",
                          type=int,
                          default=84,
                          help='Height of the game screen.')

    arg_mem = parser.add_argument_group('Replay Memory')
    arg_mem.add_argument("--initial-replay-size",
                         type=int,
                         default=50000,
                         help='Initial size of the replay memory.')
    arg_mem.add_argument("--max-replay-size",
                         type=int,
                         default=500000,
                         help='Max size of the replay memory.')
    arg_mem.add_argument("--prioritized",
                         action='store_true',
                         help='Whether to use prioritized memory or not.')

    arg_net = parser.add_argument_group('Deep Q-Network')
    arg_net.add_argument(
        "--optimizer",
        choices=['adadelta', 'adam', 'rmsprop', 'rmspropcentered'],
        default='rmsprop',
        help='Name of the optimizer to use.')
    arg_net.add_argument("--learning-rate",
                         type=float,
                         default=.00025,
                         help='Learning rate value of the optimizer.')
    arg_net.add_argument("--decay",
                         type=float,
                         default=.95,
                         help='Discount factor for the history coming from the'
                         ' gradient momentum in rmspropcentered and'
                         ' rmsprop.')
    arg_net.add_argument("--epsilon",
                         type=float,
                         default=1e-8,
                         help='Epsilon term used in rmspropcentered and'
                         ' rmsprop.')

    arg_alg = parser.add_argument_group('Algorithm')
    arg_alg.add_argument("--algorithm",
                         choices=['dqn', 'ddqn', 'adqn', 'mmdqn', 'cdqn'],
                         default='dqn',
                         help='Name of the algorithm. dqn is for standard'
                         ' DQN, ddqn is for Double DQN, adqn is for Averaged'
                         ' DQN, mmdqn is for Maxmin DQN and cdqn is for'
                         ' Categorical DQN.')
    arg_alg.add_argument(
        "--n-approximators",
        type=int,
        default=1,
        help="Number of approximators used in the ensemble for"
        "AveragedDQN or MaxminDQN.")
    arg_alg.add_argument("--batch-size",
                         type=int,
                         default=32,
                         help='Batch size for each fit of the network.')
    arg_alg.add_argument("--history-length",
                         type=int,
                         default=4,
                         help='Number of frames composing a state.')
    arg_alg.add_argument("--target-update-frequency",
                         type=int,
                         default=10000,
                         help='Number of collected samples before each update'
                         ' of the target network.')
    arg_alg.add_argument("--evaluation-frequency",
                         type=int,
                         default=250000,
                         help='Number of collected samples before each'
                         ' evaluation. An epoch ends after this number of'
                         ' steps.')
    arg_alg.add_argument("--train-frequency",
                         type=int,
                         default=4,
                         help='Number of collected samples before each fit of'
                         ' the neural network.')
    arg_alg.add_argument("--max-steps",
                         type=int,
                         default=50000000,
                         help='Total number of collected samples.')
    arg_alg.add_argument(
        "--final-exploration-frame",
        type=int,
        default=1000000,
        help='Number of collected samples until the exploration'
        ' rate stops decreasing.')
    arg_alg.add_argument("--initial-exploration-rate",
                         type=float,
                         default=1.,
                         help='Initial value of the exploration rate.')
    arg_alg.add_argument("--final-exploration-rate",
                         type=float,
                         default=.1,
                         help='Final value of the exploration rate. When it'
                         ' reaches this value, it stays constant.')
    arg_alg.add_argument("--test-exploration-rate",
                         type=float,
                         default=.05,
                         help='Exploration rate used during evaluation.')
    arg_alg.add_argument("--test-samples",
                         type=int,
                         default=125000,
                         help='Number of collected samples for each'
                         ' evaluation.')
    arg_alg.add_argument(
        "--max-no-op-actions",
        type=int,
        default=30,
        help='Maximum number of no-op actions performed at the'
        ' beginning of the episodes.')
    arg_alg.add_argument("--n-atoms",
                         type=int,
                         default=51,
                         help='Number of atoms for Categorical DQN.')
    arg_alg.add_argument("--v-min",
                         type=int,
                         default=-10,
                         help='Minimum action-value for Categorical DQN.')
    arg_alg.add_argument("--v-max",
                         type=int,
                         default=10,
                         help='Maximum action-value for Categorical DQN.')

    arg_utils = parser.add_argument_group('Utils')
    arg_utils.add_argument('--use-cuda',
                           action='store_true',
                           help='Flag specifying whether to use the GPU.')
    arg_utils.add_argument('--save',
                           action='store_true',
                           help='Flag specifying whether to save the model.')
    arg_utils.add_argument('--load-path',
                           type=str,
                           help='Path of the model to be loaded.')
    arg_utils.add_argument('--render',
                           action='store_true',
                           help='Flag specifying whether to render the game.')
    arg_utils.add_argument('--quiet',
                           action='store_true',
                           help='Flag specifying whether to hide the progress'
                           ' bar.')
    arg_utils.add_argument('--debug',
                           action='store_true',
                           help='Flag specifying whether the script has to be'
                           ' run in debug mode.')

    args = parser.parse_args()

    scores = list()

    optimizer = dict()
    if args.optimizer == 'adam':
        optimizer['class'] = optim.Adam
        optimizer['params'] = dict(lr=args.learning_rate, eps=args.epsilon)
    elif args.optimizer == 'adadelta':
        optimizer['class'] = optim.Adadelta
        optimizer['params'] = dict(lr=args.learning_rate, eps=args.epsilon)
    elif args.optimizer == 'rmsprop':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate,
                                   alpha=args.decay,
                                   eps=args.epsilon)
    elif args.optimizer == 'rmspropcentered':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate,
                                   alpha=args.decay,
                                   eps=args.epsilon,
                                   centered=True)
    else:
        raise ValueError

    # Summary folder
    folder_name = './logs/atari_' + args.algorithm + '_' + args.name +\
        '_' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    pathlib.Path(folder_name).mkdir(parents=True)

    # Settings
    if args.debug:
        initial_replay_size = 50
        max_replay_size = 500
        train_frequency = 5
        target_update_frequency = 10
        test_samples = 20
        evaluation_frequency = 50
        max_steps = 1000
    else:
        initial_replay_size = args.initial_replay_size
        max_replay_size = args.max_replay_size
        train_frequency = args.train_frequency
        target_update_frequency = args.target_update_frequency
        test_samples = args.test_samples
        evaluation_frequency = args.evaluation_frequency
        max_steps = args.max_steps

    # MDP
    mdp = Atari(args.name,
                args.screen_width,
                args.screen_height,
                ends_at_life=True,
                history_length=args.history_length,
                max_no_op_actions=args.max_no_op_actions)

    if args.load_path:
        # Agent
        agent = DQN.load(args.load_path)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        agent.policy.set_epsilon(epsilon_test)

        # Algorithm
        core_test = Core(agent, mdp)

        # Evaluate model
        dataset = core_test.evaluate(n_steps=args.test_samples,
                                     render=args.render,
                                     quiet=args.quiet)
        get_stats(dataset)

    else:
        # Policy
        epsilon = LinearParameter(value=args.initial_exploration_rate,
                                  threshold_value=args.final_exploration_rate,
                                  n=args.final_exploration_frame)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        epsilon_random = Parameter(value=1)
        pi = EpsGreedy(epsilon=epsilon_random)

        class CategoricalLoss(nn.Module):
            def forward(self, input, target):
                input = input.clamp(1e-5)

                return -torch.sum(target * torch.log(input))

        # Approximator
        approximator_params = dict(
            network=Network if args.algorithm != 'cdqn' else FeatureNetwork,
            input_shape=mdp.info.observation_space.shape,
            output_shape=(mdp.info.action_space.n, ),
            n_actions=mdp.info.action_space.n,
            n_features=Network.n_features,
            optimizer=optimizer,
            loss=F.smooth_l1_loss
            if args.algorithm != 'cdqn' else CategoricalLoss(),
            use_cuda=args.use_cuda)

        approximator = TorchApproximator

        if args.prioritized:
            replay_memory = PrioritizedReplayMemory(
                initial_replay_size,
                max_replay_size,
                alpha=.6,
                beta=LinearParameter(.4,
                                     threshold_value=1,
                                     n=max_steps // train_frequency))
        else:
            replay_memory = None

        # Agent
        algorithm_params = dict(
            batch_size=args.batch_size,
            target_update_frequency=target_update_frequency // train_frequency,
            replay_memory=replay_memory,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size)

        if args.algorithm == 'dqn':
            agent = DQN(mdp.info,
                        pi,
                        approximator,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'ddqn':
            agent = DoubleDQN(mdp.info,
                              pi,
                              approximator,
                              approximator_params=approximator_params,
                              **algorithm_params)
        elif args.algorithm == 'adqn':
            agent = AveragedDQN(mdp.info,
                                pi,
                                approximator,
                                approximator_params=approximator_params,
                                n_approximators=args.n_approximators,
                                **algorithm_params)
        elif args.algorithm == 'mmdqn':
            agent = MaxminDQN(mdp.info,
                              pi,
                              approximator,
                              approximator_params=approximator_params,
                              n_approximators=args.n_approximators,
                              **algorithm_params)
        elif args.algorithm == 'cdqn':
            agent = CategoricalDQN(mdp.info,
                                   pi,
                                   approximator_params=approximator_params,
                                   n_atoms=args.n_atoms,
                                   v_min=args.v_min,
                                   v_max=args.v_max,
                                   **algorithm_params)

        # Algorithm
        core = Core(agent, mdp)

        # RUN

        # Fill replay memory with random dataset
        print_epoch(0)
        core.learn(n_steps=initial_replay_size,
                   n_steps_per_fit=initial_replay_size,
                   quiet=args.quiet)

        if args.save:
            agent.save(folder_name + '/agent_0.msh')

        # Evaluate initial policy
        pi.set_epsilon(epsilon_test)
        mdp.set_episode_end(False)
        dataset = core.evaluate(n_steps=test_samples,
                                render=args.render,
                                quiet=args.quiet)
        scores.append(get_stats(dataset))

        np.save(folder_name + '/scores.npy', scores)
        for n_epoch in range(1, max_steps // evaluation_frequency + 1):
            print_epoch(n_epoch)
            print('- Learning:')
            # learning step
            pi.set_epsilon(epsilon)
            mdp.set_episode_end(True)
            core.learn(n_steps=evaluation_frequency,
                       n_steps_per_fit=train_frequency,
                       quiet=args.quiet)

            if args.save:
                agent.save(folder_name + '/agent_' + str(n_epoch) + '.msh')

            print('- Evaluation:')
            # evaluation step
            pi.set_epsilon(epsilon_test)
            mdp.set_episode_end(False)
            dataset = core.evaluate(n_steps=test_samples,
                                    render=args.render,
                                    quiet=args.quiet)
            scores.append(get_stats(dataset))

            np.save(folder_name + '/scores.npy', scores)

    return scores
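Example #1 relies on imports and two helper functions (print_epoch and get_stats) that are not shown in the snippet, and it assumes that Network and FeatureNetwork are defined elsewhere in the same script. A minimal sketch of the missing pieces is given below; the import paths follow the MushroomRL 1.x layout and the helper bodies are illustrative assumptions, not the original definitions.

# Assumed imports and helpers for Example #1 (MushroomRL 1.x / PyTorch);
# adjust the module paths if your installed version differs.
import argparse
import datetime
import pathlib

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from mushroom_rl.algorithms.value import (AveragedDQN, CategoricalDQN, DQN,
                                          DoubleDQN, MaxminDQN)
from mushroom_rl.approximators.parametric import TorchApproximator
from mushroom_rl.core import Core
from mushroom_rl.environments import Atari
from mushroom_rl.policy import EpsGreedy
from mushroom_rl.utils.dataset import compute_metrics
from mushroom_rl.utils.parameters import LinearParameter, Parameter
from mushroom_rl.utils.replay_memory import PrioritizedReplayMemory


def print_epoch(epoch):
    # Illustrative helper: print a visual separator for each epoch.
    print('#' * 64)
    print('Epoch:', epoch)
    print('-' * 64)


def get_stats(dataset):
    # Illustrative helper: compute_metrics summarizes the evaluation dataset
    # (min/max/mean reward and completed games; the exact tuple layout
    # depends on the MushroomRL version).
    score = compute_metrics(dataset)
    print('Evaluation metrics:', score)
    return score


if __name__ == '__main__':
    experiment()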
Example #2
# Opponent
nme = Greedy(battle_format='gen8randombattle',
             start_timer_on_battle_start=True)

# Training loop
# Fill replay memory with random dataset
print_epoch(0)
mdp.start_battles(nme)
# core.learn(n_steps=initial_replay_size,
#           n_steps_per_fit=initial_replay_size)
core.learn(n_episodes=initial_replay_size,
           n_episodes_per_fit=initial_replay_size)
mdp.end_battles()
# Evaluate initial policy
pi.set_epsilon(epsilon_test)
# mdp.set_episode_end(False)
mdp.start_battles(nme)
dataset = core.evaluate(n_episodes=test_episodes)
mdp.end_battles()
scores.append(get_stats(dataset))

N_STEPS = max_steps // evaluation_frequency + 1
for n_epoch in range(1, N_STEPS):
    if n_epoch % 5 == 0 or n_epoch == N_STEPS-1:
        torch.save(core.agent.approximator.model.network, f'checkpoints/torch/checkpt_epoch{n_epoch}')
        core.agent.save(f'checkpoints/mushroom/checkpt_epoch{n_epoch}')
    print_epoch(n_epoch)
    print('- Learning:')
    # learning step
    pi.set_epsilon(epsilon)
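The fragment of Example #2 ends right after switching the policy back to the training exploration rate. Based on the epoch loop used in Examples #1 and #3, the body would plausibly continue roughly as sketched below; the battle start/stop calls mirror the ones earlier in the fragment, and whether learning is counted in steps or in episodes is an assumption here.

    # Continuation sketch of the epoch body (assumed, mirroring Example #1).
    # Learn against the opponent for one evaluation period...
    mdp.start_battles(nme)
    core.learn(n_steps=evaluation_frequency,
               n_steps_per_fit=train_frequency)
    mdp.end_battles()

    print('- Evaluation:')
    # ...then evaluate with the test exploration rate and store the stats.
    pi.set_epsilon(epsilon_test)
    mdp.start_battles(nme)
    dataset = core.evaluate(n_episodes=test_episodes)
    mdp.end_battles()
    scores.append(get_stats(dataset))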
Example #3
def experiment():
    np.random.seed()

    # Argument parser
    parser = argparse.ArgumentParser()

    arg_mem = parser.add_argument_group('Replay Memory')
    arg_mem.add_argument("--initial-replay-size",
                         type=int,
                         default=50000,
                         help='Initial size of the replay memory.')
    arg_mem.add_argument("--max-replay-size",
                         type=int,
                         default=500000,
                         help='Max size of the replay memory.')
    arg_mem.add_argument("--prioritized",
                         action='store_true',
                         help='Whether to use prioritized memory or not.')

    arg_net = parser.add_argument_group('Deep Q-Network')
    arg_net.add_argument(
        "--optimizer",
        choices=['adadelta', 'adam', 'rmsprop', 'rmspropcentered'],
        default='adam',
        help='Name of the optimizer to use.')
    arg_net.add_argument("--learning-rate",
                         type=float,
                         default=.0001,
                         help='Learning rate value of the optimizer.')
    arg_net.add_argument("--decay",
                         type=float,
                         default=.95,
                         help='Discount factor for the history coming from the'
                         ' gradient momentum in rmspropcentered and'
                         ' rmsprop.')
    arg_net.add_argument("--epsilon",
                         type=float,
                         default=1e-8,
                         help='Epsilon term used in rmspropcentered and'
                         ' rmsprop.')

    arg_alg = parser.add_argument_group('Algorithm')
    arg_alg.add_argument("--algorithm",
                         choices=[
                             'dqn', 'ddqn', 'adqn', 'mmdqn', 'cdqn', 'dueldqn',
                             'ndqn', 'rainbow'
                         ],
                         default='dqn',
                         help='Name of the algorithm. dqn is for standard'
                         ' DQN, ddqn is for Double DQN, adqn is for Averaged'
                         ' DQN, mmdqn is for Maxmin DQN, cdqn is for'
                         ' Categorical DQN, dueldqn is for Dueling DQN, ndqn'
                         ' is for Noisy DQN and rainbow is for Rainbow.')
    arg_alg.add_argument(
        "--n-approximators",
        type=int,
        default=1,
        help="Number of approximators used in the ensemble for"
        "AveragedDQN or MaxminDQN.")
    arg_alg.add_argument("--batch-size",
                         type=int,
                         default=32,
                         help='Batch size for each fit of the network.')
    arg_alg.add_argument("--history-length",
                         type=int,
                         default=4,
                         help='Number of frames composing a state.')
    arg_alg.add_argument("--target-update-frequency",
                         type=int,
                         default=10000,
                         help='Number of collected samples before each update'
                         ' of the target network.')
    arg_alg.add_argument("--evaluation-frequency",
                         type=int,
                         default=250000,
                         help='Number of collected samples before each'
                         ' evaluation. An epoch ends after this number of'
                         ' steps.')
    arg_alg.add_argument("--train-frequency",
                         type=int,
                         default=4,
                         help='Number of collected samples before each fit of'
                         ' the neural network.')
    arg_alg.add_argument("--max-steps",
                         type=int,
                         default=5000000,
                         help='Total number of collected samples.')
    arg_alg.add_argument(
        "--final-exploration-frame",
        type=int,
        default=10000000,
        help='Number of collected samples until the exploration'
        ' rate stops decreasing.')
    arg_alg.add_argument("--initial-exploration-rate",
                         type=float,
                         default=1.,
                         help='Initial value of the exploration rate.')
    arg_alg.add_argument("--final-exploration-rate",
                         type=float,
                         default=.1,
                         help='Final value of the exploration rate. When it'
                         ' reaches this value, it stays constant.')
    arg_alg.add_argument("--test-exploration-rate",
                         type=float,
                         default=.05,
                         help='Exploration rate used during evaluation.')
    arg_alg.add_argument("--test-episodes",
                         type=int,
                         default=5,
                         help='Number of episodes for each evaluation.')
    arg_alg.add_argument(
        "--alpha-coeff",
        type=float,
        default=.6,
        help='Prioritization exponent for prioritized experience replay.')
    arg_alg.add_argument("--n-atoms",
                         type=int,
                         default=51,
                         help='Number of atoms for Categorical DQN.')
    arg_alg.add_argument("--v-min",
                         type=int,
                         default=-10,
                         help='Minimum action-value for Categorical DQN.')
    arg_alg.add_argument("--v-max",
                         type=int,
                         default=10,
                         help='Maximum action-value for Categorical DQN.')
    arg_alg.add_argument("--n-steps-return",
                         type=int,
                         default=3,
                         help='Number of steps for n-step return for Rainbow.')
    arg_alg.add_argument("--sigma-coeff",
                         type=float,
                         default=.5,
                         help='Sigma0 coefficient for noise initialization in'
                         ' NoisyDQN and Rainbow.')

    arg_utils = parser.add_argument_group('Utils')
    arg_utils.add_argument('--use-cuda',
                           action='store_true',
                           help='Flag specifying whether to use the GPU.')
    arg_utils.add_argument('--save',
                           action='store_true',
                           help='Flag specifying whether to save the model.')
    arg_utils.add_argument('--load-path',
                           type=str,
                           help='Path of the model to be loaded.')
    arg_utils.add_argument('--render',
                           action='store_true',
                           help='Flag specifying whether to render the grid.')
    arg_utils.add_argument('--quiet',
                           action='store_true',
                           help='Flag specifying whether to hide the progress'
                           ' bar.')
    arg_utils.add_argument('--debug',
                           action='store_true',
                           help='Flag specifying whether the script has to be'
                           ' run in debug mode.')

    args = parser.parse_args()

    scores = list()

    optimizer = dict()
    if args.optimizer == 'adam':
        optimizer['class'] = optim.Adam
        optimizer['params'] = dict(lr=args.learning_rate, eps=args.epsilon)
    elif args.optimizer == 'adadelta':
        optimizer['class'] = optim.Adadelta
        optimizer['params'] = dict(lr=args.learning_rate, eps=args.epsilon)
    elif args.optimizer == 'rmsprop':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate,
                                   alpha=args.decay,
                                   eps=args.epsilon)
    elif args.optimizer == 'rmspropcentered':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate,
                                   alpha=args.decay,
                                   eps=args.epsilon,
                                   centered=True)
    else:
        raise ValueError

    # Summary folder
    folder_name = './logs/habitat_nav_' + args.algorithm +\
        '_' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    pathlib.Path(folder_name).mkdir(parents=True)

    # Settings
    if args.debug:
        initial_replay_size = 50
        max_replay_size = 500
        train_frequency = 5
        target_update_frequency = 10
        test_episodes = 20
        evaluation_frequency = 50
        max_steps = 1000
    else:
        initial_replay_size = args.initial_replay_size
        max_replay_size = args.max_replay_size
        train_frequency = args.train_frequency
        target_update_frequency = args.target_update_frequency
        test_episodes = args.test_episodes
        evaluation_frequency = args.evaluation_frequency
        max_steps = args.max_steps

    # MDP
    config_file = os.path.join(
        pathlib.Path(__file__).parent.resolve(),
        'pointnav_apartment-0.yaml')  # Custom task for Replica scenes
    wrapper = 'HabitatNavigationWrapper'
    mdp = Habitat(wrapper, config_file)
    opt_return = mdp.env.get_optimal_policy_return()

    if args.load_path:
        logger = Logger(DQN.__name__, results_dir=None)
        logger.strong_line()
        logger.info('Optimal Policy Undiscounted Return: ' + str(opt_return))
        logger.info('Experiment Algorithm: ' + DQN.__name__)

        # Agent
        agent = DQN.load(args.load_path)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        agent.policy.set_epsilon(epsilon_test)

        # Algorithm
        core_test = Core(agent, mdp)

        # Evaluate model
        dataset = core_test.evaluate(n_episodes=args.test_episodes,
                                     render=args.render,
                                     quiet=args.quiet)
        get_stats(dataset, logger)
    else:
        # Policy
        epsilon = LinearParameter(value=args.initial_exploration_rate,
                                  threshold_value=args.final_exploration_rate,
                                  n=args.final_exploration_frame)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        epsilon_random = Parameter(value=1)
        pi = EpsGreedy(epsilon=epsilon_random)

        # Approximator
        approximator_params = dict(
            network=Network if args.algorithm
            not in ['dueldqn', 'cdqn', 'ndqn', 'rainbow'] else FeatureNetwork,
            input_shape=mdp.info.observation_space.shape,
            output_shape=(mdp.info.action_space.n, ),
            n_actions=mdp.info.action_space.n,
            n_features=Network.n_features,
            optimizer=optimizer,
            use_cuda=args.use_cuda)
        if args.algorithm not in ['cdqn', 'rainbow']:
            approximator_params['loss'] = F.smooth_l1_loss

        approximator = TorchApproximator

        if args.prioritized:
            replay_memory = PrioritizedReplayMemory(
                initial_replay_size,
                max_replay_size,
                alpha=args.alpha_coeff,
                beta=LinearParameter(.4,
                                     threshold_value=1,
                                     n=max_steps // train_frequency))
        else:
            replay_memory = None

        # Agent
        algorithm_params = dict(
            batch_size=args.batch_size,
            target_update_frequency=target_update_frequency // train_frequency,
            replay_memory=replay_memory,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size)

        if args.algorithm == 'dqn':
            alg = DQN
            agent = alg(mdp.info,
                        pi,
                        approximator,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'ddqn':
            alg = DoubleDQN
            agent = alg(mdp.info,
                        pi,
                        approximator,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'adqn':
            alg = AveragedDQN
            agent = alg(mdp.info,
                        pi,
                        approximator,
                        approximator_params=approximator_params,
                        n_approximators=args.n_approximators,
                        **algorithm_params)
        elif args.algorithm == 'mmdqn':
            alg = MaxminDQN
            agent = alg(mdp.info,
                        pi,
                        approximator,
                        approximator_params=approximator_params,
                        n_approximators=args.n_approximators,
                        **algorithm_params)
        elif args.algorithm == 'dueldqn':
            alg = DuelingDQN
            agent = alg(mdp.info,
                        pi,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'cdqn':
            alg = CategoricalDQN
            agent = alg(mdp.info,
                        pi,
                        approximator_params=approximator_params,
                        n_atoms=args.n_atoms,
                        v_min=args.v_min,
                        v_max=args.v_max,
                        **algorithm_params)
        elif args.algorithm == 'ndqn':
            alg = NoisyDQN
            agent = alg(mdp.info,
                        pi,
                        approximator_params=approximator_params,
                        sigma_coeff=args.sigma_coeff,
                        **algorithm_params)
        elif args.algorithm == 'rainbow':
            alg = Rainbow
            beta = LinearParameter(.4,
                                   threshold_value=1,
                                   n=max_steps // train_frequency)
            agent = alg(mdp.info,
                        pi,
                        approximator_params=approximator_params,
                        n_atoms=args.n_atoms,
                        v_min=args.v_min,
                        v_max=args.v_max,
                        n_steps_return=args.n_steps_return,
                        alpha_coeff=args.alpha_coeff,
                        beta=beta,
                        sigma_coeff=args.sigma_coeff,
                        **algorithm_params)

        logger = Logger(alg.__name__, results_dir=None)
        logger.strong_line()
        logger.info('Optimal Policy Undiscounted Return: ' + str(opt_return))
        logger.info('Experiment Algorithm: ' + alg.__name__)

        # Algorithm
        core = Core(agent, mdp)

        # RUN

        # Fill replay memory with random dataset
        print_epoch(0, logger)
        core.learn(n_steps=initial_replay_size,
                   n_steps_per_fit=initial_replay_size,
                   quiet=args.quiet)

        if args.save:
            agent.save(folder_name + '/agent_0.msh')

        # Evaluate initial policy
        pi.set_epsilon(epsilon_test)
        dataset = core.evaluate(n_episodes=test_episodes,
                                render=args.render,
                                quiet=args.quiet)
        scores.append(get_stats(dataset, logger))

        np.save(folder_name + '/scores.npy', scores)
        for n_epoch in range(1, max_steps // evaluation_frequency + 1):
            print_epoch(n_epoch, logger)
            logger.info('- Learning:')
            # learning step
            pi.set_epsilon(epsilon)
            core.learn(n_steps=evaluation_frequency,
                       n_steps_per_fit=train_frequency,
                       quiet=args.quiet)

            if args.save:
                agent.save(folder_name + '/agent_' + str(n_epoch) + '.msh')

            logger.info('- Evaluation:')
            # evaluation step
            pi.set_epsilon(epsilon_test)
            dataset = core.evaluate(n_episodes=test_episodes,
                                    render=args.render,
                                    quiet=args.quiet)
            scores.append(get_stats(dataset, logger))

            np.save(folder_name + '/scores.npy', scores)

    return scores
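Example #3 passes a MushroomRL Logger to its helper functions. A minimal sketch of logger-aware print_epoch and get_stats helpers consistent with those calls is shown below; the bodies are illustrative assumptions, not the original definitions, and the import paths follow the MushroomRL 1.x layout.

# Assumed helpers for Example #3.
from mushroom_rl.core import Logger
from mushroom_rl.utils.dataset import compute_metrics


def print_epoch(epoch, logger):
    # Illustrative helper: log a separator and the current epoch index.
    logger.strong_line()
    logger.info('Epoch: ' + str(epoch))


def get_stats(dataset, logger):
    # Illustrative helper: log the evaluation metrics and return them so the
    # caller can append them to the scores list.
    score = compute_metrics(dataset)
    logger.info('Evaluation metrics: ' + str(score))
    return score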
Example #4
def experiment(n_epochs, n_steps, n_steps_test):
    np.random.seed()

    # MDP
    horizon = 1000
    gamma = 0.99
    gamma_eval = 1.
    mdp = Gym('Acrobot-v1', horizon, gamma)

    # Policy
    epsilon = LinearParameter(value=1., threshold_value=.01, n=5000)
    epsilon_test = Parameter(value=0.)
    epsilon_random = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon_random)

    # Settings
    initial_replay_size = 500
    max_replay_size = 5000
    target_update_frequency = 100
    batch_size = 200
    n_features = 80
    train_frequency = 1

    # Approximator
    input_shape = mdp.info.observation_space.shape
    approximator_params = dict(network=Network,
                               optimizer={
                                   'class': optim.Adam,
                                   'params': {
                                       'lr': .001
                                   }
                               },
                               loss=F.smooth_l1_loss,
                               n_features=n_features,
                               input_shape=input_shape,
                               output_shape=mdp.info.action_space.size,
                               n_actions=mdp.info.action_space.n)

    # Agent
    agent = DQN(mdp.info,
                pi,
                TorchApproximator,
                approximator_params=approximator_params,
                batch_size=batch_size,
                n_approximators=1,
                initial_replay_size=initial_replay_size,
                max_replay_size=max_replay_size,
                target_update_frequency=target_update_frequency)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    # RUN
    pi.set_epsilon(epsilon_test)
    dataset = core.evaluate(n_steps=n_steps_test, render=False)
    J = compute_J(dataset, gamma_eval)
    print('J: ', np.mean(J))

    for n in trange(n_epochs):
        tqdm.write('Epoch: ' + str(n))
        pi.set_epsilon(epsilon)
        core.learn(n_steps=n_steps, n_steps_per_fit=train_frequency)
        pi.set_epsilon(epsilon_test)
        dataset = core.evaluate(n_steps=n_steps_test, render=False)
        J = compute_J(dataset, gamma_eval)
        tqdm.write('J: ' + str(np.mean(J)))

    print('Press a button to visualize acrobot')
    input()
    core.evaluate(n_episodes=5, render=True)
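The Acrobot experiment above takes its epoch and step budgets as arguments. A minimal entry point could look like the following; the concrete numbers are illustrative, not taken from the original script. The lines after it are a separate, self-contained FQI example on the CarOnHill environment.

if __name__ == '__main__':
    # Illustrative budgets: 20 epochs of 1000 learning steps, each followed
    # by a 2000-step greedy evaluation.
    experiment(n_epochs=20, n_steps=1000, n_steps_test=2000)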
mdp = CarOnHill()

# Policy
epsilon = Parameter(value=1.)
pi = EpsGreedy(epsilon=epsilon)

# Approximator
approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                           n_actions=mdp.info.action_space.n,
                           n_estimators=50,
                           min_samples_split=5,
                           min_samples_leaf=2)
approximator = ExtraTreesRegressor

# Agent
agent = FQI(approximator,
            pi,
            mdp.info,
            n_iterations=20,
            approximator_params=approximator_params)

core = Core(agent, mdp)

core.learn(n_episodes=1000, n_episodes_per_fit=1000)

pi.set_epsilon(Parameter(0.))
initial_state = np.array([[-.5, 0.]])
dataset = core.evaluate(initial_states=initial_state)

print(compute_J(dataset, gamma=mdp.info.gamma))
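The CarOnHill / FQI snippet above assumes imports along the lines sketched below (MushroomRL-style layout plus scikit-learn); adjust the paths to your installed version. Note also that the FQI constructor is called here with the older (approximator, policy, mdp_info) argument order; recent MushroomRL releases may expect (mdp_info, policy, approximator, ...) instead, matching the DQN calls in the other examples.

# Assumed imports for the CarOnHill / FQI snippet.
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor

from mushroom_rl.algorithms.value import FQI
from mushroom_rl.core import Core
from mushroom_rl.environments import CarOnHill
from mushroom_rl.policy import EpsGreedy
from mushroom_rl.utils.dataset import compute_J
from mushroom_rl.utils.parameters import Parameter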
def experiment(mdp, params, prob=None):
    # Argument parser
    # parser = argparse.ArgumentParser()
    #
    # args = parser.parse_args()

    scores = list()

    optimizer = dict()
    if params['optimizer'] == 'adam':
        optimizer['class'] = optim.Adam
        optimizer['params'] = dict(lr=params['learning_rate'])
    elif params['optimizer'] == 'adadelta':
        optimizer['class'] = optim.Adadelta
        optimizer['params'] = dict(lr=params['learning_rate'])
    elif params['optimizer'] == 'rmsprop':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=params['learning_rate'],
                                   alpha=params['decay'])
    elif params['optimizer'] == 'rmspropcentered':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=params['learning_rate'],
                                   alpha=params['decay'],
                                   centered=True)
    else:
        raise ValueError

    # DQN learning run

    # Summary folder
    folder_name = os.path.join(PROJECT_DIR, 'logs', params['name'])
    if params['save']:
        pathlib.Path(folder_name).mkdir(parents=True)

    # Policy
    epsilon = ExponentialParameter(value=params['initial_exploration_rate'],
                                   exp=params['exploration_rate'],
                                   min_value=params['final_exploration_rate'],
                                   size=(1, ))

    epsilon_random = Parameter(value=1)
    epsilon_test = Parameter(value=0.01)
    pi = EpsGreedy(epsilon=epsilon_random)

    class CategoricalLoss(nn.Module):
        def forward(self, input, target):
            input = input.clamp(1e-5)

            return -torch.sum(target * torch.log(input))

    # Approximator
    input_shape = mdp.observation.shape

    resources = [[
        (mdp.north - mdp.devices[device][0]) / (mdp.north - mdp.south),
        (mdp.east - mdp.devices[device][1]) / (mdp.east - mdp.west)
    ] for device in mdp.device_ordering]
    edges = [[
        (mdp.north - mdp.graph.nodes[e[0]]['y']) / (mdp.north - mdp.south),
        (mdp.east - mdp.graph.nodes[e[0]]['x']) / (mdp.east - mdp.west),
        (mdp.north - mdp.graph.nodes[e[1]]['y']) / (mdp.north - mdp.south),
        (mdp.east - mdp.graph.nodes[e[1]]['x']) / (mdp.east - mdp.west)
    ] for e in mdp.graph.edges]

    N = {
        'SimpleResourceNetwork': SimpleResourceNetwork,
        'GraphConvolutionResourceNetwork': GraphConvolutionResourceNetwork,
    }[params['network']]
    N.n_features = params['hidden']

    approximator_params = dict(
        network=N,
        input_shape=input_shape,
        edges=edges,
        resources=resources,
        graph=mdp.graph,
        allow_wait=params['allow_wait'],
        long_term_q=params['long_term_q'],
        resource_embeddings=params['resource_embeddings'],
        edge_ordering=mdp.edge_ordering,
        device_ordering=mdp.device_ordering,
        resource_edges=mdp.resource_edges,
        output_shape=(mdp.info.action_space.n, ),
        n_actions=mdp.info.action_space.n,
        n_features=params['hidden'],
        optimizer=optimizer,
        loss=F.smooth_l1_loss,
        nn_scaling=params['nn_scaling'],
        # quiet=False,
        use_cuda=params['cuda'],
        load_path=params.get('load_path', None))

    approximator = TorchApproximator

    replay_memory = PrioritizedReplayMemory(
        params['initial_replay_size'],
        params['max_replay_size'],
        alpha=.6,
        beta=LinearParameter(.4,
                             threshold_value=1,
                             n=params['max_steps'] //
                             params['train_frequency']))

    # Agent
    algorithm_params = dict(
        batch_size=params['batch_size'],
        n_approximators=1,
        target_update_frequency=params['target_update_frequency'] //
        params['train_frequency'],
        replay_memory=replay_memory,
        initial_replay_size=params['initial_replay_size'],
        max_replay_size=params['max_replay_size'])

    clz = DoubleDQN if mdp.info.gamma >= 1 else SMDPDQN
    agent = clz(mdp.info,
                pi,
                approximator,
                approximator_params=approximator_params,
                **algorithm_params)

    lr_scheduler = torch.optim.lr_scheduler.StepLR(
        agent.approximator._impl.model._optimizer,
        step_size=1,
        gamma=params['lr_decay'],
        last_epoch=-1)  # params['max_steps'] // params['train_frequency']

    # Algorithm
    core = Core(agent, mdp)

    if 'weights' in params:
        best_weights = np.load(params['weights'])
        agent.approximator.set_weights(best_weights)
        agent.target_approximator.set_weights(best_weights)
    else:
        best_weights = agent.approximator.get_weights()

    # RUN
    pi.set_epsilon(epsilon_test)
    eval_days = [i for i in range(1, 356) if i % 13 == 1]
    ds = core.evaluate(initial_states=eval_days,
                       quiet=tuning,
                       render=params['save'])
    test_result = np.mean(compute_J(ds))
    test_result_discounted = np.mean(compute_J(ds, params['gamma']))
    print("discounted validation result", test_result_discounted)
    print("validation result", test_result)
    results = [(0, 0, test_result_discounted, test_result)]
    if params['save']:
        mdp.save_rendered(folder_name + "/epoch_init.mp4")

    # Fill replay memory with random dataset
    print_epoch(0)
    start = time()
    core.learn(n_steps=params['initial_replay_size'],
               n_steps_per_fit=params['initial_replay_size'],
               quiet=tuning)

    runtime = time() - start
    steps = 0

    if params['save']:
        with open(folder_name + "/params.json", "w") as f:
            json.dump(params, f, indent=4)
        if isinstance(agent, DQN):
            np.save(folder_name + '/weights-exp-0-0.npy',
                    agent.approximator.get_weights())

    best_score = -np.inf
    no_improvement = 0
    patience = 6

    if params['save']:
        np.save(folder_name + '/scores.npy', scores)
    for n_epoch in range(
            1, int(params['max_steps'] // params['evaluation_frequency'] + 1)):
        print_epoch(n_epoch)
        print('- Learning:')
        # learning step
        pi.set_epsilon(epsilon)
        # mdp.set_episode_end(True)
        start = time()
        core.learn(n_steps=params['evaluation_frequency'],
                   n_steps_per_fit=params['train_frequency'],
                   quiet=tuning)
        runtime += time() - start
        steps += params['evaluation_frequency']
        lr_scheduler.step()

        if params['save']:
            if isinstance(agent, DQN):
                np.save(
                    folder_name + '/weights-exp-0-' + str(n_epoch) + '.npy',
                    agent.approximator.get_weights())

        print('- Evaluation:')
        # evaluation step
        pi.set_epsilon(epsilon_test)
        ds = core.evaluate(initial_states=eval_days,
                           render=params['save'],
                           quiet=tuning)
        test_result_discounted = np.mean(compute_J(ds, params['gamma']))
        test_result = np.mean(compute_J(ds))
        print("discounted validation result", test_result_discounted)
        print("validation result", test_result)

        if params['save']:
            mdp.save_rendered(folder_name + ("/epoch%04d.mp4" % n_epoch))
        results.append((runtime, steps, test_result_discounted, test_result))

        if params['save']:
            np.savetxt(folder_name + '/scores.csv',
                       np.asarray(results),
                       delimiter=',')

        if test_result > best_score:
            no_improvement = 0
            best_score = test_result
            best_weights = agent.approximator.get_weights().copy()

            with open(folder_name + "/best_val.txt", "w") as f:
                f.write("%f" % test_result)
        else:
            no_improvement += 1
            if no_improvement >= patience:
                break

    print('---------- FINAL EVALUATION ---------')
    agent.approximator.set_weights(best_weights)
    agent.target_approximator.set_weights(best_weights)
    pi.set_epsilon(epsilon_test)
    eval_days = [i for i in range(1, 356) if i % 13 == 0]
    ds = core.evaluate(initial_states=eval_days,
                       render=params['save'],
                       quiet=tuning)
    test_result_discounted = np.mean(compute_J(ds, params['gamma']))
    test_result = np.mean(compute_J(ds))
    print("discounted test result", test_result_discounted)
    print("test result", test_result)

    with open(folder_name + "/test_result.txt", "w") as f:
        f.write("%f" % test_result)

    if params['save']:
        mdp.save_rendered(folder_name + "/epoch_test.mp4", 10000)

    return scores
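The last experiment is driven entirely by a params dictionary and a domain-specific mdp (it expects attributes such as graph, devices and device_ordering, plus the custom SimpleResourceNetwork / GraphConvolutionResourceNetwork models and the SMDPDQN agent), so it cannot be run standalone from this snippet. The template below only lists the keys the function reads, with purely illustrative placeholder values; note that tuning must also exist as a module-level flag, since the function passes it to the quiet= arguments without defining it.

# Illustrative params template covering the keys read by experiment();
# every value is a placeholder, not the original configuration.
tuning = False  # module-level flag used for the quiet= arguments

params = {
    'name': 'resource_dqn_run',
    'optimizer': 'adam',
    'learning_rate': 1e-4,
    'decay': .95,                    # only used by the rmsprop variants
    'network': 'GraphConvolutionResourceNetwork',
    'hidden': 64,
    'allow_wait': True,
    'long_term_q': False,
    'resource_embeddings': False,
    'nn_scaling': False,
    'cuda': False,
    'initial_exploration_rate': 1.,
    'exploration_rate': .5,          # exponent of the ExponentialParameter
    'final_exploration_rate': .05,
    'initial_replay_size': 5000,
    'max_replay_size': 50000,
    'batch_size': 32,
    'train_frequency': 4,
    'target_update_frequency': 1000,
    'evaluation_frequency': 10000,
    'max_steps': 100000,
    'lr_decay': .99,
    'gamma': .99,
    'save': False,
    # optional keys: 'load_path', 'weights'
}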