Example #1
def test_prioritized_dqn_save():
    replay_memory = PrioritizedReplayMemory(50,
                                            500,
                                            alpha=.6,
                                            beta=LinearParameter(
                                                .4,
                                                threshold_value=1,
                                                n=500 // 5))
    params = dict(batch_size=50,
                  n_approximators=1,
                  initial_replay_size=50,
                  max_replay_size=500,
                  target_update_frequency=50,
                  replay_memory=replay_memory)
    agent_save = learn(DQN, params)

    agent_path = './agentdir{}/'.format(datetime.now().strftime("%H%M%S%f"))

    agent_save.save(agent_path)
    agent_load = Agent.load(agent_path)

    shutil.rmtree(agent_path)

    for att, method in agent_save.__dict__.items():
        save_attr = getattr(agent_save, att)
        load_attr = getattr(agent_load, att)
        #print('{}: {}'.format(att, type(save_attr)))

        tu.assert_eq(save_attr, load_attr)
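A note on the beta schedule used throughout these examples: LinearParameter(.4, threshold_value=1, n=500 // 5) anneals the importance-sampling exponent beta linearly from 0.4 to 1 over 100 updates and then keeps it constant at the threshold value. A minimal sketch of that schedule in plain Python (the helper name linear_beta is ours, for illustration only; exactly when mushroom_rl advances the parameter is an assumption here):

def linear_beta(step, start=.4, end=1., n=500 // 5):
    # Linearly interpolate from `start` to `end` over `n` steps,
    # then stay constant at the threshold value `end`.
    return min(end, start + (end - start) * step / n)

# With n = 100, beta reaches 1 after 100 sampling updates and stays there.
print([round(linear_beta(s), 2) for s in (0, 50, 100, 200)])  # [0.4, 0.7, 1.0, 1.0]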
Example #2
def test_prioritized_dqn_save(tmpdir):
    agent_path = tmpdir / 'agent_{}'.format(
        datetime.now().strftime("%H%M%S%f"))

    replay_memory = PrioritizedReplayMemory(50,
                                            500,
                                            alpha=.6,
                                            beta=LinearParameter(
                                                .4,
                                                threshold_value=1,
                                                n=500 // 5))
    params = dict(batch_size=50,
                  initial_replay_size=50,
                  max_replay_size=500,
                  target_update_frequency=50,
                  replay_memory=replay_memory)
    agent_save = learn(DQN, params)

    agent_save.save(agent_path, full_save=True)
    agent_load = Agent.load(agent_path)

    for att, method in vars(agent_save).items():
        save_attr = getattr(agent_save, att)
        load_attr = getattr(agent_load, att)

        tu.assert_eq(save_attr, load_attr)
Example #3
    def __init__(self,
                 mdp_info,
                 policy,
                 approximator_params,
                 n_atoms,
                 v_min,
                 v_max,
                 n_steps_return,
                 alpha_coeff,
                 beta,
                 sigma_coeff=.5,
                 **params):
        """
        Constructor.

        Args:
            n_atoms (int): number of atoms;
            v_min (float): minimum value of the value function;
            v_max (float): maximum value of the value function;
            n_steps_return (int): number of steps used to compute the n-step return;
            alpha_coeff (float): prioritization exponent for prioritized experience replay;
            beta (Parameter): importance sampling coefficient for prioritized experience replay;
            sigma_coeff (float, .5): sigma0 coefficient for noise initialization in noisy layers.

        """
        features_network = approximator_params['network']
        params['approximator_params'] = deepcopy(approximator_params)
        params['approximator_params']['network'] = RainbowNetwork
        params['approximator_params']['features_network'] = features_network
        params['approximator_params']['n_atoms'] = n_atoms
        params['approximator_params']['v_min'] = v_min
        params['approximator_params']['v_max'] = v_max
        params['approximator_params']['sigma_coeff'] = sigma_coeff
        params['approximator_params']['loss'] = categorical_loss

        self._n_atoms = n_atoms
        self._v_min = v_min
        self._v_max = v_max
        self._delta = (v_max - v_min) / (n_atoms - 1)
        self._a_values = np.arange(v_min, v_max + self._delta, self._delta)
        self._n_steps_return = n_steps_return
        self._sigma_coeff = sigma_coeff

        params['replay_memory'] = PrioritizedReplayMemory(
            params['initial_replay_size'],
            params['max_replay_size'],
            alpha=alpha_coeff,
            beta=beta)

        self._add_save_attr(_n_atoms='primitive',
                            _v_min='primitive',
                            _v_max='primitive',
                            _delta='primitive',
                            _a_values='numpy',
                            _n_steps_return='primitive',
                            _sigma_coeff='primitive')

        super().__init__(mdp_info, policy, TorchApproximator, **params)
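The constructor above builds the support of the categorical distribution with np.arange(v_min, v_max + delta, delta). A short sketch of the same support using the Atari-style defaults from the scripts further down (v_min=-10, v_max=10, n_atoms=51); np.linspace is an equivalent way to write it for these values and sidesteps arange's floating-point endpoint behaviour (this is our rewrite, not the library's code):

import numpy as np

n_atoms, v_min, v_max = 51, -10., 10.
delta = (v_max - v_min) / (n_atoms - 1)        # 0.4, the atom spacing used above
a_values = np.linspace(v_min, v_max, n_atoms)  # 51 evenly spaced atoms

print(delta)                      # 0.4
print(a_values[0], a_values[-1])  # -10.0 10.0
print(len(a_values))              # 51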
Example #4
def test_prioritized_dqn():
    replay_memory = PrioritizedReplayMemory(
        50, 500, alpha=.6,
        beta=LinearParameter(.4, threshold_value=1, n=500 // 5)
    )
    params = dict(batch_size=50, initial_replay_size=50,
                  max_replay_size=500, target_update_frequency=50,
                  replay_memory=replay_memory)
    approximator = learn(DQN, params).approximator

    w = approximator.get_weights()
    w_test = np.array([-0.2410347, 0.39138362, 0.12457055, 0.60612524, -0.54973847,
                       -0.06486652, -0.07349031, 0.4376623, 0.14254288])

    assert np.allclose(w, w_test)
Example #5
def test_prioritized_dqn():
    replay_memory = PrioritizedReplayMemory(
        50, 500, alpha=.6,
        beta=LinearParameter(.4, threshold_value=1, n=500 // 5)
    )
    params = dict(batch_size=50, n_approximators=1, initial_replay_size=50,
                  max_replay_size=500, target_update_frequency=50,
                  replay_memory=replay_memory)
    approximator = learn(DQN, params)

    w = approximator.get_weights()
    w_test = np.array([-0.1384063, 0.48957556, 0.02254359, 0.50994426,
                       -0.56277484, -0.075808, -0.06829552, 0.3642576,
                       0.15519235])

    assert np.allclose(w, w_test)
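Examples #4 and #5 are regression tests: they compare the learned weights against hard-coded reference values with np.allclose, which by default uses a relative tolerance of 1e-5 and an absolute tolerance of 1e-8. A minimal illustration of that check (the array here is made up for the illustration, not taken from the tests):

import numpy as np

w = np.array([0.1, -0.2, 0.3])
print(np.allclose(w, w + 1e-9))  # True: tiny numerical drift stays within tolerance
print(np.allclose(w, w + 1e-3))  # False: drift exceeds the default tolerances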
Example #6
    def build(self, mdp_info):
        self.approximator_params[
            'input_shape'] = mdp_info.observation_space.shape
        self.approximator_params['output_shape'] = (mdp_info.action_space.n, )
        self.approximator_params['n_actions'] = mdp_info.action_space.n

        replay_memory = PrioritizedReplayMemory(
            self.alg_params['initial_replay_size'],
            self.alg_params['max_replay_size'],
            alpha=.6,
            beta=LinearParameter(.4, threshold_value=1, n=50000000 // 4))
        self.alg_params['replay_memory'] = replay_memory
        self.epsilon = LinearParameter(value=1, threshold_value=.05, n=1000000)
        self.epsilon_test = Parameter(value=.01)

        return DQN(mdp_info, self.policy, self.approximator,
                   self.approximator_params, **self.alg_params)
Example #7
def experiment():
    np.random.seed()

    # Argument parser
    parser = argparse.ArgumentParser()

    arg_game = parser.add_argument_group('Game')
    arg_game.add_argument("--name",
                          type=str,
                          default='BreakoutDeterministic-v4',
                          help='Gym ID of the Atari game.')
    arg_game.add_argument("--screen-width",
                          type=int,
                          default=84,
                          help='Width of the game screen.')
    arg_game.add_argument("--screen-height",
                          type=int,
                          default=84,
                          help='Height of the game screen.')

    arg_mem = parser.add_argument_group('Replay Memory')
    arg_mem.add_argument("--initial-replay-size",
                         type=int,
                         default=50000,
                         help='Initial size of the replay memory.')
    arg_mem.add_argument("--max-replay-size",
                         type=int,
                         default=500000,
                         help='Max size of the replay memory.')
    arg_mem.add_argument("--prioritized",
                         action='store_true',
                         help='Whether to use prioritized memory or not.')

    arg_net = parser.add_argument_group('Deep Q-Network')
    arg_net.add_argument(
        "--optimizer",
        choices=['adadelta', 'adam', 'rmsprop', 'rmspropcentered'],
        default='rmsprop',
        help='Name of the optimizer to use.')
    arg_net.add_argument("--learning-rate",
                         type=float,
                         default=.00025,
                         help='Learning rate value of the optimizer.')
    arg_net.add_argument("--decay",
                         type=float,
                         default=.95,
                         help='Discount factor for the history coming from the '
                         'gradient momentum in rmspropcentered and '
                         'rmsprop.')
    arg_net.add_argument("--epsilon",
                         type=float,
                         default=1e-8,
                         help='Epsilon term used in rmspropcentered and '
                         'rmsprop.')

    arg_alg = parser.add_argument_group('Algorithm')
    arg_alg.add_argument("--algorithm",
                         choices=['dqn', 'ddqn', 'adqn', 'mmdqn', 'cdqn'],
                         default='dqn',
                         help='Name of the algorithm. dqn is for standard '
                         'DQN, ddqn is for Double DQN and adqn is for '
                         'Averaged DQN.')
    arg_alg.add_argument(
        "--n-approximators",
        type=int,
        default=1,
        help="Number of approximators used in the ensemble for "
        "AveragedDQN or MaxminDQN.")
    arg_alg.add_argument("--batch-size",
                         type=int,
                         default=32,
                         help='Batch size for each fit of the network.')
    arg_alg.add_argument("--history-length",
                         type=int,
                         default=4,
                         help='Number of frames composing a state.')
    arg_alg.add_argument("--target-update-frequency",
                         type=int,
                         default=10000,
                         help='Number of collected samples before each update '
                         'of the target network.')
    arg_alg.add_argument("--evaluation-frequency",
                         type=int,
                         default=250000,
                         help='Number of collected samples before each '
                         'evaluation. An epoch ends after this number of '
                         'steps.')
    arg_alg.add_argument("--train-frequency",
                         type=int,
                         default=4,
                         help='Number of collected samples before each fit of '
                         'the neural network.')
    arg_alg.add_argument("--max-steps",
                         type=int,
                         default=50000000,
                         help='Total number of collected samples.')
    arg_alg.add_argument(
        "--final-exploration-frame",
        type=int,
        default=1000000,
        help='Number of collected samples until the exploration '
        'rate stops decreasing.')
    arg_alg.add_argument("--initial-exploration-rate",
                         type=float,
                         default=1.,
                         help='Initial value of the exploration rate.')
    arg_alg.add_argument("--final-exploration-rate",
                         type=float,
                         default=.1,
                         help='Final value of the exploration rate. When it '
                         'reaches this value, it stays constant.')
    arg_alg.add_argument("--test-exploration-rate",
                         type=float,
                         default=.05,
                         help='Exploration rate used during evaluation.')
    arg_alg.add_argument("--test-samples",
                         type=int,
                         default=125000,
                         help='Number of collected samples for each '
                         'evaluation.')
    arg_alg.add_argument(
        "--max-no-op-actions",
        type=int,
        default=30,
        help='Maximum number of no-op actions performed at the '
        'beginning of the episodes.')
    arg_alg.add_argument("--n-atoms",
                         type=int,
                         default=51,
                         help='Number of atoms for Categorical DQN.')
    arg_alg.add_argument("--v-min",
                         type=int,
                         default=-10,
                         help='Minimum action-value for Categorical DQN.')
    arg_alg.add_argument("--v-max",
                         type=int,
                         default=10,
                         help='Maximum action-value for Categorical DQN.')

    arg_utils = parser.add_argument_group('Utils')
    arg_utils.add_argument('--use-cuda',
                           action='store_true',
                           help='Flag specifying whether to use the GPU.')
    arg_utils.add_argument('--save',
                           action='store_true',
                           help='Flag specifying whether to save the model.')
    arg_utils.add_argument('--load-path',
                           type=str,
                           help='Path of the model to be loaded.')
    arg_utils.add_argument('--render',
                           action='store_true',
                           help='Flag specifying whether to render the game.')
    arg_utils.add_argument('--quiet',
                           action='store_true',
                           help='Flag specifying whether to hide the progress '
                           'bar.')
    arg_utils.add_argument('--debug',
                           action='store_true',
                           help='Flag specifying whether the script has to be '
                           'run in debug mode.')

    args = parser.parse_args()

    scores = list()

    optimizer = dict()
    if args.optimizer == 'adam':
        optimizer['class'] = optim.Adam
        optimizer['params'] = dict(lr=args.learning_rate, eps=args.epsilon)
    elif args.optimizer == 'adadelta':
        optimizer['class'] = optim.Adadelta
        optimizer['params'] = dict(lr=args.learning_rate, eps=args.epsilon)
    elif args.optimizer == 'rmsprop':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate,
                                   alpha=args.decay,
                                   eps=args.epsilon)
    elif args.optimizer == 'rmspropcentered':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate,
                                   alpha=args.decay,
                                   eps=args.epsilon,
                                   centered=True)
    else:
        raise ValueError

    # Summary folder
    folder_name = './logs/atari_' + args.algorithm + '_' + args.name +\
        '_' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    pathlib.Path(folder_name).mkdir(parents=True)

    # Settings
    if args.debug:
        initial_replay_size = 50
        max_replay_size = 500
        train_frequency = 5
        target_update_frequency = 10
        test_samples = 20
        evaluation_frequency = 50
        max_steps = 1000
    else:
        initial_replay_size = args.initial_replay_size
        max_replay_size = args.max_replay_size
        train_frequency = args.train_frequency
        target_update_frequency = args.target_update_frequency
        test_samples = args.test_samples
        evaluation_frequency = args.evaluation_frequency
        max_steps = args.max_steps

    # MDP
    mdp = Atari(args.name,
                args.screen_width,
                args.screen_height,
                ends_at_life=True,
                history_length=args.history_length,
                max_no_op_actions=args.max_no_op_actions)

    if args.load_path:
        # Agent
        agent = DQN.load(args.load_path)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        agent.policy.set_epsilon(epsilon_test)

        # Algorithm
        core_test = Core(agent, mdp)

        # Evaluate model
        dataset = core_test.evaluate(n_steps=args.test_samples,
                                     render=args.render,
                                     quiet=args.quiet)
        get_stats(dataset)

    else:
        # Policy
        epsilon = LinearParameter(value=args.initial_exploration_rate,
                                  threshold_value=args.final_exploration_rate,
                                  n=args.final_exploration_frame)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        epsilon_random = Parameter(value=1)
        pi = EpsGreedy(epsilon=epsilon_random)

        class CategoricalLoss(nn.Module):
            def forward(self, input, target):
                input = input.clamp(1e-5)

                return -torch.sum(target * torch.log(input))

        # Approximator
        approximator_params = dict(
            network=Network if args.algorithm != 'cdqn' else FeatureNetwork,
            input_shape=mdp.info.observation_space.shape,
            output_shape=(mdp.info.action_space.n, ),
            n_actions=mdp.info.action_space.n,
            n_features=Network.n_features,
            optimizer=optimizer,
            loss=F.smooth_l1_loss
            if args.algorithm != 'cdqn' else CategoricalLoss(),
            use_cuda=args.use_cuda)

        approximator = TorchApproximator

        if args.prioritized:
            replay_memory = PrioritizedReplayMemory(
                initial_replay_size,
                max_replay_size,
                alpha=.6,
                beta=LinearParameter(.4,
                                     threshold_value=1,
                                     n=max_steps // train_frequency))
        else:
            replay_memory = None

        # Agent
        algorithm_params = dict(
            batch_size=args.batch_size,
            target_update_frequency=target_update_frequency // train_frequency,
            replay_memory=replay_memory,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size)

        if args.algorithm == 'dqn':
            agent = DQN(mdp.info,
                        pi,
                        approximator,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'ddqn':
            agent = DoubleDQN(mdp.info,
                              pi,
                              approximator,
                              approximator_params=approximator_params,
                              **algorithm_params)
        elif args.algorithm == 'adqn':
            agent = AveragedDQN(mdp.info,
                                pi,
                                approximator,
                                approximator_params=approximator_params,
                                n_approximators=args.n_approximators,
                                **algorithm_params)
        elif args.algorithm == 'mmdqn':
            agent = MaxminDQN(mdp.info,
                              pi,
                              approximator,
                              approximator_params=approximator_params,
                              n_approximators=args.n_approximators,
                              **algorithm_params)
        elif args.algorithm == 'cdqn':
            agent = CategoricalDQN(mdp.info,
                                   pi,
                                   approximator_params=approximator_params,
                                   n_atoms=args.n_atoms,
                                   v_min=args.v_min,
                                   v_max=args.v_max,
                                   **algorithm_params)

        # Algorithm
        core = Core(agent, mdp)

        # RUN

        # Fill replay memory with random dataset
        print_epoch(0)
        core.learn(n_steps=initial_replay_size,
                   n_steps_per_fit=initial_replay_size,
                   quiet=args.quiet)

        if args.save:
            agent.save(folder_name + '/agent_0.msh')

        # Evaluate initial policy
        pi.set_epsilon(epsilon_test)
        mdp.set_episode_end(False)
        dataset = core.evaluate(n_steps=test_samples,
                                render=args.render,
                                quiet=args.quiet)
        scores.append(get_stats(dataset))

        np.save(folder_name + '/scores.npy', scores)
        for n_epoch in range(1, max_steps // evaluation_frequency + 1):
            print_epoch(n_epoch)
            print('- Learning:')
            # learning step
            pi.set_epsilon(epsilon)
            mdp.set_episode_end(True)
            core.learn(n_steps=evaluation_frequency,
                       n_steps_per_fit=train_frequency,
                       quiet=args.quiet)

            if args.save:
                agent.save(folder_name + '/agent_' + str(n_epoch) + '.msh')

            print('- Evaluation:')
            # evaluation step
            pi.set_epsilon(epsilon_test)
            mdp.set_episode_end(False)
            dataset = core.evaluate(n_steps=test_samples,
                                    render=args.render,
                                    quiet=args.quiet)
            scores.append(get_stats(dataset))

            np.save(folder_name + '/scores.npy', scores)

    return scores
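A note on the beta schedule in the prioritized branch above: the replay memory is sampled once per fit, and a fit happens every train_frequency collected samples, so annealing over max_steps // train_frequency updates lets beta reach 1 roughly at the end of training (assuming one sampling call per fit, which is how we read the loop). With the script's defaults:

max_steps = 50000000         # --max-steps default
train_frequency = 4          # --train-frequency default
n_updates = max_steps // train_frequency
print(n_updates)             # 12500000 updates over which beta is annealed from 0.4 to 1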
Example #8
def experiment():
    np.random.seed()

    # Argument parser
    parser = argparse.ArgumentParser()

    arg_mem = parser.add_argument_group('Replay Memory')
    arg_mem.add_argument("--initial-replay-size",
                         type=int,
                         default=50000,
                         help='Initial size of the replay memory.')
    arg_mem.add_argument("--max-replay-size",
                         type=int,
                         default=500000,
                         help='Max size of the replay memory.')
    arg_mem.add_argument("--prioritized",
                         action='store_true',
                         help='Whether to use prioritized memory or not.')

    arg_net = parser.add_argument_group('Deep Q-Network')
    arg_net.add_argument(
        "--optimizer",
        choices=['adadelta', 'adam', 'rmsprop', 'rmspropcentered'],
        default='adam',
        help='Name of the optimizer to use.')
    arg_net.add_argument("--learning-rate",
                         type=float,
                         default=.0001,
                         help='Learning rate value of the optimizer.')
    arg_net.add_argument("--decay",
                         type=float,
                         default=.95,
                         help='Discount factor for the history coming from the '
                         'gradient momentum in rmspropcentered and '
                         'rmsprop.')
    arg_net.add_argument("--epsilon",
                         type=float,
                         default=1e-8,
                         help='Epsilon term used in rmspropcentered and '
                         'rmsprop.')

    arg_alg = parser.add_argument_group('Algorithm')
    arg_alg.add_argument("--algorithm",
                         choices=[
                             'dqn', 'ddqn', 'adqn', 'mmdqn', 'cdqn', 'dueldqn',
                             'ndqn', 'rainbow'
                         ],
                         default='dqn',
                         help='Name of the algorithm. dqn is for standard '
                         'DQN, ddqn is for Double DQN and adqn is for '
                         'Averaged DQN.')
    arg_alg.add_argument(
        "--n-approximators",
        type=int,
        default=1,
        help="Number of approximators used in the ensemble for "
        "AveragedDQN or MaxminDQN.")
    arg_alg.add_argument("--batch-size",
                         type=int,
                         default=32,
                         help='Batch size for each fit of the network.')
    arg_alg.add_argument("--history-length",
                         type=int,
                         default=4,
                         help='Number of frames composing a state.')
    arg_alg.add_argument("--target-update-frequency",
                         type=int,
                         default=10000,
                         help='Number of collected samples before each update '
                         'of the target network.')
    arg_alg.add_argument("--evaluation-frequency",
                         type=int,
                         default=250000,
                         help='Number of collected samples before each '
                         'evaluation. An epoch ends after this number of '
                         'steps.')
    arg_alg.add_argument("--train-frequency",
                         type=int,
                         default=4,
                         help='Number of collected samples before each fit of '
                         'the neural network.')
    arg_alg.add_argument("--max-steps",
                         type=int,
                         default=5000000,
                         help='Total number of collected samples.')
    arg_alg.add_argument(
        "--final-exploration-frame",
        type=int,
        default=10000000,
        help='Number of collected samples until the exploration '
        'rate stops decreasing.')
    arg_alg.add_argument("--initial-exploration-rate",
                         type=float,
                         default=1.,
                         help='Initial value of the exploration rate.')
    arg_alg.add_argument("--final-exploration-rate",
                         type=float,
                         default=.1,
                         help='Final value of the exploration rate. When it '
                         'reaches this value, it stays constant.')
    arg_alg.add_argument("--test-exploration-rate",
                         type=float,
                         default=.05,
                         help='Exploration rate used during evaluation.')
    arg_alg.add_argument("--test-episodes",
                         type=int,
                         default=5,
                         help='Number of episodes for each evaluation.')
    arg_alg.add_argument(
        "--alpha-coeff",
        type=float,
        default=.6,
        help='Prioritization exponent for prioritized experience replay.')
    arg_alg.add_argument("--n-atoms",
                         type=int,
                         default=51,
                         help='Number of atoms for Categorical DQN.')
    arg_alg.add_argument("--v-min",
                         type=int,
                         default=-10,
                         help='Minimum action-value for Categorical DQN.')
    arg_alg.add_argument("--v-max",
                         type=int,
                         default=10,
                         help='Maximum action-value for Categorical DQN.')
    arg_alg.add_argument("--n-steps-return",
                         type=int,
                         default=3,
                         help='Number of steps for n-step return for Rainbow.')
    arg_alg.add_argument("--sigma-coeff",
                         type=float,
                         default=.5,
                         help='Sigma0 coefficient for noise initialization in '
                         'NoisyDQN and Rainbow.')

    arg_utils = parser.add_argument_group('Utils')
    arg_utils.add_argument('--use-cuda',
                           action='store_true',
                           help='Flag specifying whether to use the GPU.')
    arg_utils.add_argument('--save',
                           action='store_true',
                           help='Flag specifying whether to save the model.')
    arg_utils.add_argument('--load-path',
                           type=str,
                           help='Path of the model to be loaded.')
    arg_utils.add_argument('--render',
                           action='store_true',
                           help='Flag specifying whether to render the grid.')
    arg_utils.add_argument('--quiet',
                           action='store_true',
                           help='Flag specifying whether to hide the progress '
                           'bar.')
    arg_utils.add_argument('--debug',
                           action='store_true',
                           help='Flag specifying whether the script has to be '
                           'run in debug mode.')

    args = parser.parse_args()

    scores = list()

    optimizer = dict()
    if args.optimizer == 'adam':
        optimizer['class'] = optim.Adam
        optimizer['params'] = dict(lr=args.learning_rate, eps=args.epsilon)
    elif args.optimizer == 'adadelta':
        optimizer['class'] = optim.Adadelta
        optimizer['params'] = dict(lr=args.learning_rate, eps=args.epsilon)
    elif args.optimizer == 'rmsprop':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate,
                                   alpha=args.decay,
                                   eps=args.epsilon)
    elif args.optimizer == 'rmspropcentered':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate,
                                   alpha=args.decay,
                                   eps=args.epsilon,
                                   centered=True)
    else:
        raise ValueError

    # Summary folder
    folder_name = './logs/habitat_nav_' + args.algorithm +\
        '_' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    pathlib.Path(folder_name).mkdir(parents=True)

    # Settings
    if args.debug:
        initial_replay_size = 50
        max_replay_size = 500
        train_frequency = 5
        target_update_frequency = 10
        test_episodes = 20
        evaluation_frequency = 50
        max_steps = 1000
    else:
        initial_replay_size = args.initial_replay_size
        max_replay_size = args.max_replay_size
        train_frequency = args.train_frequency
        target_update_frequency = args.target_update_frequency
        test_episodes = args.test_episodes
        evaluation_frequency = args.evaluation_frequency
        max_steps = args.max_steps

    # MDP
    config_file = os.path.join(
        pathlib.Path(__file__).parent.resolve(),
        'pointnav_apartment-0.yaml')  # Custom task for Replica scenes
    wrapper = 'HabitatNavigationWrapper'
    mdp = Habitat(wrapper, config_file)
    opt_return = mdp.env.get_optimal_policy_return()

    if args.load_path:
        logger = Logger(DQN.__name__, results_dir=None)
        logger.strong_line()
        logger.info('Optimal Policy Undiscounted Return: ' + str(opt_return))
        logger.info('Experiment Algorithm: ' + DQN.__name__)

        # Agent
        agent = DQN.load(args.load_path)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        agent.policy.set_epsilon(epsilon_test)

        # Algorithm
        core_test = Core(agent, mdp)

        # Evaluate model
        dataset = core_test.evaluate(n_episodes=args.test_episodes,
                                     render=args.render,
                                     quiet=args.quiet)
        get_stats(dataset, logger)
    else:
        # Policy
        epsilon = LinearParameter(value=args.initial_exploration_rate,
                                  threshold_value=args.final_exploration_rate,
                                  n=args.final_exploration_frame)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        epsilon_random = Parameter(value=1)
        pi = EpsGreedy(epsilon=epsilon_random)

        # Approximator
        approximator_params = dict(
            network=Network if args.algorithm
            not in ['dueldqn', 'cdqn', 'ndqn', 'rainbow'] else FeatureNetwork,
            input_shape=mdp.info.observation_space.shape,
            output_shape=(mdp.info.action_space.n, ),
            n_actions=mdp.info.action_space.n,
            n_features=Network.n_features,
            optimizer=optimizer,
            use_cuda=args.use_cuda)
        if args.algorithm not in ['cdqn', 'rainbow']:
            approximator_params['loss'] = F.smooth_l1_loss

        approximator = TorchApproximator

        if args.prioritized:
            replay_memory = PrioritizedReplayMemory(
                initial_replay_size,
                max_replay_size,
                alpha=args.alpha_coeff,
                beta=LinearParameter(.4,
                                     threshold_value=1,
                                     n=max_steps // train_frequency))
        else:
            replay_memory = None

        # Agent
        algorithm_params = dict(
            batch_size=args.batch_size,
            target_update_frequency=target_update_frequency // train_frequency,
            replay_memory=replay_memory,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size)

        if args.algorithm == 'dqn':
            alg = DQN
            agent = alg(mdp.info,
                        pi,
                        approximator,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'ddqn':
            alg = DoubleDQN
            agent = alg(mdp.info,
                        pi,
                        approximator,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'adqn':
            alg = AveragedDQN
            agent = alg(mdp.info,
                        pi,
                        approximator,
                        approximator_params=approximator_params,
                        n_approximators=args.n_approximators,
                        **algorithm_params)
        elif args.algorithm == 'mmdqn':
            alg = MaxminDQN
            agent = alg(mdp.info,
                        pi,
                        approximator,
                        approximator_params=approximator_params,
                        n_approximators=args.n_approximators,
                        **algorithm_params)
        elif args.algorithm == 'dueldqn':
            alg = DuelingDQN
            agent = alg(mdp.info,
                        pi,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'cdqn':
            alg = CategoricalDQN
            agent = alg(mdp.info,
                        pi,
                        approximator_params=approximator_params,
                        n_atoms=args.n_atoms,
                        v_min=args.v_min,
                        v_max=args.v_max,
                        **algorithm_params)
        elif args.algorithm == 'ndqn':
            alg = NoisyDQN
            agent = alg(mdp.info,
                        pi,
                        approximator_params=approximator_params,
                        sigma_coeff=args.sigma_coeff,
                        **algorithm_params)
        elif args.algorithm == 'rainbow':
            alg = Rainbow
            beta = LinearParameter(.4,
                                   threshold_value=1,
                                   n=max_steps // train_frequency)
            agent = alg(mdp.info,
                        pi,
                        approximator_params=approximator_params,
                        n_atoms=args.n_atoms,
                        v_min=args.v_min,
                        v_max=args.v_max,
                        n_steps_return=args.n_steps_return,
                        alpha_coeff=args.alpha_coeff,
                        beta=beta,
                        sigma_coeff=args.sigma_coeff,
                        **algorithm_params)

        logger = Logger(alg.__name__, results_dir=None)
        logger.strong_line()
        logger.info('Optimal Policy Undiscounted Return: ' + str(opt_return))
        logger.info('Experiment Algorithm: ' + alg.__name__)

        # Algorithm
        core = Core(agent, mdp)

        # RUN

        # Fill replay memory with random dataset
        print_epoch(0, logger)
        core.learn(n_steps=initial_replay_size,
                   n_steps_per_fit=initial_replay_size,
                   quiet=args.quiet)

        if args.save:
            agent.save(folder_name + '/agent_0.msh')

        # Evaluate initial policy
        pi.set_epsilon(epsilon_test)
        dataset = core.evaluate(n_episodes=test_episodes,
                                render=args.render,
                                quiet=args.quiet)
        scores.append(get_stats(dataset, logger))

        np.save(folder_name + '/scores.npy', scores)
        for n_epoch in range(1, max_steps // evaluation_frequency + 1):
            print_epoch(n_epoch, logger)
            logger.info('- Learning:')
            # learning step
            pi.set_epsilon(epsilon)
            core.learn(n_steps=evaluation_frequency,
                       n_steps_per_fit=train_frequency,
                       quiet=args.quiet)

            if args.save:
                agent.save(folder_name + '/agent_' + str(n_epoch) + '.msh')

            logger.info('- Evaluation:')
            # evaluation step
            pi.set_epsilon(epsilon_test)
            dataset = core.evaluate(n_episodes=test_episodes,
                                    render=args.render,
                                    quiet=args.quiet)
            scores.append(get_stats(dataset, logger))

            np.save(folder_name + '/scores.npy', scores)

    return scores
Example #9
def experiment(mdp, params, prob=None):
    # Argument parser
    # parser = argparse.ArgumentParser()
    #
    # args = parser.parse_args()

    scores = list()

    optimizer = dict()
    if params['optimizer'] == 'adam':
        optimizer['class'] = optim.Adam
        optimizer['params'] = dict(lr=params['learning_rate'])
    elif params['optimizer'] == 'adadelta':
        optimizer['class'] = optim.Adadelta
        optimizer['params'] = dict(lr=params['learning_rate'])
    elif params['optimizer'] == 'rmsprop':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=params['learning_rate'],
                                   alpha=params['decay'])
    elif params['optimizer'] == 'rmspropcentered':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=params['learning_rate'],
                                   alpha=params['decay'],
                                   centered=True)
    else:
        raise ValueError

    # DQN learning run

    # Summary folder
    folder_name = os.path.join(PROJECT_DIR, 'logs', params['name'])
    if params['save']:
        pathlib.Path(folder_name).mkdir(parents=True)

    # Policy
    epsilon = ExponentialParameter(value=params['initial_exploration_rate'],
                                   exp=params['exploration_rate'],
                                   min_value=params['final_exploration_rate'],
                                   size=(1, ))

    epsilon_random = Parameter(value=1)
    epsilon_test = Parameter(value=0.01)
    pi = EpsGreedy(epsilon=epsilon_random)

    class CategoricalLoss(nn.Module):
        def forward(self, input, target):
            input = input.clamp(1e-5)

            return -torch.sum(target * torch.log(input))

    # Approximator
    input_shape = mdp.observation.shape

    resources = [[
        (mdp.north - mdp.devices[device][0]) / (mdp.north - mdp.south),
        (mdp.east - mdp.devices[device][1]) / (mdp.east - mdp.west)
    ] for device in mdp.device_ordering]
    edges = [[
        (mdp.north - mdp.graph.nodes[e[0]]['y']) / (mdp.north - mdp.south),
        (mdp.east - mdp.graph.nodes[e[0]]['x']) / (mdp.east - mdp.west),
        (mdp.north - mdp.graph.nodes[e[1]]['y']) / (mdp.north - mdp.south),
        (mdp.east - mdp.graph.nodes[e[1]]['x']) / (mdp.east - mdp.west)
    ] for e in mdp.graph.edges]

    N = {
        'SimpleResourceNetwork': SimpleResourceNetwork,
        'GraphConvolutionResourceNetwork': GraphConvolutionResourceNetwork,
    }[params['network']]
    N.n_features = params['hidden']

    approximator_params = dict(
        network=N,
        input_shape=input_shape,
        edges=edges,
        resources=resources,
        graph=mdp.graph,
        allow_wait=params['allow_wait'],
        long_term_q=params['long_term_q'],
        resource_embeddings=params['resource_embeddings'],
        edge_ordering=mdp.edge_ordering,
        device_ordering=mdp.device_ordering,
        resource_edges=mdp.resource_edges,
        output_shape=(mdp.info.action_space.n, ),
        n_actions=mdp.info.action_space.n,
        n_features=params['hidden'],
        optimizer=optimizer,
        loss=F.smooth_l1_loss,
        nn_scaling=params['nn_scaling'],
        # quiet=False,
        use_cuda=params['cuda'],
        load_path=params.get('load_path', None))

    approximator = TorchApproximator

    replay_memory = PrioritizedReplayMemory(
        params['initial_replay_size'],
        params['max_replay_size'],
        alpha=.6,
        beta=LinearParameter(.4,
                             threshold_value=1,
                             n=params['max_steps'] //
                             params['train_frequency']))

    # Agent
    algorithm_params = dict(
        batch_size=params['batch_size'],
        n_approximators=1,
        target_update_frequency=params['target_update_frequency'] //
        params['train_frequency'],
        replay_memory=replay_memory,
        initial_replay_size=params['initial_replay_size'],
        max_replay_size=params['max_replay_size'])

    clz = DoubleDQN if mdp.info.gamma >= 1 else SMDPDQN
    agent = clz(mdp.info,
                pi,
                approximator,
                approximator_params=approximator_params,
                **algorithm_params)

    lr_scheduler = torch.optim.lr_scheduler.StepLR(
        agent.approximator._impl.model._optimizer,
        step_size=1,
        gamma=params['lr_decay'],
        last_epoch=-1)  # params['max_steps'] // params['train_frequency']

    # Algorithm
    core = Core(agent, mdp)

    if 'weights' in params:
        best_weights = np.load(params['weights'])
        agent.approximator.set_weights(best_weights)
        agent.target_approximator.set_weights(best_weights)
    else:
        best_weights = agent.approximator.get_weights()

    # RUN
    pi.set_epsilon(epsilon_test)
    eval_days = [i for i in range(1, 356) if i % 13 == 1]
    ds = core.evaluate(initial_states=eval_days,
                       quiet=tuning,
                       render=params['save'])
    test_result = np.mean(compute_J(ds))
    test_result_discounted = np.mean(compute_J(ds, params['gamma']))
    print("discounted validation result", test_result_discounted)
    print("validation result", test_result)
    results = [(0, 0, test_result_discounted, test_result)]
    if params['save']:
        mdp.save_rendered(folder_name + "/epoch_init.mp4")

    # Fill replay memory with random dataset
    print_epoch(0)
    start = time()
    core.learn(n_steps=params['initial_replay_size'],
               n_steps_per_fit=params['initial_replay_size'],
               quiet=tuning)

    runtime = time() - start
    steps = 0

    if params['save']:
        with open(folder_name + "/params.json", "w") as f:
            json.dump(params, f, indent=4)
        if isinstance(agent, DQN):
            np.save(folder_name + '/weights-exp-0-0.npy',
                    agent.approximator.get_weights())

    best_score = -np.inf
    no_improvement = 0
    patience = 6

    if params['save']:
        np.save(folder_name + '/scores.npy', scores)
    for n_epoch in range(
            1, int(params['max_steps'] // params['evaluation_frequency'] + 1)):
        print_epoch(n_epoch)
        print('- Learning:')
        # learning step
        pi.set_epsilon(epsilon)
        # mdp.set_episode_end(True)
        start = time()
        core.learn(n_steps=params['evaluation_frequency'],
                   n_steps_per_fit=params['train_frequency'],
                   quiet=tuning)
        runtime += time() - start
        steps += params['evaluation_frequency']
        lr_scheduler.step()

        if params['save']:
            if isinstance(agent, DQN):
                np.save(
                    folder_name + '/weights-exp-0-' + str(n_epoch) + '.npy',
                    agent.approximator.get_weights())

        print('- Evaluation:')
        # evaluation step
        pi.set_epsilon(epsilon_test)
        ds = core.evaluate(initial_states=eval_days,
                           render=params['save'],
                           quiet=tuning)
        test_result_discounted = np.mean(compute_J(ds, params['gamma']))
        test_result = np.mean(compute_J(ds))
        print("discounted validation result", test_result_discounted)
        print("validation result", test_result)

        if params['save']:
            mdp.save_rendered(folder_name + ("/epoch%04d.mp4" % n_epoch))
        results.append((runtime, steps, test_result_discounted, test_result))

        if params['save']:
            np.savetxt(folder_name + '/scores.csv',
                       np.asarray(results),
                       delimiter=',')

        if test_result > best_score:
            no_improvement = 0
            best_score = test_result
            best_weights = agent.approximator.get_weights().copy()

            with open(folder_name + "/best_val.txt", "w") as f:
                f.write("%f" % test_result)
        else:
            no_improvement += 1
            if no_improvement >= patience:
                break

    print('---------- FINAL EVALUATION ---------')
    agent.approximator.set_weights(best_weights)
    agent.target_approximator.set_weights(best_weights)
    pi.set_epsilon(epsilon_test)
    eval_days = [i for i in range(1, 356) if i % 13 == 0]
    ds = core.evaluate(initial_states=eval_days,
                       render=params['save'],
                       quiet=tuning)
    test_result_discounted = np.mean(compute_J(ds, params['gamma']))
    test_result = np.mean(compute_J(ds))
    print("discounted test result", test_result_discounted)
    print("test result", test_result)

    with open(folder_name + "/test_result.txt", "w") as f:
        f.write("%f" % test_result)

    if params['save']:
        mdp.save_rendered(folder_name + "/epoch_test.mp4", 10000)

    return scores