def test_eps_greedy():
    np.random.seed(88)
    eps = Parameter(0.1)
    pi = EpsGreedy(eps)

    Q = Table((10, 3))
    Q.table = np.random.randn(10, 3)

    pi.set_q(Q)

    s = np.array([2])
    a = np.array([1])

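    # With eps=0.1 over 3 actions, the greedy action gets probability
    # 1 - eps + eps / 3 = 0.9333... and each other action eps / 3 = 0.0333...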
    p_s = pi(s)
    p_s_test = np.array([0.03333333, 0.93333333, 0.03333333])
    assert np.allclose(p_s, p_s_test)

    p_sa = pi(s, a)
    p_sa_test = np.array([0.93333333])
    assert np.allclose(p_sa, p_sa_test)

    a = pi.draw_action(s)
    a_test = 1
    assert a.item() == a_test

    eps_2 = LinearParameter(0.2, 0.1, 2)
    pi.set_epsilon(eps_2)
    p_sa_2 = pi(s, a)
    assert p_sa_2 < p_sa

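    # LinearParameter(0.2, 0.1, 2) decays linearly from 0.2 to 0.1 over two
    # updates, so after the two update() calls below epsilon is back to 0.1
    # and the greedy-action probability equals p_sa again.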
    pi.update(s, a)
    pi.update(s, a)
    p_sa_3 = pi(s, a)
    print(eps_2.get_value())
    assert p_sa_3 == p_sa
Example #2
def test_prioritized_dqn_save(tmpdir):
    agent_path = tmpdir / 'agent_{}'.format(
        datetime.now().strftime("%H%M%S%f"))

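    # Prioritized replay with prioritization exponent alpha=.6; the
    # importance-sampling exponent beta is annealed linearly from .4 to 1
    # over 500 // 5 = 100 updates.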
    replay_memory = PrioritizedReplayMemory(50,
                                            500,
                                            alpha=.6,
                                            beta=LinearParameter(
                                                .4,
                                                threshold_value=1,
                                                n=500 // 5))
    params = dict(batch_size=50,
                  initial_replay_size=50,
                  max_replay_size=500,
                  target_update_frequency=50,
                  replay_memory=replay_memory)
    agent_save = learn(DQN, params)

    agent_save.save(agent_path, full_save=True)
    agent_load = Agent.load(agent_path)

    for att in vars(agent_save):
        save_attr = getattr(agent_save, att)
        load_attr = getattr(agent_load, att)

        tu.assert_eq(save_attr, load_attr)
Example #3
def test_prioritized_dqn_save():
    replay_memory = PrioritizedReplayMemory(50,
                                            500,
                                            alpha=.6,
                                            beta=LinearParameter(
                                                .4,
                                                threshold_value=1,
                                                n=500 // 5))
    params = dict(batch_size=50,
                  n_approximators=1,
                  initial_replay_size=50,
                  max_replay_size=500,
                  target_update_frequency=50,
                  replay_memory=replay_memory)
    agent_save = learn(DQN, params)

    agent_path = './agentdir{}/'.format(datetime.now().strftime("%H%M%S%f"))

    agent_save.save(agent_path)
    agent_load = Agent.load(agent_path)

    shutil.rmtree(agent_path)

    for att in agent_save.__dict__:
        save_attr = getattr(agent_save, att)
        load_attr = getattr(agent_load, att)
        #print('{}: {}'.format(att, type(save_attr)))

        tu.assert_eq(save_attr, load_attr)
Example #4
def test_boltzmann():
    np.random.seed(88)
    beta = Parameter(0.1)
    pi = Boltzmann(beta)

    Q = Table((10, 3))
    Q.table = np.random.randn(10, 3)

    pi.set_q(Q)

    s = np.array([2])
    a = np.array([1])

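    # The Boltzmann policy assigns p(a|s) proportional to exp(beta * Q(s, a));
    # with beta=0.1 the resulting distribution is close to uniform.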
    p_s = pi(s)
    p_s_test = np.array([0.30676679, 0.36223227, 0.33100094])
    assert np.allclose(p_s, p_s_test)

    p_sa = pi(s, a)
    p_sa_test = np.array([0.36223227])
    assert np.allclose(p_sa, p_sa_test)

    a = pi.draw_action(s)
    a_test = 2
    assert a.item() == a_test

    beta_2 = LinearParameter(0.2, 0.1, 2)
    pi.set_beta(beta_2)
    p_sa_2 = pi(s, a)
    assert p_sa_2 < p_sa

    pi.update(s, a)
    p_sa_3 = pi(s, a)
    p_sa_3_test = np.array([0.33100094])
    assert np.allclose(p_sa_3, p_sa_3_test)
Example #5
    def build(self, mdp_info):
        self.approximator_params[
            'input_shape'] = mdp_info.observation_space.shape
        self.approximator_params['output_shape'] = (mdp_info.action_space.n, )
        self.approximator_params['n_actions'] = mdp_info.action_space.n

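        # Prioritized replay: the importance-sampling exponent beta is
        # annealed linearly from .4 to 1 over 50000000 // 4 updates.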
        replay_memory = PrioritizedReplayMemory(
            self.alg_params['initial_replay_size'],
            self.alg_params['max_replay_size'],
            alpha=.6,
            beta=LinearParameter(.4, threshold_value=1, n=50000000 // 4))
        self.alg_params['replay_memory'] = replay_memory
        self.epsilon = LinearParameter(value=1, threshold_value=.05, n=1000000)
        self.epsilon_test = Parameter(value=.01)

        return DQN(mdp_info, self.policy, self.approximator,
                   self.approximator_params, **self.alg_params)
Example #6
    def build(self, mdp_info):
        self.approximator_params[
            'input_shape'] = mdp_info.observation_space.shape
        self.approximator_params['output_shape'] = (mdp_info.action_space.n, )
        self.approximator_params['n_actions'] = mdp_info.action_space.n
        self.epsilon = LinearParameter(value=1, threshold_value=.05, n=1000000)
        self.epsilon_test = Parameter(value=.01)

        return DoubleDQN(mdp_info, self.policy, self.approximator,
                         self.approximator_params, **self.alg_params)
Example #7
def test_prioritized_dqn():
    replay_memory = PrioritizedReplayMemory(
        50, 500, alpha=.6,
        beta=LinearParameter(.4, threshold_value=1, n=500 // 5)
    )
    params = dict(batch_size=50, initial_replay_size=50,
                  max_replay_size=500, target_update_frequency=50,
                  replay_memory=replay_memory)
    approximator = learn(DQN, params).approximator

    w = approximator.get_weights()
    w_test = np.array([-0.2410347, 0.39138362, 0.12457055, 0.60612524, -0.54973847,
                       -0.06486652, -0.07349031, 0.4376623, 0.14254288])

    assert np.allclose(w, w_test)
Example #8
def test_prioritized_dqn():
    replay_memory = PrioritizedReplayMemory(
        50, 500, alpha=.6,
        beta=LinearParameter(.4, threshold_value=1, n=500 // 5)
    )
    params = dict(batch_size=50, n_approximators=1, initial_replay_size=50,
                  max_replay_size=500, target_update_frequency=50,
                  replay_memory=replay_memory)
    approximator = learn(DQN, params)

    w = approximator.get_weights()
    w_test = np.array([-0.1384063, 0.48957556, 0.02254359, 0.50994426,
                       -0.56277484, -0.075808, -0.06829552, 0.3642576,
                       0.15519235])

    assert np.allclose(w, w_test)
Example #9
def test_rainbow_save(tmpdir):
    agent_path = tmpdir / 'agent_{}'.format(datetime.now().strftime("%H%M%S%f"))

    params = dict(batch_size=50, initial_replay_size=50,
                  max_replay_size=500, target_update_frequency=50, n_steps_return=1,
                  alpha_coeff=.6, beta=LinearParameter(.4, threshold_value=1, n=500 // 5))
    agent_save = learn(Rainbow, params)

    agent_save.save(agent_path, full_save=True)
    agent_load = Agent.load(agent_path)

    for att in vars(agent_save):
        save_attr = getattr(agent_save, att)
        load_attr = getattr(agent_load, att)

        tu.assert_eq(save_attr, load_attr)
Example #10
def test_rainbow():
    params = dict(batch_size=50, initial_replay_size=50,
                  max_replay_size=500, target_update_frequency=50, n_steps_return=3,
                  alpha_coeff=.6, beta=LinearParameter(.4, threshold_value=1, n=500 // 5))
    approximator = learn(Rainbow, params).approximator

    w = approximator.get_weights()
    w_test = np.array([0.41471523, -0.24608319, -0.18744999, 0.26587564, 0.39882535, 0.412821,
                       0.30898723, 0.29745516, -0.5973996, 0.35576734, 0.41858765, 0.2911771,
                       -0.09666843, 0.32220146, 0.04949852, -0.04904625, 0.3972141, 0.32487455,
                       0.3105287, 0.38326296, 0.15647355, 0.07453305, 0.31577617, 0.38884395,
                       0.30908346, -0.20951316, -0.1023823, -0.12970605, 0.40118366, 0.41426662,
                       0.30691648, 0.2924496, 0.08292492, 0.01674112, 0.33560023, 0.3732411,
                       0.5594649, 0.17095159, -0.20466673, -0.37797216, 0.29877642, 0.3118145,
                       0.40977645, 0.39796302, -0.0712048, -0.35232118, 0.40097338, 0.3074576])

    assert np.allclose(w, w_test, rtol=1e-4)
Example #11
def experiment():
    np.random.seed()

    # Argument parser
    parser = argparse.ArgumentParser()

    arg_game = parser.add_argument_group('Game')
    arg_game.add_argument("--name",
                          type=str,
                          default='BreakoutDeterministic-v4',
                          help='Gym ID of the Atari game.')
    arg_game.add_argument("--screen-width",
                          type=int,
                          default=84,
                          help='Width of the game screen.')
    arg_game.add_argument("--screen-height",
                          type=int,
                          default=84,
                          help='Height of the game screen.')

    arg_mem = parser.add_argument_group('Replay Memory')
    arg_mem.add_argument("--initial-replay-size",
                         type=int,
                         default=50000,
                         help='Initial size of the replay memory.')
    arg_mem.add_argument("--max-replay-size",
                         type=int,
                         default=500000,
                         help='Max size of the replay memory.')
    arg_mem.add_argument("--prioritized",
                         action='store_true',
                         help='Whether to use prioritized memory or not.')

    arg_net = parser.add_argument_group('Deep Q-Network')
    arg_net.add_argument(
        "--optimizer",
        choices=['adadelta', 'adam', 'rmsprop', 'rmspropcentered'],
        default='rmsprop',
        help='Name of the optimizer to use.')
    arg_net.add_argument("--learning-rate",
                         type=float,
                         default=.00025,
                         help='Learning rate value of the optimizer.')
    arg_net.add_argument("--decay",
                         type=float,
                         default=.95,
                         help='Discount factor for the history coming from the '
                         'gradient momentum in rmspropcentered and '
                         'rmsprop.')
    arg_net.add_argument("--epsilon",
                         type=float,
                         default=1e-8,
                         help='Epsilon term used in rmspropcentered and '
                         'rmsprop.')

    arg_alg = parser.add_argument_group('Algorithm')
    arg_alg.add_argument("--algorithm",
                         choices=['dqn', 'ddqn', 'adqn', 'mmdqn', 'cdqn'],
                         default='dqn',
                         help='Name of the algorithm. dqn is for standard '
                         'DQN, ddqn is for Double DQN, adqn is for '
                         'Averaged DQN, mmdqn is for Maxmin DQN and cdqn is '
                         'for Categorical DQN.')
    arg_alg.add_argument(
        "--n-approximators",
        type=int,
        default=1,
        help="Number of approximators used in the ensemble for"
        "AveragedDQN or MaxminDQN.")
    arg_alg.add_argument("--batch-size",
                         type=int,
                         default=32,
                         help='Batch size for each fit of the network.')
    arg_alg.add_argument("--history-length",
                         type=int,
                         default=4,
                         help='Number of frames composing a state.')
    arg_alg.add_argument("--target-update-frequency",
                         type=int,
                         default=10000,
                         help='Number of collected samples before each update '
                         'of the target network.')
    arg_alg.add_argument("--evaluation-frequency",
                         type=int,
                         default=250000,
                         help='Number of collected samples before each '
                         'evaluation. An epoch ends after this number of '
                         'steps.')
    arg_alg.add_argument("--train-frequency",
                         type=int,
                         default=4,
                         help='Number of collected samples before each fit of '
                         'the neural network.')
    arg_alg.add_argument("--max-steps",
                         type=int,
                         default=50000000,
                         help='Total number of collected samples.')
    arg_alg.add_argument(
        "--final-exploration-frame",
        type=int,
        default=1000000,
        help='Number of collected samples until the exploration '
        'rate stops decreasing.')
    arg_alg.add_argument("--initial-exploration-rate",
                         type=float,
                         default=1.,
                         help='Initial value of the exploration rate.')
    arg_alg.add_argument("--final-exploration-rate",
                         type=float,
                         default=.1,
                         help='Final value of the exploration rate. When it '
                         'reaches this value, it stays constant.')
    arg_alg.add_argument("--test-exploration-rate",
                         type=float,
                         default=.05,
                         help='Exploration rate used during evaluation.')
    arg_alg.add_argument("--test-samples",
                         type=int,
                         default=125000,
                         help='Number of collected samples for each '
                         'evaluation.')
    arg_alg.add_argument(
        "--max-no-op-actions",
        type=int,
        default=30,
        help='Maximum number of no-op actions performed at the '
        'beginning of the episodes.')
    arg_alg.add_argument("--n-atoms",
                         type=int,
                         default=51,
                         help='Number of atoms for Categorical DQN.')
    arg_alg.add_argument("--v-min",
                         type=int,
                         default=-10,
                         help='Minimum action-value for Categorical DQN.')
    arg_alg.add_argument("--v-max",
                         type=int,
                         default=10,
                         help='Maximum action-value for Categorical DQN.')

    arg_utils = parser.add_argument_group('Utils')
    arg_utils.add_argument('--use-cuda',
                           action='store_true',
                           help='Flag specifying whether to use the GPU.')
    arg_utils.add_argument('--save',
                           action='store_true',
                           help='Flag specifying whether to save the model.')
    arg_utils.add_argument('--load-path',
                           type=str,
                           help='Path of the model to be loaded.')
    arg_utils.add_argument('--render',
                           action='store_true',
                           help='Flag specifying whether to render the game.')
    arg_utils.add_argument('--quiet',
                           action='store_true',
                           help='Flag specifying whether to hide the progress '
                           'bar.')
    arg_utils.add_argument('--debug',
                           action='store_true',
                           help='Flag specifying whether the script has to be '
                           'run in debug mode.')

    args = parser.parse_args()

    scores = list()

    optimizer = dict()
    if args.optimizer == 'adam':
        optimizer['class'] = optim.Adam
        optimizer['params'] = dict(lr=args.learning_rate, eps=args.epsilon)
    elif args.optimizer == 'adadelta':
        optimizer['class'] = optim.Adadelta
        optimizer['params'] = dict(lr=args.learning_rate, eps=args.epsilon)
    elif args.optimizer == 'rmsprop':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate,
                                   alpha=args.decay,
                                   eps=args.epsilon)
    elif args.optimizer == 'rmspropcentered':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate,
                                   alpha=args.decay,
                                   eps=args.epsilon,
                                   centered=True)
    else:
        raise ValueError

    # Summary folder
    folder_name = './logs/atari_' + args.algorithm + '_' + args.name +\
        '_' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    pathlib.Path(folder_name).mkdir(parents=True)

    # Settings
    if args.debug:
        initial_replay_size = 50
        max_replay_size = 500
        train_frequency = 5
        target_update_frequency = 10
        test_samples = 20
        evaluation_frequency = 50
        max_steps = 1000
    else:
        initial_replay_size = args.initial_replay_size
        max_replay_size = args.max_replay_size
        train_frequency = args.train_frequency
        target_update_frequency = args.target_update_frequency
        test_samples = args.test_samples
        evaluation_frequency = args.evaluation_frequency
        max_steps = args.max_steps

    # MDP
    mdp = Atari(args.name,
                args.screen_width,
                args.screen_height,
                ends_at_life=True,
                history_length=args.history_length,
                max_no_op_actions=args.max_no_op_actions)

    if args.load_path:
        # Agent
        agent = DQN.load(args.load_path)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        agent.policy.set_epsilon(epsilon_test)

        # Algorithm
        core_test = Core(agent, mdp)

        # Evaluate model
        dataset = core_test.evaluate(n_steps=args.test_samples,
                                     render=args.render,
                                     quiet=args.quiet)
        get_stats(dataset)

    else:
        # Policy
        epsilon = LinearParameter(value=args.initial_exploration_rate,
                                  threshold_value=args.final_exploration_rate,
                                  n=args.final_exploration_frame)
        epsilon_test = Parameter(value=args.test_exploration_rate)
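        # The policy starts fully random (epsilon=1) so that the initial
        # replay-memory fill uses random actions; the annealed epsilon is set
        # at the start of each learning epoch and epsilon_test for evaluation.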
        epsilon_random = Parameter(value=1)
        pi = EpsGreedy(epsilon=epsilon_random)

        class CategoricalLoss(nn.Module):
            def forward(self, input, target):
                input = input.clamp(1e-5)

                return -torch.sum(target * torch.log(input))

        # Approximator
        approximator_params = dict(
            network=Network if args.algorithm != 'cdqn' else FeatureNetwork,
            input_shape=mdp.info.observation_space.shape,
            output_shape=(mdp.info.action_space.n, ),
            n_actions=mdp.info.action_space.n,
            n_features=Network.n_features,
            optimizer=optimizer,
            loss=F.smooth_l1_loss
            if args.algorithm != 'cdqn' else CategoricalLoss(),
            use_cuda=args.use_cuda)

        approximator = TorchApproximator

        if args.prioritized:
            replay_memory = PrioritizedReplayMemory(
                initial_replay_size,
                max_replay_size,
                alpha=.6,
                beta=LinearParameter(.4,
                                     threshold_value=1,
                                     n=max_steps // train_frequency))
        else:
            replay_memory = None

        # Agent
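        # The CLI target_update_frequency is expressed in collected samples,
        # while DQN counts fit steps, hence the division by train_frequency.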
        algorithm_params = dict(
            batch_size=args.batch_size,
            target_update_frequency=target_update_frequency // train_frequency,
            replay_memory=replay_memory,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size)

        if args.algorithm == 'dqn':
            agent = DQN(mdp.info,
                        pi,
                        approximator,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'ddqn':
            agent = DoubleDQN(mdp.info,
                              pi,
                              approximator,
                              approximator_params=approximator_params,
                              **algorithm_params)
        elif args.algorithm == 'adqn':
            agent = AveragedDQN(mdp.info,
                                pi,
                                approximator,
                                approximator_params=approximator_params,
                                n_approximators=args.n_approximators,
                                **algorithm_params)
        elif args.algorithm == 'mmdqn':
            agent = MaxminDQN(mdp.info,
                              pi,
                              approximator,
                              approximator_params=approximator_params,
                              n_approximators=args.n_approximators,
                              **algorithm_params)
        elif args.algorithm == 'cdqn':
            agent = CategoricalDQN(mdp.info,
                                   pi,
                                   approximator_params=approximator_params,
                                   n_atoms=args.n_atoms,
                                   v_min=args.v_min,
                                   v_max=args.v_max,
                                   **algorithm_params)

        # Algorithm
        core = Core(agent, mdp)

        # RUN

        # Fill replay memory with random dataset
        print_epoch(0)
        core.learn(n_steps=initial_replay_size,
                   n_steps_per_fit=initial_replay_size,
                   quiet=args.quiet)

        if args.save:
            agent.save(folder_name + '/agent_0.msh')

        # Evaluate initial policy
        pi.set_epsilon(epsilon_test)
        mdp.set_episode_end(False)
        dataset = core.evaluate(n_steps=test_samples,
                                render=args.render,
                                quiet=args.quiet)
        scores.append(get_stats(dataset))

        np.save(folder_name + '/scores.npy', scores)
        for n_epoch in range(1, max_steps // evaluation_frequency + 1):
            print_epoch(n_epoch)
            print('- Learning:')
            # learning step
            pi.set_epsilon(epsilon)
            mdp.set_episode_end(True)
            core.learn(n_steps=evaluation_frequency,
                       n_steps_per_fit=train_frequency,
                       quiet=args.quiet)

            if args.save:
                agent.save(folder_name + '/agent_' + str(n_epoch) + '.msh')

            print('- Evaluation:')
            # evaluation step
            pi.set_epsilon(epsilon_test)
            mdp.set_episode_end(False)
            dataset = core.evaluate(n_steps=test_samples,
                                    render=args.render,
                                    quiet=args.quiet)
            scores.append(get_stats(dataset))

            np.save(folder_name + '/scores.npy', scores)

    return scores
Example #12
train_frequency = 4
evaluation_frequency = 5000
target_update_frequency = 500
initial_replay_size = 5000
max_replay_size = 50000
test_samples = 100
max_steps = 200000
test_episodes = 50


# PO-MDP
mdp = ShowdownEnvironment()

# Policy
epsilon = LinearParameter(value=1.,
                          threshold_value=.1,
                          n=1000000)
epsilon_test = Parameter(value=.05)
epsilon_random = Parameter(value=1)
pi = EpsGreedy(epsilon=epsilon_random)

# Approximator
approximator_params = dict(
    network=Network,
    input_shape=(110,),
    output_shape=(13,),
    n_actions=13,
    optimizer=optimizer,
    loss=F.mse_loss
)
Example #13
def experiment(args, idx):
    np.random.seed()

    args.games = [''.join(g) for g in args.games]

    # MDP
    mdp = list()
    for i, g in enumerate(args.games):
        mdp.append(Atari(g))

    n_actions_per_head = [(m.info.action_space.n,) for m in mdp]

    max_obs_dim = 0
    max_act_n = 0
    for i in range(len(args.games)):
        n = mdp[i].info.observation_space.shape[0]
        m = mdp[i].info.action_space.n
        if n > max_obs_dim:
            max_obs_dim = n
            max_obs_idx = i
        if m > max_act_n:
            max_act_n = m
            max_act_idx = i
    gammas = [m.info.gamma for m in mdp]
    horizons = [m.info.horizon for m in mdp]
    mdp_info = MDPInfo(mdp[max_obs_idx].info.observation_space,
                       mdp[max_act_idx].info.action_space, gammas, horizons)

    scores = list()
    for _ in range(len(args.games)):
        scores.append(list())

    optimizer = dict()
    if args.optimizer == 'adam':
        optimizer['class'] = optim.Adam
        optimizer['params'] = dict(lr=args.learning_rate,
                                   eps=args.epsilon)
    elif args.optimizer == 'adadelta':
        optimizer['class'] = optim.Adadelta
        optimizer['params'] = dict(lr=args.learning_rate,
                                   eps=args.epsilon)
    elif args.optimizer == 'rmsprop':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate,
                                   alpha=args.decay,
                                   eps=args.epsilon)
    elif args.optimizer == 'rmspropcentered':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate,
                                   alpha=args.decay,
                                   eps=args.epsilon,
                                   centered=True)
    else:
        raise ValueError

    # DQN learning run

    # Settings
    if args.debug:
        initial_replay_size = args.batch_size
        max_replay_size = 500
        train_frequency = 5
        target_update_frequency = 10
        test_samples = 20
        evaluation_frequency = 50
        max_steps = 1000
    else:
        initial_replay_size = args.initial_replay_size
        max_replay_size = args.max_replay_size
        train_frequency = args.train_frequency
        target_update_frequency = args.target_update_frequency
        test_samples = args.test_samples
        evaluation_frequency = args.evaluation_frequency
        max_steps = args.max_steps

    # Policy
    epsilon = LinearParameter(value=args.initial_exploration_rate,
                              threshold_value=args.final_exploration_rate,
                              n=args.final_exploration_frame)
    epsilon_test = Parameter(value=args.test_exploration_rate)
    epsilon_random = Parameter(value=1)
    pi = EpsGreedyMultiple(parameter=epsilon,
                           n_actions_per_head=n_actions_per_head)

    # Approximator
    n_games = len(args.games)
    loss = LossFunction(n_games, args.batch_size,
                        args.evaluation_frequency)

    input_shape = (args.history_length, args.screen_height,
                   args.screen_width)
    approximator_params = dict(
        network=AtariNetwork,
        input_shape=input_shape,
        output_shape=(max(n_actions_per_head)[0],),
        n_actions=max(n_actions_per_head)[0],
        n_actions_per_head=n_actions_per_head,
        n_games=len(args.games),
        optimizer=optimizer,
        loss=loss,
        use_cuda=args.use_cuda,
        features=args.features
    )

    approximator = TorchApproximator

    if args.prioritized:
        replay_memory = [PrioritizedReplayMemory(
            initial_replay_size, max_replay_size, alpha=.6,
            beta=LinearParameter(.4, threshold_value=1,
                                 n=max_steps // train_frequency)
        ) for _ in range(n_games)]
    else:
        replay_memory = None

    # Agent
    algorithm_params = dict(
        batch_size=args.batch_size,
        n_games=len(args.games),
        initial_replay_size=initial_replay_size,
        max_replay_size=max_replay_size,
        target_update_frequency=target_update_frequency // train_frequency,
        replay_memory=replay_memory,
        n_actions_per_head=n_actions_per_head,
        clip_reward=True,
        history_length=args.history_length
    )

    if args.algorithm == 'dqn':
        agent = DQN(approximator, pi, mdp_info,
                    approximator_params=approximator_params,
                    **algorithm_params)
    elif args.algorithm == 'ddqn':
        agent = DoubleDQN(approximator, pi, mdp_info,
                          approximator_params=approximator_params,
                          **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # RUN

    # Fill replay memory with random dataset
    print_epoch(0)
    pi.set_parameter(epsilon_random)
    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size, quiet=args.quiet)

    if args.transfer:
        weights = pickle.load(open(args.transfer, 'rb'))
        agent.set_shared_weights(weights)

    if args.load:
        weights = np.load(args.load)
        agent.approximator.set_weights(weights)

    # Evaluate initial policy
    pi.set_parameter(epsilon_test)
    dataset = core.evaluate(n_steps=test_samples, render=args.render,
                            quiet=args.quiet)
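    # The evaluation dataset interleaves the games, so a stride of len(mdp)
    # extracts the transitions belonging to game i.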
    for i in range(len(mdp)):
        d = dataset[i::len(mdp)]
        scores[i].append(get_stats(d, i, args.games)[2])

    if args.unfreeze_epoch > 0:
        agent.freeze_shared_weights()

    best_score_sum = -np.inf
    best_weights = None

    np.save(folder_name + 'scores-exp-%d.npy' % idx, scores)
    np.save(folder_name + 'loss-exp-%d.npy' % idx,
            agent.approximator.model._loss.get_losses())

    for n_epoch in range(1, max_steps // evaluation_frequency + 1):
        if n_epoch >= args.unfreeze_epoch > 0:
            agent.unfreeze_shared_weights()

        print_epoch(n_epoch)
        print('- Learning:')
        # learning step
        pi.set_parameter(None)
        core.learn(n_steps=evaluation_frequency,
                   n_steps_per_fit=train_frequency, quiet=args.quiet)

        print('- Evaluation:')
        # evaluation step
        pi.set_parameter(epsilon_test)
        dataset = core.evaluate(n_steps=test_samples,
                                render=args.render, quiet=args.quiet)

        current_score_sum = 0
        for i in range(len(mdp)):
            d = dataset[i::len(mdp)]
            current_score = get_stats(d, i, args.games)[2]
            scores[i].append(current_score)
            current_score_sum += current_score

        # Save shared weights if best score
        if args.save_shared and current_score_sum >= best_score_sum:
            best_score_sum = current_score_sum
            best_weights = agent.get_shared_weights()

        if args.save:
            np.save(folder_name + 'weights-exp-%d-%d.npy' % (idx, n_epoch),
                    agent.approximator.get_weights())

        np.save(folder_name + 'scores-exp-%d.npy' % idx, scores)
        np.save(folder_name + 'loss-exp-%d.npy' % idx,
                agent.approximator.model._loss.get_losses())

    if args.save_shared:
        pickle.dump(best_weights, open(args.save_shared, 'wb'))

    return scores, agent.approximator.model._loss.get_losses()
Example #14
def experiment():
    np.random.seed()

    # Argument parser
    parser = argparse.ArgumentParser()

    arg_mem = parser.add_argument_group('Replay Memory')
    arg_mem.add_argument("--initial-replay-size",
                         type=int,
                         default=50000,
                         help='Initial size of the replay memory.')
    arg_mem.add_argument("--max-replay-size",
                         type=int,
                         default=500000,
                         help='Max size of the replay memory.')
    arg_mem.add_argument("--prioritized",
                         action='store_true',
                         help='Whether to use prioritized memory or not.')

    arg_net = parser.add_argument_group('Deep Q-Network')
    arg_net.add_argument(
        "--optimizer",
        choices=['adadelta', 'adam', 'rmsprop', 'rmspropcentered'],
        default='adam',
        help='Name of the optimizer to use.')
    arg_net.add_argument("--learning-rate",
                         type=float,
                         default=.0001,
                         help='Learning rate value of the optimizer.')
    arg_net.add_argument("--decay",
                         type=float,
                         default=.95,
                         help='Discount factor for the history coming from the '
                         'gradient momentum in rmspropcentered and '
                         'rmsprop.')
    arg_net.add_argument("--epsilon",
                         type=float,
                         default=1e-8,
                         help='Epsilon term used in rmspropcentered and '
                         'rmsprop.')

    arg_alg = parser.add_argument_group('Algorithm')
    arg_alg.add_argument("--algorithm",
                         choices=[
                             'dqn', 'ddqn', 'adqn', 'mmdqn', 'cdqn', 'dueldqn',
                             'ndqn', 'rainbow'
                         ],
                         default='dqn',
                         help='Name of the algorithm. dqn is for standard '
                         'DQN, ddqn is for Double DQN, adqn is for '
                         'Averaged DQN, mmdqn is for Maxmin DQN, cdqn is for '
                         'Categorical DQN, dueldqn is for Dueling DQN, ndqn '
                         'is for Noisy DQN and rainbow is for Rainbow.')
    arg_alg.add_argument(
        "--n-approximators",
        type=int,
        default=1,
        help="Number of approximators used in the ensemble for"
        "AveragedDQN or MaxminDQN.")
    arg_alg.add_argument("--batch-size",
                         type=int,
                         default=32,
                         help='Batch size for each fit of the network.')
    arg_alg.add_argument("--history-length",
                         type=int,
                         default=4,
                         help='Number of frames composing a state.')
    arg_alg.add_argument("--target-update-frequency",
                         type=int,
                         default=10000,
                         help='Number of collected samples before each update '
                         'of the target network.')
    arg_alg.add_argument("--evaluation-frequency",
                         type=int,
                         default=250000,
                         help='Number of collected samples before each '
                         'evaluation. An epoch ends after this number of '
                         'steps.')
    arg_alg.add_argument("--train-frequency",
                         type=int,
                         default=4,
                         help='Number of collected samples before each fit of '
                         'the neural network.')
    arg_alg.add_argument("--max-steps",
                         type=int,
                         default=5000000,
                         help='Total number of collected samples.')
    arg_alg.add_argument(
        "--final-exploration-frame",
        type=int,
        default=10000000,
        help='Number of collected samples until the exploration '
        'rate stops decreasing.')
    arg_alg.add_argument("--initial-exploration-rate",
                         type=float,
                         default=1.,
                         help='Initial value of the exploration rate.')
    arg_alg.add_argument("--final-exploration-rate",
                         type=float,
                         default=.1,
                         help='Final value of the exploration rate. When it '
                         'reaches this value, it stays constant.')
    arg_alg.add_argument("--test-exploration-rate",
                         type=float,
                         default=.05,
                         help='Exploration rate used during evaluation.')
    arg_alg.add_argument("--test-episodes",
                         type=int,
                         default=5,
                         help='Number of episodes for each evaluation.')
    arg_alg.add_argument(
        "--alpha-coeff",
        type=float,
        default=.6,
        help='Prioritization exponent for prioritized experience replay.')
    arg_alg.add_argument("--n-atoms",
                         type=int,
                         default=51,
                         help='Number of atoms for Categorical DQN.')
    arg_alg.add_argument("--v-min",
                         type=int,
                         default=-10,
                         help='Minimum action-value for Categorical DQN.')
    arg_alg.add_argument("--v-max",
                         type=int,
                         default=10,
                         help='Maximum action-value for Categorical DQN.')
    arg_alg.add_argument("--n-steps-return",
                         type=int,
                         default=3,
                         help='Number of steps for n-step return for Rainbow.')
    arg_alg.add_argument("--sigma-coeff",
                         type=float,
                         default=.5,
                         help='Sigma0 coefficient for noise initialization in '
                         'NoisyDQN and Rainbow.')

    arg_utils = parser.add_argument_group('Utils')
    arg_utils.add_argument('--use-cuda',
                           action='store_true',
                           help='Flag specifying whether to use the GPU.')
    arg_utils.add_argument('--save',
                           action='store_true',
                           help='Flag specifying whether to save the model.')
    arg_utils.add_argument('--load-path',
                           type=str,
                           help='Path of the model to be loaded.')
    arg_utils.add_argument('--render',
                           action='store_true',
                           help='Flag specifying whether to render the grid.')
    arg_utils.add_argument('--quiet',
                           action='store_true',
                           help='Flag specifying whether to hide the progress '
                           'bar.')
    arg_utils.add_argument('--debug',
                           action='store_true',
                           help='Flag specifying whether the script has to be '
                           'run in debug mode.')

    args = parser.parse_args()

    scores = list()

    optimizer = dict()
    if args.optimizer == 'adam':
        optimizer['class'] = optim.Adam
        optimizer['params'] = dict(lr=args.learning_rate, eps=args.epsilon)
    elif args.optimizer == 'adadelta':
        optimizer['class'] = optim.Adadelta
        optimizer['params'] = dict(lr=args.learning_rate, eps=args.epsilon)
    elif args.optimizer == 'rmsprop':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate,
                                   alpha=args.decay,
                                   eps=args.epsilon)
    elif args.optimizer == 'rmspropcentered':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate,
                                   alpha=args.decay,
                                   eps=args.epsilon,
                                   centered=True)
    else:
        raise ValueError

    # Summary folder
    folder_name = './logs/habitat_nav_' + args.algorithm +\
        '_' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    pathlib.Path(folder_name).mkdir(parents=True)

    # Settings
    if args.debug:
        initial_replay_size = 50
        max_replay_size = 500
        train_frequency = 5
        target_update_frequency = 10
        test_episodes = 20
        evaluation_frequency = 50
        max_steps = 1000
    else:
        initial_replay_size = args.initial_replay_size
        max_replay_size = args.max_replay_size
        train_frequency = args.train_frequency
        target_update_frequency = args.target_update_frequency
        test_episodes = args.test_episodes
        evaluation_frequency = args.evaluation_frequency
        max_steps = args.max_steps

    # MDP
    config_file = os.path.join(
        pathlib.Path(__file__).parent.resolve(),
        'pointnav_apartment-0.yaml')  # Custom task for Replica scenes
    wrapper = 'HabitatNavigationWrapper'
    mdp = Habitat(wrapper, config_file)
    opt_return = mdp.env.get_optimal_policy_return()

    if args.load_path:
        logger = Logger(DQN.__name__, results_dir=None)
        logger.strong_line()
        logger.info('Optimal Policy Undiscounted Return: ' + str(opt_return))
        logger.info('Experiment Algorithm: ' + DQN.__name__)

        # Agent
        agent = DQN.load(args.load_path)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        agent.policy.set_epsilon(epsilon_test)

        # Algorithm
        core_test = Core(agent, mdp)

        # Evaluate model
        dataset = core_test.evaluate(n_episodes=args.test_episodes,
                                     render=args.render,
                                     quiet=args.quiet)
        get_stats(dataset, logger)
    else:
        # Policy
        epsilon = LinearParameter(value=args.initial_exploration_rate,
                                  threshold_value=args.final_exploration_rate,
                                  n=args.final_exploration_frame)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        epsilon_random = Parameter(value=1)
        pi = EpsGreedy(epsilon=epsilon_random)

        # Approximator
        approximator_params = dict(
            network=Network if args.algorithm
            not in ['dueldqn', 'cdqn', 'ndqn', 'rainbow'] else FeatureNetwork,
            input_shape=mdp.info.observation_space.shape,
            output_shape=(mdp.info.action_space.n, ),
            n_actions=mdp.info.action_space.n,
            n_features=Network.n_features,
            optimizer=optimizer,
            use_cuda=args.use_cuda)
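        # cdqn and rainbow define their own categorical loss internally, so an
        # explicit loss is only passed for the remaining algorithms.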
        if args.algorithm not in ['cdqn', 'rainbow']:
            approximator_params['loss'] = F.smooth_l1_loss

        approximator = TorchApproximator

        if args.prioritized:
            replay_memory = PrioritizedReplayMemory(
                initial_replay_size,
                max_replay_size,
                alpha=args.alpha_coeff,
                beta=LinearParameter(.4,
                                     threshold_value=1,
                                     n=max_steps // train_frequency))
        else:
            replay_memory = None

        # Agent
        algorithm_params = dict(
            batch_size=args.batch_size,
            target_update_frequency=target_update_frequency // train_frequency,
            replay_memory=replay_memory,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size)

        if args.algorithm == 'dqn':
            alg = DQN
            agent = alg(mdp.info,
                        pi,
                        approximator,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'ddqn':
            alg = DoubleDQN
            agent = alg(mdp.info,
                        pi,
                        approximator,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'adqn':
            alg = AveragedDQN
            agent = alg(mdp.info,
                        pi,
                        approximator,
                        approximator_params=approximator_params,
                        n_approximators=args.n_approximators,
                        **algorithm_params)
        elif args.algorithm == 'mmdqn':
            alg = MaxminDQN
            agent = alg(mdp.info,
                        pi,
                        approximator,
                        approximator_params=approximator_params,
                        n_approximators=args.n_approximators,
                        **algorithm_params)
        elif args.algorithm == 'dueldqn':
            alg = DuelingDQN
            agent = alg(mdp.info,
                        pi,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'cdqn':
            alg = CategoricalDQN
            agent = alg(mdp.info,
                        pi,
                        approximator_params=approximator_params,
                        n_atoms=args.n_atoms,
                        v_min=args.v_min,
                        v_max=args.v_max,
                        **algorithm_params)
        elif args.algorithm == 'ndqn':
            alg = NoisyDQN
            agent = alg(mdp.info,
                        pi,
                        approximator_params=approximator_params,
                        sigma_coeff=args.sigma_coeff,
                        **algorithm_params)
        elif args.algorithm == 'rainbow':
            alg = Rainbow
            beta = LinearParameter(.4,
                                   threshold_value=1,
                                   n=max_steps // train_frequency)
            agent = alg(mdp.info,
                        pi,
                        approximator_params=approximator_params,
                        n_atoms=args.n_atoms,
                        v_min=args.v_min,
                        v_max=args.v_max,
                        n_steps_return=args.n_steps_return,
                        alpha_coeff=args.alpha_coeff,
                        beta=beta,
                        sigma_coeff=args.sigma_coeff,
                        **algorithm_params)

        logger = Logger(alg.__name__, results_dir=None)
        logger.strong_line()
        logger.info('Optimal Policy Undiscounted Return: ' + str(opt_return))
        logger.info('Experiment Algorithm: ' + alg.__name__)

        # Algorithm
        core = Core(agent, mdp)

        # RUN

        # Fill replay memory with random dataset
        print_epoch(0, logger)
        core.learn(n_steps=initial_replay_size,
                   n_steps_per_fit=initial_replay_size,
                   quiet=args.quiet)

        if args.save:
            agent.save(folder_name + '/agent_0.msh')

        # Evaluate initial policy
        pi.set_epsilon(epsilon_test)
        dataset = core.evaluate(n_episodes=test_episodes,
                                render=args.render,
                                quiet=args.quiet)
        scores.append(get_stats(dataset, logger))

        np.save(folder_name + '/scores.npy', scores)
        for n_epoch in range(1, max_steps // evaluation_frequency + 1):
            print_epoch(n_epoch, logger)
            logger.info('- Learning:')
            # learning step
            pi.set_epsilon(epsilon)
            core.learn(n_steps=evaluation_frequency,
                       n_steps_per_fit=train_frequency,
                       quiet=args.quiet)

            if args.save:
                agent.save(folder_name + '/agent_' + str(n_epoch) + '.msh')

            logger.info('- Evaluation:')
            # evaluation step
            pi.set_epsilon(epsilon_test)
            dataset = core.evaluate(n_episodes=test_episodes,
                                    render=args.render,
                                    quiet=args.quiet)
            scores.append(get_stats(dataset, logger))

            np.save(folder_name + '/scores.npy', scores)

    return scores
Example #15
def experiment(n_epochs, n_steps, n_steps_test):
    np.random.seed()

    # MDP
    horizon = 1000
    gamma = 0.99
    gamma_eval = 1.
    mdp = Gym('Acrobot-v1', horizon, gamma)

    # Policy
    epsilon = LinearParameter(value=1., threshold_value=.01, n=5000)
    epsilon_test = Parameter(value=0.)
    epsilon_random = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon_random)

    # Settings
    initial_replay_size = 500
    max_replay_size = 5000
    target_update_frequency = 100
    batch_size = 200
    n_features = 80
    train_frequency = 1

    # Approximator
    input_shape = mdp.info.observation_space.shape
    approximator_params = dict(network=Network,
                               optimizer={
                                   'class': optim.Adam,
                                   'params': {
                                       'lr': .001
                                   }
                               },
                               loss=F.smooth_l1_loss,
                               n_features=n_features,
                               input_shape=input_shape,
                               output_shape=mdp.info.action_space.size,
                               n_actions=mdp.info.action_space.n)

    # Agent
    agent = DQN(mdp.info,
                pi,
                TorchApproximator,
                approximator_params=approximator_params,
                batch_size=batch_size,
                n_approximators=1,
                initial_replay_size=initial_replay_size,
                max_replay_size=max_replay_size,
                target_update_frequency=target_update_frequency)

    # Algorithm
    core = Core(agent, mdp)

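    # Fill the replay memory with random transitions (pi still uses
    # epsilon_random) before the first evaluation.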
    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    # RUN
    pi.set_epsilon(epsilon_test)
    dataset = core.evaluate(n_steps=n_steps_test, render=False)
    J = compute_J(dataset, gamma_eval)
    print('J: ', np.mean(J))

    for n in trange(n_epochs):
        tqdm.write('Epoch: ' + str(n))
        pi.set_epsilon(epsilon)
        core.learn(n_steps=n_steps, n_steps_per_fit=train_frequency)
        pi.set_epsilon(epsilon_test)
        dataset = core.evaluate(n_steps=n_steps_test, render=False)
        J = compute_J(dataset, gamma_eval)
        tqdm.write('J: ' + str(np.mean(J)))

    print('Press a button to visualize acrobot')
    input()
    core.evaluate(n_episodes=5, render=True)
Example #16
def experiment(mdp, params, prob=None):
    # Argument parser
    # parser = argparse.ArgumentParser()
    #
    # args = parser.parse_args()

    scores = list()

    optimizer = dict()
    if params['optimizer'] == 'adam':
        optimizer['class'] = optim.Adam
        optimizer['params'] = dict(lr=params['learning_rate'])
    elif params['optimizer'] == 'adadelta':
        optimizer['class'] = optim.Adadelta
        optimizer['params'] = dict(lr=params['learning_rate'])
    elif params['optimizer'] == 'rmsprop':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=params['learning_rate'],
                                   alpha=params['decay'])
    elif params['optimizer'] == 'rmspropcentered':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=params['learning_rate'],
                                   alpha=params['decay'],
                                   centered=True)
    else:
        raise ValueError

    # DQN learning run

    # Summary folder
    folder_name = os.path.join(PROJECT_DIR, 'logs', params['name'])
    if params['save']:
        pathlib.Path(folder_name).mkdir(parents=True)

    # Policy
    epsilon = ExponentialParameter(value=params['initial_exploration_rate'],
                                   exp=params['exploration_rate'],
                                   min_value=params['final_exploration_rate'],
                                   size=(1, ))

    epsilon_random = Parameter(value=1)
    epsilon_test = Parameter(value=0.01)
    pi = EpsGreedy(epsilon=epsilon_random)

    class CategoricalLoss(nn.Module):
        def forward(self, input, target):
            input = input.clamp(1e-5)

            return -torch.sum(target * torch.log(input))

    # Approximator
    input_shape = mdp.observation.shape

    resources = [[
        (mdp.north - mdp.devices[device][0]) / (mdp.north - mdp.south),
        (mdp.east - mdp.devices[device][1]) / (mdp.east - mdp.west)
    ] for device in mdp.device_ordering]
    edges = [[
        (mdp.north - mdp.graph.nodes[e[0]]['y']) / (mdp.north - mdp.south),
        (mdp.east - mdp.graph.nodes[e[0]]['x']) / (mdp.east - mdp.west),
        (mdp.north - mdp.graph.nodes[e[1]]['y']) / (mdp.north - mdp.south),
        (mdp.east - mdp.graph.nodes[e[1]]['x']) / (mdp.east - mdp.west)
    ] for e in mdp.graph.edges]

    N = {
        'SimpleResourceNetwork': SimpleResourceNetwork,
        'GraphConvolutionResourceNetwork': GraphConvolutionResourceNetwork,
    }[params['network']]
    N.n_features = params['hidden']

    approximator_params = dict(
        network=N,
        input_shape=input_shape,
        edges=edges,
        resources=resources,
        graph=mdp.graph,
        allow_wait=params['allow_wait'],
        long_term_q=params['long_term_q'],
        resource_embeddings=params['resource_embeddings'],
        edge_ordering=mdp.edge_ordering,
        device_ordering=mdp.device_ordering,
        resource_edges=mdp.resource_edges,
        output_shape=(mdp.info.action_space.n, ),
        n_actions=mdp.info.action_space.n,
        n_features=params['hidden'],
        optimizer=optimizer,
        loss=F.smooth_l1_loss,
        nn_scaling=params['nn_scaling'],
        # quiet=False,
        use_cuda=params['cuda'],
        load_path=params.get('load_path', None))

    approximator = TorchApproximator

    replay_memory = PrioritizedReplayMemory(
        params['initial_replay_size'],
        params['max_replay_size'],
        alpha=.6,
        beta=LinearParameter(.4,
                             threshold_value=1,
                             n=params['max_steps'] //
                             params['train_frequency']))

    # Agent
    algorithm_params = dict(
        batch_size=params['batch_size'],
        n_approximators=1,
        target_update_frequency=params['target_update_frequency'] //
        params['train_frequency'],
        replay_memory=replay_memory,
        initial_replay_size=params['initial_replay_size'],
        max_replay_size=params['max_replay_size'])

    clz = DoubleDQN if mdp.info.gamma >= 1 else SMDPDQN
    agent = clz(mdp.info,
                pi,
                approximator,
                approximator_params=approximator_params,
                **algorithm_params)

    lr_scheduler = torch.optim.lr_scheduler.StepLR(
        agent.approximator._impl.model._optimizer,
        step_size=1,
        gamma=params['lr_decay'],
        last_epoch=-1)  # params['max_steps'] // params['train_frequency']

    # Algorithm
    core = Core(agent, mdp)

    if 'weights' in params:
        best_weights = np.load(params['weights'])
        agent.approximator.set_weights(best_weights)
        agent.target_approximator.set_weights(best_weights)
    else:
        best_weights = agent.approximator.get_weights()

    # RUN
    pi.set_epsilon(epsilon_test)
    eval_days = [i for i in range(1, 356) if i % 13 == 1]
    ds = core.evaluate(initial_states=eval_days,
                       quiet=tuning,
                       render=params['save'])
    test_result = np.mean(compute_J(ds))
    test_result_discounted = np.mean(compute_J(ds, params['gamma']))
    print("discounted validation result", test_result_discounted)
    print("validation result", test_result)
    results = [(0, 0, test_result_discounted, test_result)]
    if params['save']:
        mdp.save_rendered(folder_name + "/epoch_init.mp4")

    # Fill replay memory with random dataset
    print_epoch(0)
    start = time()
    core.learn(n_steps=params['initial_replay_size'],
               n_steps_per_fit=params['initial_replay_size'],
               quiet=tuning)

    runtime = time() - start
    steps = 0

    if params['save']:
        with open(folder_name + "/params.json", "w") as f:
            json.dump(params, f, indent=4)
        if isinstance(agent, DQN):
            np.save(folder_name + '/weights-exp-0-0.npy',
                    agent.approximator.get_weights())

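    # Early stopping on the validation score: keep the best weights and stop
    # after `patience` epochs without improvement.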
    best_score = -np.inf
    no_improvement = 0
    patience = 6

    if params['save']:
        np.save(folder_name + '/scores.npy', scores)
    for n_epoch in range(
            1, int(params['max_steps'] // params['evaluation_frequency'] + 1)):
        print_epoch(n_epoch)
        print('- Learning:')
        # learning step
        pi.set_epsilon(epsilon)
        # mdp.set_episode_end(True)
        start = time()
        core.learn(n_steps=params['evaluation_frequency'],
                   n_steps_per_fit=params['train_frequency'],
                   quiet=tuning)
        runtime += time() - start
        steps += params['evaluation_frequency']
        lr_scheduler.step()

        if params['save']:
            if isinstance(agent, DQN):
                np.save(
                    folder_name + '/weights-exp-0-' + str(n_epoch) + '.npy',
                    agent.approximator.get_weights())

        print('- Evaluation:')
        # evaluation step
        pi.set_epsilon(epsilon_test)
        ds = core.evaluate(initial_states=eval_days,
                           render=params['save'],
                           quiet=tuning)
        test_result_discounted = np.mean(compute_J(ds, params['gamma']))
        test_result = np.mean(compute_J(ds))
        print("discounted validation result", test_result_discounted)
        print("validation result", test_result)

        if params['save']:
            mdp.save_rendered(folder_name + ("/epoch%04d.mp4" % n_epoch))
        results.append((runtime, steps, test_result_discounted, test_result))

        if params['save']:
            np.savetxt(folder_name + '/scores.csv',
                       np.asarray(results),
                       delimiter=',')

        if test_result > best_score:
            no_improvement = 0
            best_score = test_result
            best_weights = agent.approximator.get_weights().copy()

            with open(folder_name + "/best_val.txt", "w") as f:
                f.write("%f" % test_result)
        else:
            no_improvement += 1
            if no_improvement >= patience:
                break

    print('---------- FINAL EVALUATION ---------')
    agent.approximator.set_weights(best_weights)
    agent.target_approximator.set_weights(best_weights)
    pi.set_epsilon(epsilon_test)
    eval_days = [i for i in range(1, 356) if i % 13 == 0]
    ds = core.evaluate(initial_states=eval_days,
                       render=params['save'],
                       quiet=tuning)
    test_result_discounted = np.mean(compute_J(ds, params['gamma']))
    test_result = np.mean(compute_J(ds))
    print("discounted test result", test_result_discounted)
    print("test result", test_result)

    with open(folder_name + "/test_result.txt", "w") as f:
        f.write("%f" % test_result)

    if params['save']:
        mdp.save_rendered(folder_name + "/epoch_test.mp4", 10000)

    return scores