Example #1
def experiment(policy, value):
    np.random.seed(45)

    # MDP
    mdp = generate_taxi('tests/taxi/grid.txt', rew=(0, 1, 5))

    # Policy
    pi = policy(Parameter(value=value))

    # Agent
    learning_rate = Parameter(value=.15)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = SARSA(pi, mdp.info, agent_params)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    n_steps = 2000
    core.learn(n_steps=n_steps, n_steps_per_fit=1, quiet=True)

    return np.sum(np.array(collect_dataset.get())[:, 2]) / float(n_steps)
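
A minimal driver sketch for the function above (not part of the original snippet). In a Mushroom dataset each sample is a (state, action, reward, next_state, absorbing, last) tuple, so the return statement averages the reward column (index 2) over the collected steps. The import path below follows mushroom_rl and is an assumption; older mushroom versions may differ:

from mushroom_rl.policy import EpsGreedy  # import path assumed

if __name__ == '__main__':
    # Sweep a few exploration rates and report the average reward per step.
    for value in (0.1, 0.5, 1.0):
        avg_reward = experiment(EpsGreedy, value)
        print('EpsGreedy(eps=%.1f): average reward per step = %.3f'
              % (value, avg_reward))
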
Example #2
def experiment(policy, value):
    np.random.seed()

    # MDP
    mdp = generate_taxi('grid.txt')

    # Policy
    pi = policy(Parameter(value=value))

    # Agent
    learning_rate = Parameter(value=.15)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = SARSA(pi, mdp.info, **algorithm_params)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    n_steps = 300000
    core.learn(n_steps=n_steps, n_steps_per_fit=1, quiet=True)

    return np.sum(np.array(collect_dataset.get())[:, 2]) / float(n_steps)
Example #3
def experiment(policy, name, alg_version):
    np.random.seed()

    # MDP

    if name == "Taxi":
        mdp = generate_taxi('../grid.txt')
        max_steps = 100000
        evaluation_frequency = 2000
        test_samples = 10000
    elif name == "NChain-v0":
        mdp = generate_chain(horizon=1000)
        max_steps = 5000
        evaluation_frequency = 100
        test_samples = 10000
    elif name == "Loop":
        mdp = generate_loop(horizon=1000)
        max_steps = 5000
        evaluation_frequency = 100
        test_samples = 10000
    elif name == "SixArms":
        mdp = generate_arms(horizon=1000)
        max_steps = 25000
        evaluation_frequency = 500
        test_samples = 10000
    elif name == "RiverSwim":
        mdp = generate_river(horizon=1000)
        max_steps = 5000
        evaluation_frequency = 100
        test_samples = 10000
    else:
        raise NotImplementedError
    # Policy
    # epsilon = ExponentialDecayParameter(value=1., decay_exp=.5,
    #                                     size=mdp.info.observation_space.size)
    epsilon_train = ExponentialDecayParameter(
        value=1., decay_exp=.5, size=mdp.info.observation_space.size)
    epsilon_test = Parameter(0)
    pi = policy(epsilon=epsilon_train)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1.,
                                              decay_exp=.2,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = QLearning(pi, mdp.info, **algorithm_params)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks)
    scores = list()
    scores_train = list()
    # Train
    for n_epoch in range(1, max_steps // evaluation_frequency + 1):
        print('- Learning:')
        # learning step
        pi.set_epsilon(epsilon_train)
        core.learn(n_steps=evaluation_frequency,
                   n_steps_per_fit=1,
                   quiet=False)
        dataset = collect_dataset.get()
        if name == "Taxi":
            scores_train.append(get_stats(dataset))
        elif name in ["SixArms"]:
            scores_train.append(compute_scores_Loop(dataset, horizon=500))
        else:
            scores_train.append(compute_scores_Loop(dataset))
        collect_dataset.clean()
        mdp.reset()
        print('- Evaluation:')
        # evaluation step
        pi.set_epsilon(epsilon_test)
        dataset = core.evaluate(n_steps=test_samples, quiet=False)
        mdp.reset()
        scores.append(get_stats(dataset))
        #np.save(env + '/'+alg_version+'_scores.npy', scores)

    return scores_train, scores
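
get_stats and compute_scores_Loop are project helpers not shown in this listing. A plausible minimal stand-in (an assumption, not the project's code) computes per-episode return statistics from the collected dataset, e.g. via Mushroom's compute_J:

import numpy as np
from mushroom_rl.utils.dataset import compute_J  # import path assumed

def get_stats(dataset, gamma=1.):
    # Return (min, max, mean, n_episodes) of the per-episode returns.
    J = compute_J(dataset, gamma)
    if len(J) == 0:
        return 0., 0., 0., 0
    return np.min(J), np.max(J), np.mean(J), len(J)
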
Example #4
def experiment(args, agent_algorithm):
    np.random.seed()

    scores = list()
    # Add timestamp to results
    ts = str(time.time())
    # Evaluation of the model provided by the user.
    if args.load_path and args.evaluation:
        # MDP
        if args.name not in ['Taxi', 'Gridworld']:
            mdp = Gym(args.name, args.horizon, args.gamma)
            n_states = None
            gamma_eval = 1.
        elif args.name == 'Taxi':
            mdp = generate_taxi('../../grid.txt')
            n_states = mdp.info.observation_space.size[0]
            gamma_eval = mdp.info.gamma
        else:
            rew_weights = [args.fast_zone, args.slow_zone, args.goal]
            grid_size = args.grid_size
            env = GridWorld(gamma=args.gamma,
                            rew_weights=rew_weights,
                            shape=(grid_size, grid_size),
                            randomized_initial=args.rand_initial,
                            horizon=args.horizon)
            gamma_eval = args.gamma
            mdp = env.generate_mdp()
            n_states = mdp.info.observation_space.size[0]
        # Policy
        epsilon_test = Parameter(value=args.test_exploration_rate)
        pi = BootPolicy(args.n_approximators, epsilon=epsilon_test)

        # Approximator
        input_shape = mdp.info.observation_space.shape + (1, )
        input_preprocessor = list()
        approximator_params = dict(input_shape=input_shape,
                                   output_shape=(mdp.info.action_space.n, ),
                                   n_states=n_states,
                                   n_actions=mdp.info.action_space.n,
                                   n_features=args.n_features,
                                   n_approximators=args.n_approximators,
                                   input_preprocessor=input_preprocessor,
                                   name='test',
                                   load_path=args.load_path,
                                   net_type=args.net_type,
                                   optimizer={
                                       'name': args.optimizer,
                                       'lr': args.learning_rate,
                                       'lr_sigma': args.learning_rate,
                                       'decay': args.decay,
                                       'epsilon': args.epsilon
                                   })

        approximator = SimpleNet

        # Agent
        algorithm_params = dict(batch_size=0,
                                initial_replay_size=0,
                                max_replay_size=0,
                                clip_reward=False,
                                target_update_frequency=1)
        if args.alg == 'boot':
            algorithm_params['p_mask'] = args.p_mask
            pi = BootPolicy(args.n_approximators, epsilon=epsilon_test)
        elif args.alg == 'gaussian':
            if args.ucb:
                pi = UCBPolicy(delta=args.delta, q_max=1. / (1. - args.gamma))
            else:
                pi = WeightedGaussianPolicy(epsilon=epsilon_test)
        elif args.alg == 'dqn':
            pi = EpsGreedy(epsilon=epsilon_test)
        elif args.alg == 'particle':
            if args.ucb:
                pi = UCBPolicy(delta=args.delta, q_max=1. / (1. - args.gamma))
            else:
                pi = WeightedPolicy(args.n_approximators, epsilon=epsilon_test)

        else:
            raise ValueError("Algorithm uknown")

        if args.alg in ['gaussian', 'particle']:
            algorithm_params['update_type'] = args.update_type
            algorithm_params['delta'] = args.delta
            algorithm_params['store_prob'] = args.store_prob
            if args.clip_target:
                algorithm_params['max_spread'] = args.q_max - args.q_min
            approximator_params['q_min'] = args.q_min
            approximator_params['q_max'] = args.q_max
            approximator_params['loss'] = args.loss
            approximator_params['init_type'] = args.init_type
            approximator_params['sigma_weight'] = args.sigma_weight
        if args.alg in ['particle', 'boot']:
            approximator_params['n_approximators'] = args.n_approximators
            algorithm_params['n_approximators'] = args.n_approximators
        agent = agent_algorithm(approximator,
                                pi,
                                mdp.info,
                                approximator_params=approximator_params,
                                **algorithm_params)

        # Algorithm
        core_test = Core(agent, mdp)

        # Evaluate model
        pi.set_eval(True)
        dataset = core_test.evaluate(n_steps=args.test_samples,
                                     render=args.render,
                                     quiet=args.quiet)
        get_stats(dataset)
    else:
        # DQN learning run
        print("Learning Run")

        # Settings
        if args.debug:
            initial_replay_size = 50
            max_replay_size = 500
            train_frequency = 5
            target_update_frequency = 10
            test_samples = 20
            evaluation_frequency = 50
            max_steps = 1000
        else:
            initial_replay_size = args.initial_replay_size
            max_replay_size = args.max_replay_size
            train_frequency = args.train_frequency
            target_update_frequency = args.target_update_frequency
            test_samples = args.test_samples
            evaluation_frequency = args.evaluation_frequency
            max_steps = args.max_steps

        # MDP
        if args.name not in ['Taxi', 'Gridworld']:
            mdp = Gym(args.name, args.horizon, args.gamma)
            n_states = None
            gamma_eval = 1.
        elif args.name == 'Taxi':
            mdp = generate_taxi('../../grid.txt')
            n_states = mdp.info.observation_space.size[0]
            gamma_eval = mdp.info.gamma
        else:
            rew_weights = [args.fast_zone, args.slow_zone, args.goal]
            grid_size = args.grid_size
            env = GridWorld(gamma=args.gamma,
                            rew_weights=rew_weights,
                            shape=(grid_size, grid_size),
                            randomized_initial=args.rand_initial,
                            horizon=args.horizon)
            mdp = env.generate_mdp()
            n_states = mdp.info.observation_space.size[0]
            print(mdp.info.gamma)
            gamma_eval = args.gamma
        # Policy
        epsilon = LinearDecayParameter(value=args.initial_exploration_rate,
                                       min_value=args.final_exploration_rate,
                                       n=args.final_exploration_frame)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        epsilon_random = Parameter(value=1.)

        policy_name = 'weighted'
        update_rule = args.update_type + "_update"
        if args.alg == 'boot':
            pi = BootPolicy(args.n_approximators, epsilon=epsilon)
            policy_name = 'boot'
            update_rule = 'boot'
        elif args.alg == 'dqn':
            pi = EpsGreedy(epsilon=epsilon)
            policy_name = 'eps_greedy'
            update_rule = 'td'
        elif args.alg == 'particle':
            if args.ucb:
                policy_name = 'ucb'
                pi = UCBPolicy(delta=args.delta, q_max=1. / (1. - args.gamma))
            else:
                pi = WeightedPolicy(args.n_approximators)
        elif args.alg == 'gaussian':
            if args.ucb:
                policy_name = 'ucb'
                pi = UCBPolicy(delta=args.delta, q_max=1. / (1. - args.gamma))
            else:
                pi = WeightedGaussianPolicy()
        else:
            raise ValueError("Algorithm unknown")
        # Summary folder
        folder_name = './logs/' + args.alg + "/" + policy_name + '/' + update_rule + '/' + args.name + "/" + args.loss + "/" + str(
            args.n_approximators
        ) + "_particles" + "/" + args.init_type + "_init" + "/" + str(
            args.learning_rate) + "/" + ts

        # Approximator
        input_shape = mdp.info.observation_space.shape
        input_preprocessor = list()
        approximator_params = dict(input_shape=input_shape,
                                   output_shape=(mdp.info.action_space.n, ),
                                   n_states=n_states,
                                   n_actions=mdp.info.action_space.n,
                                   n_features=args.n_features,
                                   n_approximators=args.n_approximators,
                                   input_preprocessor=input_preprocessor,
                                   folder_name=folder_name,
                                   net_type=args.net_type,
                                   sigma_weight=args.sigma_weight,
                                   optimizer={
                                       'name': args.optimizer,
                                       'lr': args.learning_rate,
                                       'lr_sigma': args.learning_rate,
                                       'decay': args.decay,
                                       'epsilon': args.epsilon
                                   })
        if args.load_path:
            ts = os.path.basename(os.path.normpath(args.load_path))
            approximator_params['load_path'] = args.load_path
            approximator_params['folder_name'] = args.load_path
            folder_name = args.load_path
            p = "scores_" + str(ts) + ".npy"
            scores = np.load(p).tolist()
            max_steps = max_steps - evaluation_frequency * len(scores)
        approximator = SimpleNet

        # Agent
        algorithm_params = dict(
            batch_size=args.batch_size,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size,
            clip_reward=False,
            target_update_frequency=target_update_frequency // train_frequency,
        )
        if args.alg == 'boot':
            algorithm_params['p_mask'] = args.p_mask
        elif args.alg in ['particle', 'gaussian']:
            algorithm_params['update_type'] = args.update_type
            algorithm_params['delta'] = args.delta
            algorithm_params['store_prob'] = args.store_prob
            if args.clip_target:
                algorithm_params['max_spread'] = args.q_max - args.q_min
            approximator_params['q_min'] = args.q_min
            approximator_params['q_max'] = args.q_max
            approximator_params['loss'] = args.loss
            approximator_params['init_type'] = args.init_type

        if args.alg in ['boot', 'particle']:
            approximator_params['n_approximators'] = args.n_approximators
            algorithm_params['n_approximators'] = args.n_approximators

        agent = agent_algorithm(approximator,
                                pi,
                                mdp.info,
                                approximator_params=approximator_params,
                                **algorithm_params)

        if args.ucb:
            q = agent.approximator
            if args.alg == 'particle':

                def mu(state):
                    q_list = q.predict(state).squeeze()
                    qs = np.array(q_list)
                    return qs.mean(axis=0)

                quantiles = [
                    i * 1. / (args.n_approximators - 1)
                    for i in range(args.n_approximators)
                ]
                for p in range(args.n_approximators):
                    if quantiles[p] >= 1 - args.delta:
                        delta_index = p
                        break

                def quantile_func(state):
                    q_list = q.predict(state).squeeze()

                    qs = np.sort(np.array(q_list), axis=0)
                    return qs[delta_index, :]

                print("Setting up ucb policy")
                pi.set_mu(mu)
                pi.set_quantile_func(quantile_func)

            if args.alg == 'gaussian':
                standard_bound = norm.ppf(1 - args.delta, loc=0, scale=1)

                def mu(state):
                    q_and_sigma = q.predict(state).squeeze()
                    means = q_and_sigma[0]
                    return means

                def quantile_func(state):
                    q_and_sigma = q.predict(state).squeeze()
                    means = q_and_sigma[0]
                    sigmas = q_and_sigma[1]
                    return sigmas * standard_bound + means

                print("Setting up ucb policy")
                pi.set_mu(mu)
                pi.set_quantile_func(quantile_func)
        args.count = 100
        if args.plot_qs:
            import matplotlib.pyplot as plt
            colors = ['red', 'blue', 'green']
            labels = ['left', 'nop', 'right']

            def plot_probs(qs):
                args.count += 1
                if args.count < 1:
                    return
                ax.clear()
                for i in range(qs.shape[-1]):
                    mu = np.mean(qs[..., i], axis=0)
                    sigma = np.std(qs[..., i], axis=0)
                    x = np.linspace(mu - 3 * sigma, mu + 3 * sigma, 20)
                    ax.plot(x,
                            stats.norm.pdf(x, mu, sigma),
                            label=labels[i],
                            color=colors[i])

                ax.set_xlabel('Q-value')
                ax.set_ylabel('Probability')
                ax.set_title('Q-distributions')
                #ax.set_ylim(bottom=0, top=1)

                plt.draw()
                plt.pause(0.02)
                #print("Plotted")
                args.count = 0
                #return probs

            plt.ion()
            fig, ax = plt.subplots()

            plot_probs(
                np.array(agent.approximator.predict(np.array(mdp.reset()))))

            input()
            args.count = 100
            qs = np.array([
                np.linspace(-1000, 0, 10),
                np.linspace(-2000, -1000, 10),
                np.linspace(-750, -250, 10)
            ])
            plot_probs(qs.T)
        # Algorithm
        core = Core(agent, mdp)
        core_test = Core(agent, mdp)

        # RUN

        # Fill replay memory with random dataset

        print_epoch(0)
        core.learn(
            n_steps=initial_replay_size,
            n_steps_per_fit=initial_replay_size,
            quiet=args.quiet,
        )

        if args.save:
            agent.approximator.model.save()

        # Evaluate initial policy
        if hasattr(pi, 'set_eval'):
            pi.set_eval(True)
        pi.set_epsilon(epsilon_test)
        dataset = core_test.evaluate(n_steps=test_samples,
                                     render=args.render,
                                     quiet=args.quiet)
        scores.append(get_stats(dataset))
        if args.plot_qs:
            pi.set_plotter(plot_probs)
        np.save(folder_name + '/scores_' + str(ts) + '.npy', scores)
        for n_epoch in range(1, max_steps // evaluation_frequency + 1):
            print_epoch(n_epoch)
            print('- Learning:')
            # learning step
            if hasattr(pi, 'set_eval'):
                pi.set_eval(False)

            pi.set_epsilon(epsilon)
            # learning step
            if args.plot_qs:
                pi.set_plotter(None)
            core.learn(
                n_steps=evaluation_frequency,
                n_steps_per_fit=train_frequency,
                quiet=args.quiet,
            )

            if args.save:
                agent.approximator.model.save()

            print('- Evaluation:')
            # evaluation step
            if hasattr(pi, 'set_eval'):
                pi.set_eval(True)
            pi.set_epsilon(epsilon_test)
            if args.plot_qs:
                pi.set_plotter(plot_probs)
            dataset = core_test.evaluate(n_steps=test_samples,
                                         render=args.render,
                                         quiet=args.quiet)
            scores.append(get_stats(dataset))
            np.save(folder_name + '/scores_' + str(ts) + '.npy', scores)

    return scores
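
The folder_name built in Example #4 chains many string concatenations; a purely cosmetic alternative with the same components (assuming POSIX-style paths, as the original '/' separators already do) is:

import os

folder_name = os.path.join(
    './logs', args.alg, policy_name, update_rule, args.name, args.loss,
    '%d_particles' % args.n_approximators, '%s_init' % args.init_type,
    str(args.learning_rate), ts)
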
Example #5
def experiment(algorithm, name, update_mode, update_type, policy,
               n_approximators, q_max, q_min, lr_exp, double, file_name,
               out_dir, collect_qs, seed):
    set_global_seeds(seed)
    print('Using seed %s' % seed)

    # MDP
    if name == 'Taxi':
        mdp = generate_taxi('../../grid.txt', horizon=5000)
        max_steps = 500000
        evaluation_frequency = 5000
        test_samples = 5000
    elif name == 'Chain':
        mdp = generate_chain(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'Loop':
        mdp = generate_loop(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'RiverSwim':
        mdp = generate_river(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'SixArms':
        mdp = generate_arms(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'KnightQuest':
        mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    else:
        raise NotImplementedError

    epsilon_test = Parameter(0)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1.,
                                              decay_exp=lr_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)

    if algorithm == 'ql':
        if policy not in ['boltzmann', 'eps-greedy']:
            warnings.warn(
                'QL available with only boltzmann and eps-greedy policies!')
            policy = 'eps-greedy'

        if policy == 'eps-greedy':
            epsilon_train = ExponentialDecayParameter(
                value=1., decay_exp=.5, size=mdp.info.observation_space.size)
            pi = policy_dict[policy](epsilon=epsilon_train)
        else:
            beta_train = ExponentialDecayParameter(
                value=1.5 * q_max,
                decay_exp=.5,
                size=mdp.info.observation_space.size)
            pi = policy_dict[policy](beta=beta_train)
        if double:
            agent = DoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = QLearning(pi, mdp.info, **algorithm_params)
    elif algorithm == 'boot-ql':
        if policy not in ['boot', 'weighted']:
            warnings.warn(
                'Bootstrapped QL available with only boot and weighted policies!'
            )
            policy = 'boot'
        pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                mu=(q_max + q_min) / 2,
                                sigma=q_max - q_min,
                                **algorithm_params)
        if double:
            agent = BootstrappedDoubleQLearning(pi, mdp.info,
                                                **algorithm_params)
        else:
            agent = BootstrappedQLearning(pi, mdp.info, **algorithm_params)
        epsilon_train = Parameter(0)
    elif algorithm == 'particle-ql':
        if policy not in ['weighted', 'vpi']:
            warnings.warn(
                'Particle QL available with only vpi and weighted policies!')
            policy = 'weighted'
        pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                update_mode=update_mode,
                                update_type=update_type,
                                q_max=q_max,
                                q_min=q_min,
                                **algorithm_params)
        if double:
            agent = ParticleDoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = ParticleQLearning(pi, mdp.info, **algorithm_params)
        epsilon_train = Parameter(0)
    else:
        raise ValueError()

    # Algorithm
    collect_dataset = CollectDataset()
    collect_qs_callback = CollectQs(agent.approximator)
    callbacks = [collect_dataset]
    if collect_qs:
        callbacks += [collect_qs_callback]
    core = Core(agent, mdp, callbacks)

    train_scores = []
    test_scores = []

    for n_epoch in range(1, max_steps // evaluation_frequency + 1):

        # Train
        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_train)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(False)
        core.learn(n_steps=evaluation_frequency, n_steps_per_fit=1, quiet=True)
        dataset = collect_dataset.get()
        scores = compute_scores(dataset, mdp.info.gamma)

        # print('Train: ', scores)
        train_scores.append(scores)

        collect_dataset.clean()
        mdp.reset()

        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_test)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(True)
        dataset = core.evaluate(n_steps=test_samples, quiet=True)
        mdp.reset()
        scores = compute_scores(dataset, mdp.info.gamma)
        # print('Evaluation: ', scores)
        test_scores.append(scores)
    if collect_qs:
        qs = collect_qs_callback.get_values()
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        np.save(out_dir + '/' + file_name, qs)
    return train_scores, test_scores
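
set_global_seeds is imported from elsewhere in the project. A minimal stand-in (an assumption: the real helper may also seed the environment or other libraries) is:

import random
import numpy as np

def set_global_seeds(seed):
    random.seed(seed)
    np.random.seed(seed)
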
Example #6
def experiment(algorithm,
               name,
               update_mode,
               update_type,
               policy,
               n_approximators,
               q_max,
               q_min,
               lr_exp,
               file_name,
               out_dir,
               particles,
               R=1,
               m=1,
               collect_qs=False,
               seed=0):
    set_global_seeds(seed)
    print('Using seed %s' % seed)

    # MDP
    if name == 'Taxi':
        mdp = generate_taxi('../../grid.txt', horizon=5000)
        max_steps = 500000
        evaluation_frequency = 5000
        test_samples = 5000
    elif name == 'Chain':
        mdp = generate_chain(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'Loop':
        mdp = generate_loop(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'RiverSwim':
        mdp = generate_river(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'SixArms':
        mdp = generate_arms(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'KnightQuest':
        mdp = None
        try:
            mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        except:
            register(
                id='KnightQuest-v0',
                entry_point='envs.knight_quest:KnightQuest',
            )
            mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    else:
        raise NotImplementedError

    epsilon_test = Parameter(0)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1.,
                                              decay_exp=lr_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)

    if algorithm == 'particle-ql':
        delta = 0.1
        if policy not in ['weighted', 'vpi', 'ucb']:
            warnings.warn(
                'Particle QL available with only vpi, weighted and ucb policies!')
            policy = 'weighted'
        if policy == 'ucb':
            pi = UCBPolicy(delta=delta, q_max=R / (1 - mdp.info.gamma))
        else:
            pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                update_mode=update_mode,
                                update_type=update_type,
                                q_max=q_max,
                                q_min=q_min,
                                delta=delta,
                                init_values=particles,
                                **algorithm_params)

        agent = ParticleQLearning(pi, mdp.info, **algorithm_params)
        if policy == 'ucb':
            q = agent.approximator
            quantiles = [
                i * 1. / (n_approximators - 1) for i in range(n_approximators)
            ]
            for p in range(n_approximators):
                if quantiles[p] >= 1 - delta:
                    particle_bound = p
                    break

            def quantile_func(state, quantile):
                q_list = list()
                for i in range(n_approximators):
                    q_list.append(q.predict(state, idx=i))
                qs = np.array(q_list)
                out = np.zeros(qs.shape[1])
                out[:] = qs[particle_bound, :]
                return out

            def mu(state):
                q_list = list()
                for i in range(n_approximators):
                    q_list.append(q.predict(state, idx=i))
                qs = np.array(q_list)
                return np.mean(qs, axis=0)

            pi.set_quantile_func(quantile_func)
            pi.set_mu(mu)
        epsilon_train = Parameter(0)
    elif algorithm == 'delayed-ql':
        algorithm_params = dict(R=R, m=m, **algorithm_params)

        agent = DelayedQLearning(mdp.info, **algorithm_params)
        pi = agent
    else:
        raise ValueError()

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    if collect_qs:
        collect_qs_callback = CollectQs(agent.approximator)
        callbacks += [collect_qs_callback]
    core = Core(agent, mdp, callbacks)

    train_scores = []
    test_scores = []

    for n_epoch in range(1, max_steps // evaluation_frequency + 1):

        # Train
        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_train)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(False)
        core.learn(n_steps=evaluation_frequency, n_steps_per_fit=1, quiet=True)
        dataset = collect_dataset.get()
        scores = compute_scores(dataset, mdp.info.gamma)

        # print('Train: ', scores)
        train_scores.append(scores)

        collect_dataset.clean()
        mdp.reset()

        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_test)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(True)
        dataset = core.evaluate(n_steps=test_samples, quiet=True)
        mdp.reset()
        scores = compute_scores(dataset, mdp.info.gamma)
        # print('Evaluation: ', scores)
        test_scores.append(scores)
    if collect_qs:
        qs = collect_qs_callback.get_values()
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        np.save(out_dir + '/' + file_name, qs)
    return train_scores, test_scores
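
Example #6 receives a particles argument and forwards it as init_values to the particle Q-learning agent. One plausible way for the caller to build it (an assumption; the original project may initialise the particles differently) is to spread n_approximators initial Q-values uniformly between q_min and q_max:

import numpy as np

def uniform_particles(n_approximators, q_min, q_max):
    # One initial Q-value per particle, evenly spaced over [q_min, q_max].
    return np.linspace(q_min, q_max, n_approximators)
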
Example #7
File: run.py Project: czgdp1807/wql
def experiment(algorithm, name, update_mode, update_type, policy, n_approximators, q_max, q_min,
               lr_exp, R, log_lr, r_max_m, delayed_m, delayed_epsilon, delta, debug, double,
               regret_test, a, b, mbie_C, value_iterations, tolerance, file_name, out_dir,
               collect_qs,  seed):
    set_global_seeds(seed)
    print('Using seed %s' % seed)
    # MDP
    if name == 'Taxi':
        mdp = generate_taxi('../grid.txt', horizon=5000, gamma=0.99)
        max_steps = 500000
        evaluation_frequency = 5000
        test_samples = 5000
    elif name == 'Chain':
        mdp = generate_chain(horizon=100, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'Gridworld':
        mdp = generate_gridworld(horizon=100, gamma=0.99)
        max_steps = 500000
        evaluation_frequency = 5000
        test_samples = 1000
    elif name == 'Loop':
        mdp = generate_loop(horizon=100, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'RiverSwim':
        mdp = generate_river(horizon=100, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
        mbie_C = 0.4
    elif name == 'SixArms':
        mdp = generate_arms(horizon=100, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
        mbie_C = 0.8
    elif name == 'ThreeArms':
        horizon = 100
        mdp = generate_three_arms(horizon=horizon, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'KnightQuest':
        mdp = None
        try:
            mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        except:
            register(
                id='KnightQuest-v0',
                entry_point='envs.knight_quest:KnightQuest',
            )
            mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    else:
        raise NotImplementedError

    epsilon_test = Parameter(0)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1., decay_exp=lr_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    if regret_test:

        max_steps = int(args.max_steps_regret * 1e6)
        evaluation_frequency = max_steps // 100
        test_samples = 1000
        if debug:
            max_steps = 100000
            evaluation_frequency = max_steps // 100
            test_samples = 1000
        
    if algorithm == 'ql':
        if policy not in ['boltzmann', 'eps-greedy']:
            warnings.warn('QL available with only boltzmann and eps-greedy policies!')
            policy = 'eps-greedy'

        if policy == 'eps-greedy':
            epsilon_train = ExponentialDecayParameter(value=1., decay_exp=lr_exp,
                                                      size=mdp.info.observation_space.size)
            pi = policy_dict[policy](epsilon=epsilon_train)
        else:
            beta_train = ExponentialDecayParameter(value=1.5 * q_max, decay_exp=.5,
                                                      size=mdp.info.observation_space.size)
            pi = policy_dict[policy](beta=beta_train)
        if double:
            agent = DoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = QLearning(pi, mdp.info, **algorithm_params)
    elif algorithm == 'boot-ql':
        if policy not in ['boot', 'weighted']:
            warnings.warn('Bootstrapped QL available with only boot and weighted policies!')
            policy = 'boot'
        pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                mu=(q_max + q_min) / 2,
                                sigma=(q_max - q_min)/2,
                                **algorithm_params)
        if double:
            agent = BootstrappedDoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = BootstrappedQLearning(pi, mdp.info, **algorithm_params)
        epsilon_train = Parameter(0)
    elif algorithm == 'particle-ql':
        if policy not in ['weighted', 'ucb']:
            warnings.warn('Particle QL available with only ucb and weighted policies!')
            policy = 'weighted'
        if policy == 'ucb':
            pi = UCBPolicy(delta=delta, q_max=R/(1-mdp.info.gamma))
        else:
            pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                update_mode=update_mode,
                                update_type=update_type,
                                q_max=q_max,
                                q_min=q_min,
                                delta=delta,
                                **algorithm_params)
        if double:
            agent = ParticleDoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = ParticleQLearning(pi, mdp.info, **algorithm_params)

        epsilon_train = Parameter(0)
    elif algorithm == 'r-max':
        thr_1 = int(np.ceil((4 * mdp.info.size[0] * 1.0/(1-mdp.info.gamma) * R )**3))

        algorithm_params = dict(
            rmax=R,
            s_a_threshold=r_max_m
        )
        agent = RMaxAgent(mdp.info, **algorithm_params)
        pi = agent
        epsilon_train = Parameter(0)
    elif algorithm == 'mbie':
        algorithm_params = dict(
            rmax=R,
            C=mbie_C,
            value_iterations=value_iterations,
            tolerance=tolerance
        )
        agent = MBIE_EB(mdp.info, **algorithm_params)

        pi = agent
        epsilon_train = Parameter(0)
    elif algorithm == 'delayed-ql':
        theoretic_m = delayed_m
        if regret_test:
            gamma = mdp.info.gamma
            Vmax = R / (1 - gamma)
            epsilon = args.delayed_ratio * Vmax
            delayed_epsilon = epsilon*(1-gamma)
            delta = 0.1
            S, A = mdp.info.size

            theoretic_m = (1 + gamma*Vmax)**2 / (2*delayed_epsilon**2) * np.log(3*S*A/delta * (1 + S*A/(delayed_epsilon*(1-gamma))))
            if debug:
                print("Delta:{}".format(delta))
                print("R:{}".format(R))
                print("Vmax:{}".format(Vmax))
                print("Gamma:{}".format(mdp.info.gamma))
                print("Epsilon:{}".format(epsilon))
                #print("k:{}".format(k))
                print("m:{}".format(theoretic_m))
                print("S:{}".format(S))
                print("A:{}".format(A))
                input()
            def evaluate_policy(P, R, policy):

                P_pi = np.zeros((S, S))
                R_pi = np.zeros(S)

                for s in range(S):
                    for s1 in range(S):
                        P_pi[s,s1] = np.sum(policy[s, :] * P[s, :, s1])
                    R_pi[s] = np.sum(policy[s, :] * np.sum(P[s, :, :] * R[s, :, :], axis=-1))
                I = np.diag(np.ones(S))
                V = np.linalg.solve(I - gamma * P_pi, R_pi)

                return V
        algorithm_params = dict(
            R=R,
            m=theoretic_m,
            delta=delta,
            epsilon=delayed_epsilon,
            **algorithm_params)

        agent = DelayedQLearning(mdp.info, **algorithm_params)
        if regret_test:
            collect_vs_callback = CollectVs(mdp, agent, evaluate_policy, args.freq_collection)
            if debug:
                print("Q:")
                print(agent.get_approximator()[:, :])
                print("Policy:")
                print(agent.get_policy())
                print("V:{}".format(evaluate_policy(mdp.p,mdp.r,agent.get_policy())))
                input()

        pi = agent
        epsilon_train = Parameter(0)
    elif algorithm == 'gaussian-ql':
        if policy not in ['weighted-gaussian', 'ucb']:
            warnings.warn('Gaussian QL available with only ucb and weighted-gaussian policies!')
            policy = 'weighted-gaussian'
        if policy == 'ucb':
            pi = UCBPolicy(delta=delta, q_max=R/(1-mdp.info.gamma))
        else:
            pi = policy_dict[policy]()
        q_0 = (q_max - q_min) / 2
        sigma_0 = (q_max - q_min) / np.sqrt(12)
        C = 2 * R / (np.sqrt(2 * np.pi) * (1 - mdp.info.gamma) * sigma_0)
        sigma_lr = None
        if log_lr:
            sigma_lr = LogarithmicDecayParameter(value=1., C=C,
                                             size=mdp.info.size)
        init_values = (q_0, sigma_0)
        if regret_test:
            sigma_lr = None
            gamma = mdp.info.gamma
            T = max_steps
            S, A = mdp.info.size
            a = (2 + gamma) / (2 *(1 - gamma))
            b = a - 1
            c = 1
            d = b
            q_max = R / (1 - gamma)
            standard_bound = norm.ppf(1 - delta, loc=0, scale=1)
            #first_fac = np.sqrt(b + T)
            #second_fac = np.sqrt(a * np.log(S*A*T / delta))
            #sigma2_factor = min(np.sqrt(b + T), np.sqrt(a * np.log(S*A*T / delta)))

            q_0 = q_max
            sigma1_0 = 0
            #sigma2_0 = (R + gamma * q_max) / (standard_bound * np.sqrt(c-1)) * sigma2_factor

            sigma2_0 = (gamma * q_max) / (c * standard_bound) * np.sqrt(a * np.log(S * A * T / delta))
            init_values = (q_0, sigma1_0, sigma2_0)
            learning_rate = TheoreticalParameter(a=a, b=b, decay_exp=1,
                                                 size=mdp.info.size)
            learning_rate_sigma1 = TheoreticalParameter(a=a, b=b, decay_exp=1,
                                                 size=mdp.info.size)
            algorithm_params = dict(learning_rate=learning_rate,
                                    sigma_1_learning_rate=learning_rate_sigma1)

            sigma_lr = BetaParameter(c=c, d=d, size=mdp.info.size)
            def evaluate_policy(P, R, policy):

                P_pi = np.zeros((S, S))
                R_pi = np.zeros(S)

                for s in range(S):
                    for s1 in range(S):
                        P_pi[s,s1] = np.sum(policy[s, :] * P[s, :, s1])

                    R_pi[s] = np.sum(policy[s, :] * np.sum(P[s, :, :] * R[s, :, :],axis=-1))
                I = np.diag(np.ones(S))

                V = np.linalg.solve(I - gamma * P_pi, R_pi)
                return V
            if debug:
                print("Delta:{}".format(delta))
                print("R:{}".format(R))
                print("Gamma:{}".format(mdp.info.gamma))
                print("mu0:{}".format(q_0))
                print("Sigma1_0:{}".format(sigma1_0))
                print("Sigma2_0:{}".format(sigma2_0))
                print("a:{}".format(a))
                print("b:{}".format(b))
                print("c:{}".format(c))
                print("d:{}".format(d))
                print("T:{}".format(T))
                print("S:{}".format(S))
                print("A:{}".format(A))
                input()

        algorithm_params = dict(
            update_mode=update_mode,
            update_type=update_type,
            sigma_learning_rate=sigma_lr,
            init_values=init_values,
            delta=delta,
            q_max=q_max,
            **algorithm_params)
        if double and not regret_test:
            agent = GaussianDoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = GaussianQLearning(pi, mdp.info, **algorithm_params)
        if regret_test:
            if debug:
                freq = 10
            else:
                freq = args.freq_collection
            collect_vs_callback = CollectVs(mdp, agent, evaluate_policy, freq)
        if debug:
            print("Policy:")
            print(agent.get_policy())
            print("Q")
            for state in range(S):
                means = np.array(agent.approximator.predict(np.array([state]), idx=0))
                sigmas1 = np.array(agent.approximator.predict(np.array([state]), idx=1))
                sigmas2 = np.array(agent.approximator.predict(np.array([state]), idx=2))
                print("Means:{}".format(means))
                print("Sigmas1:{}".format(sigmas1))
                print("Sigmas2:{}".format(sigmas2))
            print("V:{}".format(evaluate_policy(mdp.p,mdp.r,agent.get_policy())))
            input()
        if policy == 'ucb':
            q = agent.approximator
            standard_bound = norm.ppf(1 - delta, loc=0, scale=1)
            def quantile_func(state):
                means = np.array(q.predict(state, idx=0))
                if regret_test:
                    sigmas1 = np.array(q.predict(state, idx=1))
                    sigmas2 = np.array(q.predict(state, idx=2))
                    sigmas = sigmas2
                    #print(sigmas1, sigmas2)
                else:
                    sigmas = np.array(q.predict(state, idx=1))
                out = sigmas * standard_bound + means
                return out

            def mu(state):
                q_list = q.predict(state, idx=0)
                means = np.array(q_list)

                return means
            pi.set_quantile_func(quantile_func)
            pi.set_mu(mu)
        epsilon_train = Parameter(0)
    else:
        raise ValueError()

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    if collect_qs:
        if algorithm not in ['r-max']:
            collect_qs_callback = CollectQs(agent.approximator)
            callbacks += [collect_qs_callback]

    if regret_test:
        callbacks += [collect_vs_callback]
    core = Core(agent, mdp, callbacks)

    train_scores = []
    test_scores = []

    for n_epoch in range(1, max_steps // evaluation_frequency + 1):

        # Train
        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_train)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(False)
        if regret_test:
            collect_vs_callback.on()
        core.learn(n_steps=evaluation_frequency, n_steps_per_fit=1, quiet=True)
        dataset = collect_dataset.get()
        scores = compute_scores(dataset, mdp.info.gamma)

        #print('Train: ', scores)
        train_scores.append(scores)

        collect_dataset.clean()
        mdp.reset()
        if regret_test:
            vs = collect_vs_callback.get_values()
            if not os.path.exists(out_dir):
                os.makedirs(out_dir)
            print("Finished {} steps.".format(n_epoch * evaluation_frequency))
            np.save(out_dir + "/vs_" + algorithm+"_"+str(seed), vs)
            np.save(out_dir+"/scores_online" + str(seed), train_scores)
            collect_vs_callback.off()
        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_test)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(True)
        dataset = core.evaluate(n_steps=test_samples, quiet=True)
        s = mdp.reset()
        scores = compute_scores(dataset, mdp.info.gamma)
        print('Evaluation #%d:%s ' %(n_epoch, scores))
        if debug:
            print("Policy:")
            print(agent.get_policy())
            print("Q")
            for state in range(S):
                means = np.array(agent.approximator.predict(np.array([state]), idx=0))
                sigmas1 = np.array(agent.approximator.predict(np.array([state]), idx=1))
                sigmas2 = np.array(agent.approximator.predict(np.array([state]), idx=2))
                print("Means:{}".format(means))
                print("Sigmas1:{}".format(sigmas1))
                print("Sigmas2:{}".format(sigmas2))
            print("V:{}".format(evaluate_policy(mdp.p, mdp.r, agent.get_policy())))
            input()
        test_scores.append(scores)
        if regret_test:
            np.save(out_dir + "/scores_offline" + str(seed), test_scores)
    if collect_qs:
        qs = collect_qs_callback.get_values()
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        np.save(out_dir + '/' + file_name, qs)

    return train_scores, test_scores
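
For reference, the evaluate_policy helpers defined inside Example #7 perform exact policy evaluation: with P^pi and R^pi the policy-averaged transition matrix and reward vector assembled in the loops, the np.linalg.solve call computes the fixed point of the Bellman expectation equation

    V^pi = (I - gamma * P^pi)^(-1) * R^pi
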
Example #8
def experiment(policy, name, folder_name):
    np.random.seed()

    # Argument parser
    parser = argparse.ArgumentParser()

    arg_mdp = parser.add_argument_group('Environment')
    arg_mdp.add_argument("--horizon", type=int)
    arg_mdp.add_argument("--gamma", type=float)

    arg_mem = parser.add_argument_group('Replay Memory')
    arg_mem.add_argument("--initial-replay-size",
                         type=int,
                         default=100,
                         help='Initial size of the replay memory.')
    arg_mem.add_argument("--max-replay-size",
                         type=int,
                         default=5000,
                         help='Max size of the replay memory.')

    arg_net = parser.add_argument_group('Deep Q-Network')
    arg_net.add_argument("--n-features", type=int, default=80)
    arg_net.add_argument(
        "--optimizer",
        choices=['adadelta', 'adam', 'rmsprop', 'rmspropcentered'],
        default='adam',
        help='Name of the optimizer to use to learn.')
    arg_net.add_argument("--learning-rate",
                         type=float,
                         default=.0001,
                         help='Learning rate value of the optimizer. Only used'
                         ' in rmspropcentered')
    arg_net.add_argument("--decay",
                         type=float,
                         default=.95,
                         help='Discount factor for the history coming from the'
                         ' gradient momentum in rmspropcentered')
    arg_net.add_argument("--epsilon",
                         type=float,
                         default=.01,
                         help='Epsilon term used in rmspropcentered')

    arg_alg = parser.add_argument_group('Algorithm')
    arg_alg.add_argument("--weighted-update", action='store_true')
    arg_alg.add_argument(
        "--n-approximators",
        type=int,
        default=10,
        help="Number of approximators used in the ensemble for"
        "Averaged DQN.")
    arg_alg.add_argument("--batch-size",
                         type=int,
                         default=100,
                         help='Batch size for each fit of the network.')
    arg_alg.add_argument("--history-length",
                         type=int,
                         default=1,
                         help='Number of frames composing a state.')
    arg_alg.add_argument("--target-update-frequency",
                         type=int,
                         default=100,
                         help='Number of collected samples before each update'
                         ' of the target network.')
    arg_alg.add_argument("--evaluation-frequency",
                         type=int,
                         default=1000,
                         help='Number of learning steps before each evaluation.'
                         ' This number represents an epoch.')
    arg_alg.add_argument("--train-frequency",
                         type=int,
                         default=1,
                         help='Number of learning steps before each fit of the'
                         ' neural network.')
    arg_alg.add_argument("--max-steps",
                         type=int,
                         default=50000,
                         help='Total number of learning steps.')
    arg_alg.add_argument(
        "--final-exploration-frame",
        type=int,
        default=1,
        help='Number of steps until the exploration rate stops'
        ' decreasing.')
    arg_alg.add_argument("--initial-exploration-rate",
                         type=float,
                         default=0.,
                         help='Initial value of the exploration rate.')
    arg_alg.add_argument("--final-exploration-rate",
                         type=float,
                         default=0.,
                         help='Final value of the exploration rate. When it'
                         ' reaches this value, it stays constant.')
    arg_alg.add_argument("--test-exploration-rate",
                         type=float,
                         default=0.,
                         help='Exploration rate used during evaluation.')
    arg_alg.add_argument("--test-samples",
                         type=int,
                         default=1000,
                         help='Number of steps for each evaluation.')
    arg_alg.add_argument("--max-no-op-actions",
                         type=int,
                         default=0,
                         help='Maximum number of no-op actions performed at the'
                         ' beginning of the episodes. The minimum number is'
                         ' history_length.')
    arg_alg.add_argument("--no-op-action-value",
                         type=int,
                         default=0,
                         help='Value of the no-op action.')
    arg_alg.add_argument("--p-mask", type=float, default=1.)

    arg_utils = parser.add_argument_group('Utils')
    arg_utils.add_argument('--load-path',
                           type=str,
                           help='Path of the model to be loaded.')
    arg_utils.add_argument('--save',
                           action='store_true',
                           help='Flag specifying whether to save the model.')
    arg_utils.add_argument('--render',
                           action='store_true',
                           help='Flag specifying whether to render the game.')
    arg_utils.add_argument('--quiet',
                           action='store_true',
                           help='Flag specifying whether to hide the progress'
                           ' bar.')
    arg_utils.add_argument('--debug',
                           action='store_true',
                           help='Flag specifying whether the script has to be'
                           ' run in debug mode.')

    args = parser.parse_args()

    scores = list()

    # Evaluation of the model provided by the user.
    if args.load_path:
        # MDP
        if name != 'Taxi':
            mdp = Gym(name, args.horizon, args.gamma)
            n_states = None
            gamma_eval = 1.
        else:
            mdp = generate_taxi('../../grid.txt')
            n_states = mdp.info.observation_space.size[0]
            gamma_eval = mdp.info.gamma

        # Policy
        epsilon_test = Parameter(value=args.test_exploration_rate)
        pi = BootPolicy(args.n_approximators, epsilon=epsilon_test)

        # Approximator
        input_shape = mdp.info.observation_space.shape + (
            args.history_length, )
        input_preprocessor = list()
        approximator_params = dict(input_shape=input_shape,
                                   output_shape=(mdp.info.action_space.n, ),
                                   n_states=n_states,
                                   n_actions=mdp.info.action_space.n,
                                   n_features=args.n_features,
                                   n_approximators=args.n_approximators,
                                   input_preprocessor=input_preprocessor,
                                   name='test',
                                   load_path=args.load_path,
                                   optimizer={
                                       'name': args.optimizer,
                                       'lr': args.learning_rate,
                                       'decay': args.decay,
                                       'epsilon': args.epsilon
                                   })

        approximator = SimpleNet

        # Agent
        algorithm_params = dict(batch_size=0,
                                initial_replay_size=0,
                                max_replay_size=0,
                                history_length=1,
                                clip_reward=False,
                                n_approximators=args.n_approximators,
                                train_frequency=1,
                                target_update_frequency=1,
                                max_no_op_actions=args.max_no_op_actions,
                                no_op_action_value=args.no_op_action_value,
                                p_mask=args.p_mask,
                                weighted_update=args.weighted_update)
        agent = DoubleDQN(approximator,
                          pi,
                          mdp.info,
                          approximator_params=approximator_params,
                          **algorithm_params)

        # Algorithm
        core_test = Core(agent, mdp)

        # Evaluate model
        pi.set_eval(True)
        dataset = core_test.evaluate(n_steps=args.test_samples,
                                     render=args.render,
                                     quiet=args.quiet)
        get_stats(dataset, gamma_eval)
    else:
        # DQN learning run

        # Settings
        if args.debug:
            initial_replay_size = 50
            max_replay_size = 500
            train_frequency = 5
            target_update_frequency = 10
            test_samples = 20
            evaluation_frequency = 50
            max_steps = 1000
        else:
            initial_replay_size = args.initial_replay_size
            max_replay_size = args.max_replay_size
            train_frequency = args.train_frequency
            target_update_frequency = args.target_update_frequency
            test_samples = args.test_samples
            evaluation_frequency = args.evaluation_frequency
            max_steps = args.max_steps

        # MDP
        if name != 'Taxi':
            mdp = Gym(name, args.horizon, args.gamma)
            n_states = None
            gamma_eval = 1.
        else:
            mdp = generate_taxi('../../grid.txt')
            n_states = mdp.info.observation_space.size[0]
            gamma_eval = mdp.info.gamma

        # Policy
        epsilon = LinearDecayParameter(value=args.initial_exploration_rate,
                                       min_value=args.final_exploration_rate,
                                       n=args.final_exploration_frame)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        epsilon_random = Parameter(value=1.)

        if policy == 'boot':
            pi = BootPolicy(args.n_approximators, epsilon=epsilon_random)
        elif policy == 'weighted':
            pi = WeightedPolicy(args.n_approximators, epsilon=epsilon_random)
        else:
            raise ValueError

        # Approximator
        input_shape = mdp.info.observation_space.shape + (
            args.history_length, )
        input_preprocessor = list()
        approximator_params = dict(input_shape=input_shape,
                                   output_shape=(mdp.info.action_space.n, ),
                                   n_states=n_states,
                                   n_actions=mdp.info.action_space.n,
                                   n_features=args.n_features,
                                   n_approximators=args.n_approximators,
                                   input_preprocessor=input_preprocessor,
                                   folder_name=folder_name,
                                   optimizer={
                                       'name': args.optimizer,
                                       'lr': args.learning_rate,
                                       'decay': args.decay,
                                       'epsilon': args.epsilon
                                   })

        approximator = SimpleNet

        # Agent
        algorithm_params = dict(
            batch_size=args.batch_size,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size,
            history_length=args.history_length,
            clip_reward=False,
            n_approximators=args.n_approximators,
            train_frequency=train_frequency,
            target_update_frequency=target_update_frequency,
            max_no_op_actions=args.max_no_op_actions,
            no_op_action_value=args.no_op_action_value,
            p_mask=args.p_mask,
            weighted_update=args.weighted_update)

        agent = DoubleDQN(approximator,
                          pi,
                          mdp.info,
                          approximator_params=approximator_params,
                          **algorithm_params)

        # Algorithm
        core = Core(agent, mdp)
        core_test = Core(agent, mdp)

        # RUN

        # Fill replay memory with random dataset
        print_epoch(0)
        core.learn(n_steps=initial_replay_size,
                   n_steps_per_fit=initial_replay_size,
                   quiet=args.quiet)

        if args.save:
            agent.approximator.model.save()

        # Evaluate initial policy
        pi.set_eval(True)
        pi.set_epsilon(epsilon_test)
        dataset = core_test.evaluate(n_steps=test_samples,
                                     render=args.render,
                                     quiet=args.quiet)
        scores.append(get_stats(dataset, gamma_eval))

        for n_epoch in range(1, max_steps // evaluation_frequency + 1):
            print_epoch(n_epoch)
            print('- Learning:')
            pi.set_eval(False)
            pi.set_epsilon(epsilon)
            # learning step
            core.learn(n_steps=evaluation_frequency,
                       n_steps_per_fit=train_frequency,
                       quiet=args.quiet)

            if args.save:
                agent.approximator.model.save()

            print('- Evaluation:')
            # evaluation step
            core_test.reset()
            pi.set_eval(True)
            pi.set_epsilon(epsilon_test)
            dataset = core_test.evaluate(n_steps=test_samples,
                                         render=args.render,
                                         quiet=args.quiet)
            scores.append(get_stats(dataset, gamma_eval))

    return scores
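
A small sketch of how the scores returned by Example #8 could be inspected offline (this assumes get_stats returns a (min, max, mean, n_episodes) tuple, as in the hypothetical stand-in after Example #3; the real helper may differ):

import numpy as np
import matplotlib.pyplot as plt

scores = np.array(experiment('boot', 'Taxi', './logs'))
plt.plot(scores[:, 2])  # mean return per evaluation epoch
plt.xlabel('epoch')
plt.ylabel('mean return')
plt.show()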