Example 1
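These experiment scripts share a common set of agents and helpers that the excerpts do not import explicitly. A minimal import sketch; the module paths are assumptions, not confirmed by the excerpts:

# Assumed imports for the experiment scripts below (module paths are guesses
# and may differ in the actual code base).
from llrl.agents.rmax import RMax
from llrl.agents.lrmax import LRMax
from llrl.agents.maxqinit import MaxQInit
from llrl.agents.lrmaxqinit import LRMaxQInit
from llrl.utils.env_handler import make_env_distribution
from llrl.experiments import run_agents_lifelong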
def experiment(p):
    # Parameters
    gamma = .9
    n_env = 5
    size = p['size']
    env_distribution = make_env_distribution(
        env_class='tight', n_env=n_env, gamma=gamma,
        env_name=p['name'],
        w=size,
        h=size,
        stochastic=p['stochastic']
    )
    actions = env_distribution.get_actions()
    n_known = p['n_known']
    p_min = 1. / float(n_env)
    epsilon_q = .01
    epsilon_m = .01
    delta = .1
    r_max = 1.
    v_max = p['v_max']
    n_states = 4
    max_mem = 1

    # Agents
    rmax = RMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, name='RMax')
    lrmax = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                  deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                  max_memory_size=max_mem, prior=None, estimate_distances_online=True,
                  min_sampling_probability=p_min, name='LRMax')
    lrmax_p01 = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                      deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                      max_memory_size=max_mem, prior=0.1, estimate_distances_online=True,
                      min_sampling_probability=p_min, name='LRMax(Dmax=0.1)')
    lrmax_p02 = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                      deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                      max_memory_size=max_mem, prior=0.2, estimate_distances_online=True,
                      min_sampling_probability=p_min, name='LRMax(Dmax=0.2)')
    maxqinit = MaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                        deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                        min_sampling_probability=p_min, name='MaxQInit')
    lrmaxqinit = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                            deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta,
                            n_states=n_states, max_memory_size=max_mem, prior=None, estimate_distances_online=True,
                            min_sampling_probability=p_min, name='LRMaxQInit')
    lrmaxqinit_p01 = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                                deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta,
                                n_states=n_states, max_memory_size=max_mem, prior=0.1, estimate_distances_online=True,
                                min_sampling_probability=p_min, name='LRMaxQInit(Dmax=0.1)')
    lrmaxqinit_p02 = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                                deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta,
                                n_states=n_states, max_memory_size=max_mem, prior=0.2, estimate_distances_online=True,
                                min_sampling_probability=p_min, name='LRMaxQInit(Dmax=0.2)')
    agents_pool = [rmax, lrmax, lrmax_p01, lrmax_p02, maxqinit, lrmaxqinit, lrmaxqinit_p01, lrmaxqinit_p02]

    # Run
    run_agents_lifelong(agents_pool, env_distribution, n_instances=3, n_tasks=p['n_tasks'], n_episodes=p['n_episodes'],
                        n_steps=p['n_steps'],
                        reset_at_terminal=False, open_plot=False, plot_title=True, do_run=True, do_plot=True,
                        parallel_run=True, n_processes=None)
Example 2
    def reset(self):
        """
        Reset the attributes to initial state (called between instances).
        Save the previous model.
        :return: None
        """
        LRMax.reset(self)

        n_bound_use = self.n_rmax + self.n_lip
        if n_bound_use > 0:
            # Save ratio
            ratio_rmax_bound_use = self.n_rmax / n_bound_use
            ratio_lip_bound_use = self.n_lip / n_bound_use
            if self.write_data:
                self.write(init=False,
                           ratio_rmax_bound_use=ratio_rmax_bound_use,
                           ratio_lip_bound_use=ratio_lip_bound_use)

            # Reset
            self.n_rmax = 0
            self.n_lip = 0

        # Reset recorded variables between MDPs
        self.discounted_return = 0.
        self.total_return = 0.
        self.n_time_steps = 0
        self.update_time_steps = []
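The write() helper called in the method above is not part of this excerpt. A minimal sketch of what it could look like, assuming it appends the named values to a CSV file under self.path (both the behavior and the file layout are assumptions):

    def write(self, init=False, **kwargs):
        # Hypothetical sketch (not in the excerpt): append the named values
        # to a CSV file, writing a header row when init=True.
        import csv
        import os
        file_name = os.path.join(self.path, self.name + '.csv')
        with open(file_name, 'w' if init else 'a', newline='') as f:
            writer = csv.writer(f)
            if init:
                writer.writerow(list(kwargs.keys()))  # header row
            else:
                writer.writerow(list(kwargs.values()))  # one record per call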
Example 3
def experiment():
    n_env = 5
    env_distribution = make_env_distribution(env_class='maze-mono-goal', env_name='maze-mono-goal', n_env=n_env, gamma=GAMMA)
    actions = env_distribution.get_actions()
    p_min = 1. / float(n_env)
    delta = .1

    m = 100
    max_mem = 10
    rmax = RMax(actions=actions, gamma=GAMMA, count_threshold=m)
    rmax_q = MaxQInit(actions=actions, gamma=GAMMA, count_threshold=m, min_sampling_probability=p_min, delta=delta)
    lrmax1 = LRMax(actions=actions, gamma=GAMMA, count_threshold=m, max_memory_size=max_mem, prior=1.)
    lrmax05 = LRMax(actions=actions, gamma=GAMMA, count_threshold=m, max_memory_size=max_mem, prior=0.5)
    lrmax02 = LRMax(actions=actions, gamma=GAMMA, count_threshold=m, max_memory_size=max_mem, prior=0.2)
    lrmax_learn = LRMax(
        actions=actions, gamma=GAMMA, count_threshold=m, max_memory_size=max_mem, prior=None,
        min_sampling_probability=p_min, delta=delta
    )

    agents_pool = [rmax, lrmax1, lrmax05, lrmax02, lrmax_learn, rmax_q]

    run_agents_lifelong(
        agents_pool, env_distribution, samples=20, episodes=100, steps=1000, reset_at_terminal=False,
        open_plot=True, cumulative_plot=False, is_tracked_value_discounted=True, plot_only=False, plot_title=False
    )
Example 4
    def reset(self):
        """
        Reset the attributes to their initial state (called between instances).
        Save the previous model.
        :return: None
        """
        self.update_sa_memory()
        LRMax.reset(self)
Example 5
    def __init__(self,
                 actions,
                 gamma=.9,
                 r_max=1.,
                 v_max=None,
                 deduce_v_max=True,
                 n_known=None,
                 deduce_n_known=True,
                 epsilon_q=0.1,
                 epsilon_m=None,
                 delta=None,
                 n_states=None,
                 max_memory_size=None,
                 prior=None,
                 estimate_distances_online=True,
                 min_sampling_probability=.1,
                 name="ExpLRMax",
                 path='results/'):
        LRMax.__init__(self,
                       actions=actions,
                       gamma=gamma,
                       r_max=r_max,
                       v_max=v_max,
                       deduce_v_max=deduce_v_max,
                       n_known=n_known,
                       deduce_n_known=deduce_n_known,
                       epsilon_q=epsilon_q,
                       epsilon_m=epsilon_m,
                       delta=delta,
                       n_states=n_states,
                       max_memory_size=max_memory_size,
                       prior=prior,
                       estimate_distances_online=estimate_distances_online,
                       min_sampling_probability=min_sampling_probability,
                       name=name)

        # Counters used for experiments (not used by the algorithm itself)
        self.n_rmax = 0  # number of times the R-Max bound is used across all updates of one task
        self.n_lip = 0  # number of times the Lipschitz bound is used across all updates of one task

        # Counters for prior use
        self.n_prior_use = 0  # number of times the prior is used across all updates of one task
        self.n_dista_use = 0  # number of times the estimated distance is used across all updates of one task

        # Recorded variables
        self.discounted_return = 0.
        self.total_return = 0.
        self.n_time_steps = 0  # number of time steps

        self.path = path
        self.write_data = False  # set to True to enable data writing
        self.instance_number = 0
        self.run_number = 0
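The n_rmax and n_lip counters above are pure experiment instrumentation. A hypothetical illustration of where they could be incremented, assuming each update keeps the tighter of the R-Max and Lipschitz upper bounds on Q(s, a) (this helper is not in the source):

    def min_upper_bound(self, u_rmax, u_lip):
        # Hypothetical helper (not in the source): keep the tighter of the
        # two upper bounds and count which one was used.
        if u_lip < u_rmax:
            self.n_lip += 1  # Lipschitz bound is tighter
            return u_lip
        self.n_rmax += 1  # default R-Max bound is used
        return u_rmax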
Example 6
    def __init__(
            self,
            actions,
            gamma=.9,
            r_max=1.,
            v_max=None,
            deduce_v_max=True,
            n_known=None,
            deduce_n_known=True,
            epsilon_q=0.1,
            epsilon_m=None,
            delta=None,
            n_states=None,
            max_memory_size=None,
            prior=None,
            estimate_distances_online=True,
            min_sampling_probability=.1,
            name="LRMaxQInit"
    ):
        """
        :param actions: action space of the environment
        :param gamma: (float) discount factor
        :param r_max: (float) known upper-bound on the reward function
        :param v_max: (float) known upper-bound on the value function
        :param deduce_v_max: (bool) set to True to deduce v_max from r_max
        :param n_known: (int) count after which a state-action pair is considered known
        (only set n_known if delta and epsilon are not defined)
        :param deduce_n_known: (bool) set to True to deduce n_known from (delta, n_states, epsilon_m)
        :param epsilon_q: (float) precision of value iteration algorithm for Q-value computation
        :param epsilon_m: (float) precision of the learned models in L1 norm
        :param delta: (float) models are learned epsilon_m-closely with probability at least 1 - delta
        :param n_states: (int) number of states

        :param max_memory_size: (int) maximum number of saved models (infinity if None)
        :param prior: (float) prior knowledge of maximum model's distance
        :param estimate_distances_online: (bool) set to True for online estimation of a tighter upper-bound for the
        model pseudo-distances. The estimation is valid with high probability.
        :param min_sampling_probability: (float) minimum sampling probability of an environment
        :param name: (str)
        """
        self.name = name
        self.n_required_tasks = mqi.number_of_tasks_for_high_confidence_upper_bound(delta, min_sampling_probability)
        self.maxQ_memory = []  # Upper-bounds on the Q-values of previous MDPs

        LRMax.__init__(self, actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=deduce_v_max,
                       n_known=n_known, deduce_n_known=deduce_n_known, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                       delta=delta, n_states=n_states, max_memory_size=max_memory_size, prior=prior,
                       estimate_distances_online=estimate_distances_online,
                       min_sampling_probability=min_sampling_probability, name=name)
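The helper mqi.number_of_tasks_for_high_confidence_upper_bound is not shown in the excerpts. The underlying bound: an environment sampled with probability at least p_min is still unseen after m tasks with probability at most (1 - p_min)^m, so requiring (1 - p_min)^m <= delta gives m >= log(delta) / log(1 - p_min). A minimal sketch, assuming this is the computation performed:

import math

def number_of_tasks_for_high_confidence_upper_bound(delta, min_sampling_probability):
    # Smallest m with (1 - p_min)^m <= delta: after m tasks, a given
    # environment has been sampled with probability at least 1 - delta.
    return int(math.ceil(math.log(delta) / math.log(1. - min_sampling_probability)))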
Example 7
def experiment():
    # Parameters
    gamma = .9
    env_distribution = make_env_distribution(env_class='deterministic-super-tight',
                                             env_name='deterministic-super-tight-bignknown',
                                             gamma=gamma)
    actions = env_distribution.get_actions()
    n_known = 100
    p_min = 1. / 3.
    epsilon_q = .01
    epsilon_m = .01
    delta = .1
    r_max = 1.
    v_max = 1.
    n_states = 4
    max_mem = 9

    # Agents
    rmax = RMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, name='RMax')
    lrmax = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                  deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                  max_memory_size=max_mem, prior=None, estimate_distances_online=True,
                  min_sampling_probability=p_min, name='LRMax')
    lrmaxprior = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                       deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                       max_memory_size=max_mem, prior=0.1, estimate_distances_online=True,
                       min_sampling_probability=p_min, name='LRMax(Dmax=0.1)')
    maxqinit = MaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                        deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                        min_sampling_probability=p_min, name='MaxQInit')
    lrmaxqinit = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                            deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta,
                            n_states=n_states, max_memory_size=max_mem, prior=None, estimate_distances_online=True,
                            min_sampling_probability=p_min, name='LRMaxQInit')
    lrmaxqinitprior = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                                 deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta,
                                 n_states=n_states, max_memory_size=max_mem, prior=0.1, estimate_distances_online=True,
                                 min_sampling_probability=p_min, name='LRMaxQInit(Dmax=0.1)')
    agents_pool = [rmax, lrmax, lrmaxprior, maxqinit, lrmaxqinit, lrmaxqinitprior]

    # Run
    run_agents_lifelong(agents_pool, env_distribution, n_instances=1, n_tasks=100, n_episodes=200, n_steps=100,
                        reset_at_terminal=False, open_plot=False, plot_title=True, do_run=False, do_plot=True,
                        parallel_run=True, n_processes=None)
Example 8
    def __init__(self,
                 actions,
                 gamma=.9,
                 r_max=1.,
                 v_max=None,
                 deduce_v_max=True,
                 n_known=None,
                 deduce_n_known=True,
                 epsilon_q=0.1,
                 epsilon_m=None,
                 delta=None,
                 n_states=None,
                 max_memory_size=None,
                 prior=None,
                 estimate_distances_online=True,
                 min_sampling_probability=.1,
                 name="ExpLRMax"):
        """
        See LRMax class.
        """
        LRMax.__init__(self,
                       actions=actions,
                       gamma=gamma,
                       r_max=r_max,
                       v_max=v_max,
                       deduce_v_max=deduce_v_max,
                       n_known=n_known,
                       deduce_n_known=deduce_n_known,
                       epsilon_q=epsilon_q,
                       epsilon_m=epsilon_m,
                       delta=delta,
                       n_states=n_states,
                       max_memory_size=max_memory_size,
                       prior=prior,
                       estimate_distances_online=estimate_distances_online,
                       min_sampling_probability=min_sampling_probability,
                       name=name)

        self.time_step = 0  # number of time steps elapsed (reset between instances)
        self.time_step_counter = []  # recorded time_step values (reset between instances)

        self.data = {'n_computation': [0], 'n_prior_use': [0]}  # experiment records: counts of bound computations and prior uses
Example 9
    def reset(self):
        """
        Reset the attributes to initial state (called between instances).
        Save the previous model.
        :return: None
        """
        LRMax.reset(self)
        '''
        n_bound_use = self.n_rmax + self.n_lip
        if n_bound_use > 0:
            if self.write_data:
                self.write(init=False)
        '''
        # Reset counters
        self.reset_counters()

        # Reset recorded variables between MDPs
        self.discounted_return = 0.
        self.total_return = 0.
        self.n_time_steps = 0
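reset_counters() is not included in this excerpt. A minimal sketch, assuming it zeroes the per-task counters defined in the __init__ of Example 5:

    def reset_counters(self):
        # Assumed helper (not in the excerpt): zero the per-task usage counters.
        self.n_rmax = 0
        self.n_lip = 0
        self.n_prior_use = 0
        self.n_dista_use = 0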
Example 10
def experiment():
    # Parameters
    gamma = .9
    env_distribution = make_env_distribution(env_class='stochastic-tight', env_name='stochastic-tight', gamma=gamma)
    actions = env_distribution.get_actions()
    n_known = 10
    p_min = 1. / 7.  # There are seven possible MDPs
    epsilon_q = .1
    epsilon_m = .01
    delta = .1
    r_max = 1.
    v_max = 1.
    n_states = 4
    max_mem = 10

    # Agents
    rmax = RMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, name='RMax')
    lrmax = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                  deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                  max_memory_size=max_mem, prior=None, estimate_distances_online=True,
                  min_sampling_probability=p_min, name='LRMax')
    lrmaxprior = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                       deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                       max_memory_size=max_mem, prior=0.2, estimate_distances_online=True,
                       min_sampling_probability=p_min, name='LRMax(Dmax=0.2)')
    maxqinit = MaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                        deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                        min_sampling_probability=p_min, name='MaxQInit')
    lrmaxqinit = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                            deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta,
                            n_states=n_states, max_memory_size=max_mem, prior=None, estimate_distances_online=True,
                            min_sampling_probability=p_min, name='LRMaxQInit')
    agents_pool = [rmax, lrmax, lrmaxprior, maxqinit]  # , lrmaxqinit]

    # Run
    run_agents_lifelong(agents_pool, env_distribution, n_instances=5, n_tasks=50, n_episodes=50, n_steps=100,
                        reset_at_terminal=False, plot_only=False, open_plot=True, plot_title=True)
Example 11
def example():
    n_env = 4
    env_distribution = make_env_distribution(env_class='test',
                                             n_env=n_env,
                                             gamma=GAMMA,
                                             w=60,
                                             h=20)
    actions = env_distribution.get_actions()

    m = 1  # Count threshold
    max_mem = None
    p_min = 1. / float(n_env)
    delta = 0.99
    lrmax = LRMax(actions=actions,
                  gamma=GAMMA,
                  count_threshold=m,
                  max_memory_size=max_mem,
                  prior=None,
                  min_sampling_probability=p_min,
                  delta=delta)
    rmax_max_q_init = MaxQInit(actions=actions,
                               gamma=GAMMA,
                               count_threshold=m,
                               min_sampling_probability=p_min,
                               delta=delta)
    rmax = RMax(actions=actions, gamma=GAMMA, count_threshold=m)

    run_agents_lifelong([rmax_max_q_init, lrmax, rmax],
                        env_distribution,
                        samples=10,
                        episodes=10,
                        steps=100,
                        reset_at_terminal=False,
                        open_plot=True,
                        cumulative_plot=False,
                        is_tracked_value_discounted=True,
                        plot_only=False)
Example 12
def experiment():
    # Parameters
    gamma = .9
    n_env = 5
    n_states = 20
    env_distribution = make_env_distribution(env_class='corridor',
                                             n_env=n_env,
                                             gamma=gamma,
                                             w=n_states,
                                             h=1)
    actions = env_distribution.get_actions()
    n_known = 1
    p_min = 1. / float(n_env)
    r_max = 1.
    v_max = 10.
    epsilon_q = .01
    epsilon_m = .01
    delta = .1
    max_mem = 1

    # Agents
    rmax = RMax(actions=actions,
                gamma=gamma,
                r_max=r_max,
                v_max=v_max,
                deduce_v_max=False,
                n_known=n_known,
                deduce_n_known=False,
                epsilon_q=epsilon_q,
                epsilon_m=epsilon_m,
                name='RMax')
    lrmax = LRMax(actions=actions,
                  gamma=gamma,
                  r_max=r_max,
                  v_max=v_max,
                  deduce_v_max=False,
                  n_known=n_known,
                  deduce_n_known=False,
                  epsilon_q=epsilon_q,
                  epsilon_m=epsilon_m,
                  delta=delta,
                  n_states=n_states,
                  max_memory_size=max_mem,
                  prior=None,
                  estimate_distances_online=True,
                  min_sampling_probability=p_min,
                  name='LRMax')
    lrmaxprior02 = LRMax(actions=actions,
                         gamma=gamma,
                         r_max=r_max,
                         v_max=v_max,
                         deduce_v_max=False,
                         n_known=n_known,
                         deduce_n_known=False,
                         epsilon_q=epsilon_q,
                         epsilon_m=epsilon_m,
                         delta=delta,
                         n_states=n_states,
                         max_memory_size=max_mem,
                         prior=0.2,
                         estimate_distances_online=False,
                         min_sampling_probability=p_min,
                         name='LRMax(0.2)')
    maxqinit = MaxQInit(actions=actions,
                        gamma=gamma,
                        r_max=r_max,
                        v_max=v_max,
                        deduce_v_max=False,
                        n_known=n_known,
                        deduce_n_known=False,
                        epsilon_q=epsilon_q,
                        epsilon_m=epsilon_m,
                        delta=delta,
                        n_states=n_states,
                        min_sampling_probability=p_min,
                        name='MaxQInit')
    lrmaxqinit = LRMaxQInit(actions=actions,
                            gamma=gamma,
                            r_max=r_max,
                            v_max=v_max,
                            deduce_v_max=False,
                            n_known=n_known,
                            deduce_n_known=False,
                            epsilon_q=epsilon_q,
                            epsilon_m=epsilon_m,
                            delta=delta,
                            n_states=n_states,
                            max_memory_size=max_mem,
                            prior=None,
                            estimate_distances_online=True,
                            min_sampling_probability=p_min,
                            name='LRMaxQInit')
    lrmaxqinitprior02 = LRMaxQInit(actions=actions,
                                   gamma=gamma,
                                   r_max=r_max,
                                   v_max=v_max,
                                   deduce_v_max=False,
                                   n_known=n_known,
                                   deduce_n_known=False,
                                   epsilon_q=epsilon_q,
                                   epsilon_m=epsilon_m,
                                   delta=delta,
                                   n_states=n_states,
                                   max_memory_size=max_mem,
                                   prior=0.2,
                                   estimate_distances_online=True,
                                   min_sampling_probability=p_min,
                                   name='LRMaxQInit(0.2)')
    agents_pool = [
        rmax, lrmax, lrmaxprior02, maxqinit, lrmaxqinit, lrmaxqinitprior02
    ]

    # Run
    run_agents_lifelong(agents_pool,
                        env_distribution,
                        name_identifier=None,
                        n_instances=1,
                        n_tasks=20,
                        n_episodes=20,
                        n_steps=11,
                        reset_at_terminal=False,
                        do_run=False,
                        do_plot=True,
                        open_plot=False,
                        episodes_moving_average=False,
                        episodes_ma_width=10,
                        tasks_moving_average=False,
                        tasks_ma_width=10,
                        latex_rendering=True,
                        plot_title=False)
Example 13
def experiment(p, name):
    # Parameters
    gamma = .9
    n_env = 5
    size = p['size']
    env_distribution = make_env_distribution(env_class='tight',
                                             n_env=n_env,
                                             gamma=gamma,
                                             env_name=name,
                                             version=p['version'],
                                             w=size,
                                             h=size,
                                             stochastic=p['stochastic'],
                                             verbose=False)
    actions = env_distribution.get_actions()
    n_known = p['n_known']
    p_min = 1. / n_env
    epsilon_q = .01
    epsilon_m = .01
    delta = .1
    r_max = 1.
    v_max = 10.
    n_states = 4
    max_mem = 1

    # Agents
    rmax = RMax(actions=actions,
                gamma=gamma,
                r_max=r_max,
                v_max=v_max,
                deduce_v_max=False,
                n_known=n_known,
                deduce_n_known=False,
                epsilon_q=epsilon_q,
                epsilon_m=epsilon_m,
                name='RMax')
    lrmax = LRMax(actions=actions,
                  gamma=gamma,
                  r_max=r_max,
                  v_max=v_max,
                  deduce_v_max=False,
                  n_known=n_known,
                  deduce_n_known=False,
                  epsilon_q=epsilon_q,
                  epsilon_m=epsilon_m,
                  delta=delta,
                  n_states=n_states,
                  max_memory_size=max_mem,
                  prior=None,
                  estimate_distances_online=True,
                  min_sampling_probability=p_min,
                  name='LRMax')
    lrmax_p01 = LRMax(actions=actions,
                      gamma=gamma,
                      r_max=r_max,
                      v_max=v_max,
                      deduce_v_max=False,
                      n_known=n_known,
                      deduce_n_known=False,
                      epsilon_q=epsilon_q,
                      epsilon_m=epsilon_m,
                      delta=delta,
                      n_states=n_states,
                      max_memory_size=max_mem,
                      prior=0.1,
                      estimate_distances_online=True,
                      min_sampling_probability=p_min,
                      name='LRMax(0.1)')
    lrmax_p015 = LRMax(actions=actions,
                       gamma=gamma,
                       r_max=r_max,
                       v_max=v_max,
                       deduce_v_max=False,
                       n_known=n_known,
                       deduce_n_known=False,
                       epsilon_q=epsilon_q,
                       epsilon_m=epsilon_m,
                       delta=delta,
                       n_states=n_states,
                       max_memory_size=max_mem,
                       prior=0.15,
                       estimate_distances_online=True,
                       min_sampling_probability=p_min,
                       name='LRMax(0.15)')
    lrmax_p02 = LRMax(actions=actions,
                      gamma=gamma,
                      r_max=r_max,
                      v_max=v_max,
                      deduce_v_max=False,
                      n_known=n_known,
                      deduce_n_known=False,
                      epsilon_q=epsilon_q,
                      epsilon_m=epsilon_m,
                      delta=delta,
                      n_states=n_states,
                      max_memory_size=max_mem,
                      prior=0.2,
                      estimate_distances_online=True,
                      min_sampling_probability=p_min,
                      name='LRMax(0.2)')
    maxqinit = MaxQInit(actions=actions,
                        gamma=gamma,
                        r_max=r_max,
                        v_max=v_max,
                        deduce_v_max=False,
                        n_known=n_known,
                        deduce_n_known=False,
                        epsilon_q=epsilon_q,
                        epsilon_m=epsilon_m,
                        delta=delta,
                        n_states=n_states,
                        min_sampling_probability=p_min,
                        name='MaxQInit')
    lrmaxqinit = LRMaxQInit(actions=actions,
                            gamma=gamma,
                            r_max=r_max,
                            v_max=v_max,
                            deduce_v_max=False,
                            n_known=n_known,
                            deduce_n_known=False,
                            epsilon_q=epsilon_q,
                            epsilon_m=epsilon_m,
                            delta=delta,
                            n_states=n_states,
                            max_memory_size=max_mem,
                            prior=None,
                            estimate_distances_online=True,
                            min_sampling_probability=p_min,
                            name='LRMaxQInit')
    lrmaxqinit_p01 = LRMaxQInit(actions=actions,
                                gamma=gamma,
                                r_max=r_max,
                                v_max=v_max,
                                deduce_v_max=False,
                                n_known=n_known,
                                deduce_n_known=False,
                                epsilon_q=epsilon_q,
                                epsilon_m=epsilon_m,
                                delta=delta,
                                n_states=n_states,
                                max_memory_size=max_mem,
                                prior=0.1,
                                estimate_distances_online=True,
                                min_sampling_probability=p_min,
                                name='LRMaxQInit(0.1)')
    lrmaxqinit_p015 = LRMaxQInit(actions=actions,
                                 gamma=gamma,
                                 r_max=r_max,
                                 v_max=v_max,
                                 deduce_v_max=False,
                                 n_known=n_known,
                                 deduce_n_known=False,
                                 epsilon_q=epsilon_q,
                                 epsilon_m=epsilon_m,
                                 delta=delta,
                                 n_states=n_states,
                                 max_memory_size=max_mem,
                                 prior=0.15,
                                 estimate_distances_online=True,
                                 min_sampling_probability=p_min,
                                 name='LRMaxQInit(0.15)')
    lrmaxqinit_p02 = LRMaxQInit(actions=actions,
                                gamma=gamma,
                                r_max=r_max,
                                v_max=v_max,
                                deduce_v_max=False,
                                n_known=n_known,
                                deduce_n_known=False,
                                epsilon_q=epsilon_q,
                                epsilon_m=epsilon_m,
                                delta=delta,
                                n_states=n_states,
                                max_memory_size=max_mem,
                                prior=0.2,
                                estimate_distances_online=True,
                                min_sampling_probability=p_min,
                                name='LRMaxQInit(0.2)')
    # agents_pool = [rmax, lrmax, lrmax_p01, lrmax_p015, lrmax_p02, maxqinit, lrmaxqinit, lrmaxqinit_p01, lrmaxqinit_p015, lrmaxqinit_p02]
    agents_pool = [
        rmax, lrmax, lrmax_p02, lrmax_p01, maxqinit, lrmaxqinit, lrmaxqinit_p01
    ]

    # Run
    run_agents_lifelong(agents_pool,
                        env_distribution,
                        n_instances=2,
                        n_tasks=p['n_tasks'],
                        n_episodes=p['n_episodes'],
                        n_steps=p['n_steps'],
                        reset_at_terminal=False,
                        open_plot=False,
                        plot_title=False,
                        plot_legend=2,
                        do_run=True,
                        do_plot=True,
                        parallel_run=True,
                        n_processes=None,
                        episodes_moving_average=True,
                        episodes_ma_width=100,
                        tasks_moving_average=False,
                        latex_rendering=True)
Example 14
def experiment():
    # Parameters
    gamma = .9
    n_env = 5
    w, h = 20, 20
    n_states = w * h
    env_distribution = make_env_distribution(
        env_class='grid-world',
        env_name='grid-world-two-goals-large',
        n_env=n_env,
        gamma=gamma,
        w=w,
        h=h)
    actions = env_distribution.get_actions()
    n_known = 1
    p_min = 1. / float(n_env)
    r_max = 1.
    v_max = 10.
    epsilon_q = .01
    epsilon_m = .01
    delta = .1
    max_mem = 1

    # Agents
    rmax = RMax(actions=actions,
                gamma=gamma,
                r_max=r_max,
                v_max=v_max,
                deduce_v_max=False,
                n_known=n_known,
                deduce_n_known=False,
                epsilon_q=epsilon_q,
                epsilon_m=epsilon_m,
                name='RMax')
    lrmax = LRMax(actions=actions,
                  gamma=gamma,
                  r_max=r_max,
                  v_max=v_max,
                  deduce_v_max=False,
                  n_known=n_known,
                  deduce_n_known=False,
                  epsilon_q=epsilon_q,
                  epsilon_m=epsilon_m,
                  delta=delta,
                  n_states=n_states,
                  max_memory_size=max_mem,
                  prior=None,
                  estimate_distances_online=True,
                  min_sampling_probability=p_min,
                  name='LRMax')
    lrmaxprior02 = LRMax(actions=actions,
                         gamma=gamma,
                         r_max=r_max,
                         v_max=v_max,
                         deduce_v_max=False,
                         n_known=n_known,
                         deduce_n_known=False,
                         epsilon_q=epsilon_q,
                         epsilon_m=epsilon_m,
                         delta=delta,
                         n_states=n_states,
                         max_memory_size=max_mem,
                         prior=0.2,
                         estimate_distances_online=False,
                         min_sampling_probability=p_min,
                         name='LRMax(Dmax0.2)')
    maxqinit = MaxQInit(actions=actions,
                        gamma=gamma,
                        r_max=r_max,
                        v_max=v_max,
                        deduce_v_max=False,
                        n_known=n_known,
                        deduce_n_known=False,
                        epsilon_q=epsilon_q,
                        epsilon_m=epsilon_m,
                        delta=delta,
                        n_states=n_states,
                        min_sampling_probability=p_min,
                        name='MaxQInit')
    lrmaxqinit = LRMaxQInit(actions=actions,
                            gamma=gamma,
                            r_max=r_max,
                            v_max=v_max,
                            deduce_v_max=False,
                            n_known=n_known,
                            deduce_n_known=False,
                            epsilon_q=epsilon_q,
                            epsilon_m=epsilon_m,
                            delta=delta,
                            n_states=n_states,
                            max_memory_size=max_mem,
                            prior=None,
                            estimate_distances_online=True,
                            min_sampling_probability=p_min,
                            name='LRMaxQInit')
    lrmaxqinitprior02 = LRMaxQInit(actions=actions,
                                   gamma=gamma,
                                   r_max=r_max,
                                   v_max=v_max,
                                   deduce_v_max=False,
                                   n_known=n_known,
                                   deduce_n_known=False,
                                   epsilon_q=epsilon_q,
                                   epsilon_m=epsilon_m,
                                   delta=delta,
                                   n_states=n_states,
                                   max_memory_size=max_mem,
                                   prior=0.2,
                                   estimate_distances_online=True,
                                   min_sampling_probability=p_min,
                                   name='LRMaxQInit(Dmax0.2)')
    agents_pool = [
        rmax, lrmax, lrmaxprior02, maxqinit, lrmaxqinit, lrmaxqinitprior02
    ]

    # Run
    run_agents_lifelong(agents_pool,
                        env_distribution,
                        name_identifier=None,
                        n_instances=1,
                        n_tasks=100,
                        n_episodes=100,
                        n_steps=13,
                        reset_at_terminal=False,
                        open_plot=False,
                        plot_title=True,
                        do_run=True,
                        do_plot=True,
                        parallel_run=True,
                        n_processes=None)
Example 15
    def act(self, s, r):
        """Increment the time step counter, then delegate to LRMax."""
        self.time_step += 1
        return LRMax.act(self, s, r)
Example 16
    def reset(self):
        """Reset the time step records, then apply the parent reset."""
        self.time_step = 0
        self.time_step_counter = []
        LRMax.reset(self)
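Putting the pieces together: a hypothetical driver loop illustrating how act(s, r) receives the reward of the previous transition and how reset() is called between tasks. All environment-interface names here are assumptions, not the actual API:

# Hypothetical driver loop (interface names are assumptions):
for task in range(n_tasks):
    env = env_distribution.sample()  # assumed task-sampling method
    for episode in range(n_episodes):
        s, r = env.init_state(), 0.  # assumed environment interface
        for step in range(n_steps):
            a = agent.act(s, r)  # reward of the previous transition
            s, r = env.step(s, a)  # assumed environment step
    agent.reset()  # clear per-task state between tasks (instances)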