Example #1
    def __init__(self, algo, pol, phi, update_params=dict()):
        self.algo = algo
        self.pol = pol
        if phi is None:
            self.phi = lambda x: x
        else:
            self.phi = phi
        # default parameter functions to use for updates
        self.param_funcs = {k: parametric.to_parameter(v)
                            for k, v in update_params.items()}
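
`parametric.to_parameter` is not shown on this page. Judging from how the resulting parameters are called in the other examples here (`param(s)` in `stepwise_params`, `func(s, a, sp)` in `run_policy_verbose`), it presumably turns either a constant or a callable into a uniform callable. A minimal sketch of that assumption:

# assumed stand-in for `parametric.to_parameter`, not the library's real code
def to_parameter(value):
    # callables (e.g. state-dependent functions) are used as-is; constants are
    # wrapped so they can be called with any arguments, e.g. `param(s)` or
    # `param(s, a, sp)` as in the later examples
    if callable(value):
        return value
    return lambda *args: value
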
Example #2
    def __init__(self, algo, target, behavior, phi=None, update_params=dict()):
        self.algo = algo
        self.target = target
        self.behavior = behavior

        # set the feature function
        if phi is None:
            self.phi = lambda x: x
        else:
            self.phi = phi

        # default parameters to use for updating
        self.param_funcs = {k: parametric.to_parameter(v)
                            for k, v in update_params.items()}
Example #3
    def __init__(self, algo, behavior, phi=None, update_params=dict()):
        self.algo = algo
        self.behavior = behavior

        # set the feature function
        if phi is None:
            self.phi = lambda x: x
        else:
            self.phi = phi

        # default parameters to use for updating
        self.param_funcs = {k: parametric.to_parameter(v)
                            for k, v in update_params.items()}

        # in the on-policy setting, `rho` is always equal to one.
        self.rho = 1
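
Example #2 keeps separate `target` and `behavior` policies (the off-policy case), while this example is on-policy, which is why it can fix `rho = 1`. Off-policy, `rho` is the per-step importance-sampling ratio between the two policies. The policy interface is not shown on this page, so the `prob` method below is an assumption used only for illustration:

# hypothetical sketch, not library code: per-step importance-sampling ratio
def importance_ratio(target, behavior, s, a):
    # `prob(s, a)` is an assumed policy method giving the probability of
    # choosing action `a` in state `s` under that policy
    return target.prob(s, a) / behavior.prob(s, a)
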
Example #4
def stepwise_return(lst, gamma):
    """Compute the return at each step in a trajectory.

    Works backwards from the end of the trajectory, using the fact that the
    return at each step is the immediate reward plus the discounted return
    from the next step.
    """
    # convert gamma to a state-dependent parameter
    gamma = to_parameter(gamma)
    rewards = get_rewards(lst)
    gmlst = get_gammas(lst, gamma)
    ret = []
    tmp = 0
    # accumulate backwards: G_t = r_t + gamma(s_t) * G_{t+1}
    for r, gm in reversed(list(zip(rewards, gmlst))):
        tmp *= gm
        tmp += r
        ret.append(tmp)
    return list(reversed(ret))
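
To see the backward recursion concretely, here is a small hand-worked check that mirrors the loop above on a bare reward list with a constant discount (it avoids the `get_rewards`/`get_gammas` helpers so it runs on its own):

rewards = [1.0, 0.0, 2.0]   # r_0, r_1, r_2
gamma = 0.5

returns = []
tmp = 0.0
for r in reversed(rewards):
    tmp = r + gamma * tmp
    returns.append(tmp)
returns = list(reversed(returns))

# G_2 = 2.0
# G_1 = 0.0 + 0.5 * 2.0 = 1.0
# G_0 = 1.0 + 0.5 * 1.0 = 1.5
assert returns == [1.5, 1.0, 2.0]
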
Example #5
def run_policy_verbose(pol, env, max_steps, param_funcs=dict()):
    """Run a policy in an environment for a specified number of steps.

    Records each step's entire context, optionally including the values of the
    supplied parameter functions at that point in time, so that online
    algorithms can later be run offline from the logged transitions.
    """
    ret = []
    t = 0

    # convert parameter functions to `Parameter` type, if needed
    param_funcs = {k: to_parameter(v) for k, v in param_funcs.items()}

    # reset the environment and get initial state
    env.reset()
    s = env.state
    while not env.is_terminal() and t < max_steps:
        # record the context of the time step
        actions = env.actions
        a = pol.choose(s, actions)
        r, sp = env.do(a)

        # record the transition information
        ctx = {'s': s, 'a': a, 'r': r, 'sp': sp, 'actions': actions}

        # record values of parameters for the transition
        for name, func in param_funcs.items():
            ctx[name] = func(s, a, sp)

        # log the context of the transition
        ret.append(ctx)

        # prepare for next iteration
        s = sp
        t += 1
    return ret
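
The environment and policy objects are not defined on this page; the loop above only assumes `env.reset()`, `env.state`, `env.is_terminal()`, `env.actions`, `env.do(a)` returning `(reward, next_state)`, and `pol.choose(s, actions)`. A minimal sketch of hypothetical stand-ins with that interface, just to show how `run_policy_verbose` might be called (it also assumes `to_parameter` accepts a constant such as 0.9):

import random

class ChainEnv:
    """Hypothetical 5-state chain: move 'left'/'right'; reward 1 at the end."""
    def __init__(self, length=5):
        self.length = length
        self.state = 0

    def reset(self):
        self.state = 0

    def is_terminal(self):
        return self.state == self.length - 1

    @property
    def actions(self):
        return ['left', 'right']

    def do(self, a):
        self.state = max(self.state - 1, 0) if a == 'left' else self.state + 1
        reward = 1.0 if self.is_terminal() else 0.0
        return reward, self.state

class RandomPolicy:
    def choose(self, s, actions):
        return random.choice(actions)

# log a short trajectory, also recording a constant discount at every step
steps = run_policy_verbose(RandomPolicy(), ChainEnv(), max_steps=20,
                           param_funcs={'gamma': 0.9})
print(steps[0])  # e.g. {'s': 0, 'a': 'right', 'r': 0.0, 'sp': 1, ..., 'gamma': 0.9}
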
Example #6
def stepwise_params(lst, param):
    """Evaluate a (possibly constant) parameter at each state in a trajectory."""
    # convert `param` to a state-dependent parameter
    param = to_parameter(param)
    return [param(s) for s in get_states(lst)]
Example #7
def get_gammas(lst, gamma):
    # convert gamma to state-dependent parameter
    gamma = to_parameter(gamma)
    return [gamma(s) for s in get_states(lst)]
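
`get_states` and `get_rewards` are used throughout these examples but not shown. Given the context dictionaries built by `run_policy_verbose` (keys 's', 'a', 'r', 'sp', 'actions'), they presumably just pull the corresponding field out of every step; a sketch of that assumption:

# assumed stand-ins, not the library's real code
def get_states(lst):
    # the state observed at each step of the logged trajectory
    return [ctx['s'] for ctx in lst]

def get_rewards(lst):
    # the reward received at each step of the logged trajectory
    return [ctx['r'] for ctx in lst]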