def __init__(self, algo, pol, phi=None, update_params=dict()):
    self.algo = algo
    self.pol = pol
    # set the feature function
    if phi is None:
        self.phi = lambda x: x
    else:
        self.phi = phi
    # default parameter functions to use for updates
    self.param_funcs = {k: parametric.to_parameter(v)
                        for k, v in update_params.items()}
def __init__(self, algo, target, behavior, phi=None, update_params=dict()):
    self.algo = algo
    self.target = target
    self.behavior = behavior
    # set the feature function
    if phi is None:
        self.phi = lambda x: x
    else:
        self.phi = phi
    # default parameters to use for updating
    self.param_funcs = {k: parametric.to_parameter(v)
                        for k, v in update_params.items()}
def __init__(self, algo, behavior, phi=None, update_params=dict()):
    self.algo = algo
    self.behavior = behavior
    # set the feature function
    if phi is None:
        self.phi = lambda x: x
    else:
        self.phi = phi
    # default parameters to use for updating
    self.param_funcs = {k: parametric.to_parameter(v)
                        for k, v in update_params.items()}
    # in the on-policy setting, `rho` is always equal to one
    self.rho = 1
def stepwise_return(lst, gamma):
    """Compute the return at each step of a trajectory.

    Uses the fact that the return at each step, working backwards from the
    end of the trajectory, is the immediate reward plus the discounted
    return from the next state.
    """
    # convert gamma to a state-dependent parameter
    gamma = to_parameter(gamma)
    rewards = get_rewards(lst)
    gmlst = get_gammas(lst, gamma)
    # accumulate returns backwards from the end of the trajectory
    ret = []
    tmp = 0
    for r, gm in reversed(list(zip(rewards, gmlst))):
        tmp *= gm
        tmp += r
        ret.append(tmp)
    return list(reversed(ret))
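# A small worked example of `stepwise_return` (a sketch, assuming `get_rewards`
# and `get_states` read the 'r' and 's' keys of the context dicts produced by
# `run_policy_verbose`): with rewards [1, 1, 1] and a constant gamma of 0.5,
# the backwards recursion gives G_2 = 1, then G_1 = 1 + 0.5*1 = 1.5, and
# finally G_0 = 1 + 0.5*1.5 = 1.75.
#
#     traj = [{'s': 0, 'r': 1.0}, {'s': 1, 'r': 1.0}, {'s': 2, 'r': 1.0}]
#     stepwise_return(traj, 0.5)  # --> [1.75, 1.5, 1.0]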
def run_policy_verbose(pol, env, max_steps, param_funcs=dict()):
    """Run a policy in an environment for a specified number of steps.

    Provides enough information to run the online algorithms offline by
    recording each step's entire context, potentially including the values
    of parameter functions at each point in time.
    """
    ret = []
    t = 0
    # convert parameter functions to `Parameter` type, if needed
    param_funcs = {k: to_parameter(v) for k, v in param_funcs.items()}
    # reset the environment and get the initial state
    env.reset()
    s = env.state
    while not env.is_terminal() and t < max_steps:
        # record the context of the time step
        actions = env.actions
        a = pol.choose(s, actions)
        r, sp = env.do(a)
        # record the transition information
        ctx = {'s': s, 'a': a, 'r': r, 'sp': sp, 'actions': actions}
        # record values of parameters for the transition
        for name, func in param_funcs.items():
            ctx[name] = func(s, a, sp)
        # log the context of the transition
        ret.append(ctx)
        # prepare for the next iteration
        s = sp
        t += 1
    return ret
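# A hedged usage sketch for `run_policy_verbose`. It assumes only what the
# function itself requires: a policy with a `choose(s, actions)` method and an
# environment exposing `reset()`, `state`, `actions`, `is_terminal()`, and
# `do(a)`. Parameter values supplied via `param_funcs` (here a constant
# `gamma`) are recorded in each context dict alongside the transition:
#
#     traj = run_policy_verbose(pol, env, max_steps=1000,
#                               param_funcs={'gamma': 0.99})
#     traj[0]
#     # {'s': ..., 'a': ..., 'r': ..., 'sp': ..., 'actions': [...], 'gamma': 0.99}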
def stepwise_params(lst, param):
    """Evaluate a parameter function at each state in a trajectory."""
    # convert to a state-dependent parameter, if needed
    param = to_parameter(param)
    return [param(s) for s in get_states(lst)]
def get_gammas(lst, gamma):
    """Evaluate the discount parameter at each state in a trajectory."""
    # convert gamma to a state-dependent parameter
    gamma = to_parameter(gamma)
    return [gamma(s) for s in get_states(lst)]
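# Both constants and callables should work anywhere a parameter is expected,
# assuming `to_parameter` wraps constants into functions that ignore their
# arguments (consistent with its use above, where parameters are called with
# one argument in `get_gammas` but three in `run_policy_verbose`). `TERMINAL`
# below is a hypothetical sentinel state used only for illustration:
#
#     get_gammas(traj, 0.9)                               # fixed discount rate
#     get_gammas(traj, lambda s: 0.9 * (s != TERMINAL))   # state-dependent discount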