def td_control(transitions: Iterable[mdp.TransitionStep[S, A]],
               actions: Callable[[S], Iterable[A]],
               approx_0: FunctionApprox[Tuple[S, A]],
               γ: float) -> Iterator[FunctionApprox[Tuple[S, A]]]:
    '''Return policies that try to maximize the reward based on the given
    set of experiences.

    Arguments:
      transitions -- a sequence of state, action, reward, state (S, A, R, S')
      actions -- a function returning the possible actions for a given state
      approx_0 -- initial approximation of q function
      γ -- discount rate (0 < γ ≤ 1)

    Returns:
      an iterator of approximations of the q function based on the
      transitions given as input
    '''
    def step(
            q: FunctionApprox[Tuple[S, A]],
            transition: mdp.TransitionStep[S, A]
    ) -> FunctionApprox[Tuple[S, A]]:
        next_reward = max(
            q((transition.next_state, a))
            for a in actions(transition.next_state)
        )
        return q.update([
            ((transition.state, transition.action),
             transition.reward + γ * next_reward)
        ])

    return iterate.accumulate(transitions, step, initial=approx_0)
def q_learning_external_transitions(
        transitions: Iterable[TransitionStep[S, A]],
        actions: Callable[[NonTerminal[S]], Iterable[A]],
        approx_0: QValueFunctionApprox[S, A],
        γ: float
) -> Iterator[QValueFunctionApprox[S, A]]:
    '''Return policies that try to maximize the reward based on the given
    set of experiences.

    Arguments:
      transitions -- a sequence of state, action, reward, state (S, A, R, S')
      actions -- a function returning the possible actions for a given state
      approx_0 -- initial approximation of q function
      γ -- discount rate (0 < γ ≤ 1)

    Returns:
      an iterator of approximations of the q function based on the
      transitions given as input
    '''
    def step(
            q: QValueFunctionApprox[S, A],
            transition: TransitionStep[S, A]
    ) -> QValueFunctionApprox[S, A]:
        # Bootstrap with the greedy (max over actions) q-value at the next
        # state; terminal next states contribute zero future return.
        next_return: float = max(
            q((transition.next_state, a))
            for a in actions(transition.next_state)
        ) if isinstance(transition.next_state, NonTerminal) else 0.
        return q.update([
            ((transition.state, transition.action),
             transition.reward + γ * next_return)
        ])

    return iterate.accumulate(transitions, step, initial=approx_0)
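# Usage sketch for q_learning_external_transitions (an illustration, not part
# of the original listing). It assumes the TransitionStep, NonTerminal and
# Tabular names referenced above are in scope and that TransitionStep accepts
# the keyword arguments shown; adapt to the actual constructors in your setup.
experience = [
    TransitionStep(state=NonTerminal('s0'), action='a1',
                   next_state=NonTerminal('s1'), reward=1.0),
    TransitionStep(state=NonTerminal('s1'), action='a0',
                   next_state=NonTerminal('s0'), reward=0.0),
]
q_iter = q_learning_external_transitions(
    transitions=experience,
    actions=lambda s: ['a0', 'a1'],   # same two actions in every state
    approx_0=Tabular(),               # assumed default tabular approximation
    γ=0.9
)
*_, q_final = q_iter                  # q-approximation after the last transition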
def returns(trace, γ, tolerance):
    '''Given an iterator of transition steps (states and rewards), calculate
    the return of the first N states.

    Arguments:
      trace -- an iterator of transition steps carrying instantaneous rewards
      γ -- the discount factor (0 < γ ≤ 1)
      tolerance -- a small value—we stop iterating once γᵏ ≤ tolerance
    '''
    trace = iter(trace)

    max_steps = round(math.log(tolerance) / math.log(γ)) if γ < 1 else None
    if max_steps is not None:
        trace = itertools.islice(trace, max_steps * 2)

    *transitions, last_transition = list(trace)

    return_steps = iterate.accumulate(
        reversed(transitions),
        func=lambda next, curr: curr.add_return(γ, next.return_),
        initial=last_transition.add_return(γ, 0)
    )
    return_steps = reversed(list(return_steps))

    if max_steps is not None:
        return_steps = itertools.islice(return_steps, max_steps)

    return return_steps
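# A small self-contained check (not from the original code) of the truncation
# rule used above: with γ < 1, rewards beyond k = log(tolerance) / log(γ)
# steps are weighted by γᵏ ≤ tolerance, so the trace can safely be cut there.
import math

γ_demo, tol_demo = 0.9, 1e-6
max_steps_demo = round(math.log(tol_demo) / math.log(γ_demo))   # -> 131
# γ_demo ** max_steps_demo ≈ 1.0e-6, so rewards beyond this point are negligible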
def td_prediction(
        transitions: Iterable[mp.TransitionStep[S]],
        approx_0: FunctionApprox[S],
        γ: float,
) -> Iterator[FunctionApprox[S]]:
    """Evaluate an MRP using TD(0) with the given sequence of transitions.

    Each value this function yields represents the approximated value
    function for the MRP after an additional transition.

    Arguments:
      transitions -- a sequence of transitions from an MRP which don't
                     have to be in order or from the same simulation
      approx_0 -- initial approximation of value function
      γ -- discount rate (0 < γ ≤ 1)
    """
    def step(
            v: FunctionApprox[S],
            transition: mp.TransitionStep[S]
    ) -> FunctionApprox[S]:
        return v.update([
            (transition.state,
             transition.reward + γ * v(transition.next_state))
        ])

    return iterate.accumulate(transitions, step, initial=approx_0)
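# For intuition (a sketch, not part of the library code): in the tabular
# special case, the v.update(...) call above reduces to the classic TD(0) rule
#     V(s) <- V(s) + α · (r + γ·V(s') − V(s))
# with α the (possibly count-dependent) learning rate used by the approximation.
def tabular_td0_update(v: dict, s, r: float, s_next, γ: float, α: float) -> None:
    # Move V(s) a fraction α of the way towards the bootstrapped target
    target = r + γ * v.get(s_next, 0.0)
    v[s] = v.get(s, 0.0) + α * (target - v.get(s, 0.0))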
def iterate_updates(
        self: F,
        xy_seq_stream: Iterator[Iterable[Tuple[X, float]]]
) -> Iterator[F]:
    '''Given a stream (Iterator) of data sets of (x, y) pairs, perform a
    series of incremental updates to the internal parameters (using the
    update method), one parameter update per data set of (x, y) pairs in
    the input stream xy_seq_stream
    '''
    return iterate.accumulate(
        xy_seq_stream,
        lambda fa, xy: fa.update(xy),
        initial=self
    )
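# iterate.accumulate is used here with the semantics of itertools.accumulate
# plus an `initial` value: it yields the running accumulation after each
# element. A plain-integer sketch of that idea (assumes Python 3.8+ for the
# initial= keyword of itertools.accumulate):
import itertools

running = itertools.accumulate([1, 2, 3], lambda acc, x: acc + x, initial=10)
assert list(running) == [10, 11, 13, 16]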
def qlearning_control_fapprox(
        transitions: Iterable[TransitionStep[S, A]],
        actions: Callable[[S], Iterable[A]],
        approx_0: FunctionApprox[Tuple[S, A]],
        γ: float
) -> Iterator[FunctionApprox[Tuple[S, A]]]:
    '''Q-Learning control with function approximation: for each experienced
    transition, update the q-value approximation towards the reward plus the
    discounted greedy (max over actions) q-value at the next state.
    '''
    def step(q, transition):
        next_reward = max(
            q((transition.next_state, a))
            for a in actions(transition.next_state)
        )
        return q.update([
            ((transition.state, transition.action),
             transition.reward + γ * next_reward)
        ])

    return iterate.accumulate(transitions, step, initial=approx_0)
def sarsa_control_fapprox(
        transitions: Iterable[TransitionStep[S, A]],
        actions: Callable[[S], Iterable[A]],
        approx_0: FunctionApprox[Tuple[S, A]],
        γ: float,
        eps: float
) -> Iterator[FunctionApprox[Tuple[S, A]]]:
    '''SARSA control with function approximation: bootstrap with the q-value
    of an ε-greedy next action (greedy with probability 1 - eps, uniformly
    random otherwise).
    '''
    def step(q, transition):
        if np.random.random() > eps:
            # Exploit: bootstrap with the maximal q-value at the next state
            next_reward = max(
                q((transition.next_state, a))
                for a in actions(transition.next_state)
            )
        else:
            # Explore: bootstrap with the q-value of a uniformly random action
            next_actions = list(actions(transition.next_state))
            next_action = next_actions[np.random.randint(len(next_actions))]
            next_reward = q((transition.next_state, next_action))
        return q.update([
            ((transition.state, transition.action),
             transition.reward + γ * next_reward)
        ])

    return iterate.accumulate(transitions, step, initial=approx_0)
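# Note on the bootstrap above: the next action is resampled ε-greedily at
# update time rather than taken from the next action actually recorded in the
# data. A standalone ε-greedy selector over a finite action list (illustrative
# sketch only; q_values is a hypothetical per-state mapping from actions to
# estimated q-values):
import numpy as np

def epsilon_greedy_action(q_values: dict, acts: list, eps: float):
    if np.random.random() > eps:
        return max(acts, key=lambda a: q_values[a])   # exploit
    return acts[np.random.randint(len(acts))]         # explore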
def batch_td_prediction(
        transitions: Iterable[mp.TransitionStep[S]],
        approx_0: ValueFunctionApprox[S],
        γ: float,
        convergence_tolerance: float = 1e-5
) -> ValueFunctionApprox[S]:
    '''Batch TD(0) prediction: repeatedly sweep over the finite iterable of
    transitions, updating the value function approximation on the whole
    batch, until successive approximations are within the given tolerance.
    '''
    def step(
            v: ValueFunctionApprox[S],
            tr_seq: Sequence[mp.TransitionStep[S]]
    ) -> ValueFunctionApprox[S]:
        return v.update([
            (tr.state, tr.reward + γ * extended_vf(v, tr.next_state))
            for tr in tr_seq
        ])

    def done(
            a: ValueFunctionApprox[S],
            b: ValueFunctionApprox[S],
            convergence_tolerance=convergence_tolerance
    ) -> bool:
        return b.within(a, convergence_tolerance)

    return iterate.converged(
        iterate.accumulate(
            itertools.repeat(list(transitions)),
            step,
            initial=approx_0
        ),
        done=done
    )
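# Unlike td_prediction above, batch_td_prediction returns a single converged
# approximation rather than an iterator. A usage sketch (Tabular and the
# source of the transition data are assumptions, mirroring the earlier
# example):
def converged_value_function(mrp_transitions, γ: float = 0.9):
    # mrp_transitions: a finite list of mp.TransitionStep[S] collected elsewhere
    return batch_td_prediction(mrp_transitions, Tabular(), γ,
                               convergence_tolerance=1e-5)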
def evaluate_mrp_funapprox_bootstrap(
        transitions: Iterable[mp.TransitionStep[S]],
        approx_0: FunctionApprox[S],
        γ: float,
        n: int
) -> Iterator[FunctionApprox[S]]:
    '''n-Step Bootstrapping Prediction for the Function Approximation case'''
    tolerance: float = γ**n  # in order to include n rewards in each bootstrap return
    # Materialize the return steps so they can be looked up n steps ahead
    # without consuming the same iterator twice.
    bootstrap_return_steps: Sequence[mp.ReturnStep] = list(
        returns(transitions, γ, tolerance))
    bootstr_return_steps_indexed = zip(itertools.count(),
                                       bootstrap_return_steps)

    def step(v, indexed_return_step):
        index, return_step = indexed_return_step
        # The return step n transitions ahead supplies the bootstrap state;
        # past the end of the data there is nothing left to bootstrap from.
        step_n = bootstrap_return_steps[index + n] \
            if index + n < len(bootstrap_return_steps) else None
        bootstrap = γ**n * v(step_n.next_state) if step_n is not None else 0.
        return v.update([(return_step.state, return_step.return_ + bootstrap)])

    return iterate.accumulate(bootstr_return_steps_indexed, step,
                              initial=approx_0)
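# The target being formed above is the standard n-step return (stated here for
# reference, not taken from the original code):
#     G_{t:t+n} = R_{t+1} + γ·R_{t+2} + … + γ**(n-1)·R_{t+n} + γ**n·V(S_{t+n})
# A tiny self-contained helper computing that target from the next n rewards
# and a bootstrapped value of the state reached after n steps:
def n_step_target(rewards_ahead: list, v_bootstrap: float, γ: float) -> float:
    n = len(rewards_ahead)
    discounted_rewards = sum(γ**k * r for k, r in enumerate(rewards_ahead))
    return discounted_rewards + γ**n * v_bootstrap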