Example 1
def td_control(transitions: Iterable[mdp.TransitionStep[S, A]],
               actions: Callable[[S], Iterable[A]],
               approx_0: FunctionApprox[Tuple[S, A]],
               γ: float) -> Iterator[FunctionApprox[Tuple[S, A]]]:
    '''Return policies that try to maximize the reward based on the given
    set of experiences.

    Arguments:
      transitions -- a sequence of state, action, reward, state (S, A, R, S')
      actions -- a function returning the possible actions for a given state
      approx_0 -- initial approximation of q function
      γ -- discount rate (0 < γ ≤ 1)

    Returns:
      an iterator of approximations of the q function based on the
      transitions given as input

    '''
    def step(
            q: FunctionApprox[Tuple[S, A]],
            transition: mdp.TransitionStep[S, A]
    ) -> FunctionApprox[Tuple[S, A]]:
        next_reward = max(
            q((transition.next_state, a))
            for a in actions(transition.next_state))
        return q.update([((transition.state, transition.action),
                          transition.reward + γ * next_reward)])

    return iterate.accumulate(transitions, step, initial=approx_0)
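
All of these examples share the same shape: a step function is folded over a stream of transitions with iterate.accumulate, which (assuming it behaves like itertools.accumulate with an explicit initial value) lazily yields approx_0 first and then one updated approximation per transition. A minimal standard-library-only sketch of that fold, with a plain dict standing in for the FunctionApprox; the transitions, the 0.9 discount and the 0.1 step size below are made-up assumptions:

import itertools
from typing import Dict, Tuple

# Toy transitions (state, action, reward, next_state).
transitions = [("s0", "a", 1.0, "s1"), ("s1", "b", 0.0, "s0")]
ACTIONS = ("a", "b")

def step(q: Dict[Tuple[str, str], float],
         tr: Tuple[str, str, float, str]) -> Dict[Tuple[str, str], float]:
    s, a, r, s_next = tr
    target = r + 0.9 * max(q.get((s_next, b), 0.0) for b in ACTIONS)
    updated = dict(q)
    updated[(s, a)] = q.get((s, a), 0.0) + 0.1 * (target - q.get((s, a), 0.0))
    return updated

# Yields the empty table first, then one updated table per transition.
for q in itertools.accumulate(transitions, step, initial={}):
    print(q)
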
Example 2
def q_learning_external_transitions(
        transitions: Iterable[TransitionStep[S, A]],
        actions: Callable[[NonTerminal[S]], Iterable[A]],
        approx_0: QValueFunctionApprox[S, A],
        γ: float) -> Iterator[QValueFunctionApprox[S, A]]:
    '''Return policies that try to maximize the reward based on the given
    set of experiences.

    Arguments:
      transitions -- a sequence of state, action, reward, state (S, A, R, S')
      actions -- a function returning the possible actions for a given state
      approx_0 -- initial approximation of q function
      γ -- discount rate (0 < γ ≤ 1)

    Returns:
      an iterator of approximations of the q function based on the
      transitions given as input

    '''
    def step(q: QValueFunctionApprox[S, A],
             transition: TransitionStep[S, A]) -> QValueFunctionApprox[S, A]:
        next_return: float = max(
            q((transition.next_state, a))
            for a in actions(transition.next_state)
        ) if isinstance(transition.next_state, NonTerminal) else 0.
        return q.update([((transition.state, transition.action),
                          transition.reward + γ * next_return)])

    return iterate.accumulate(transitions, step, initial=approx_0)
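
Written out, the target fed to q.update in step above is the standard Q-learning target; α here stands for whatever step size the QValueFunctionApprox applies internally in update:

    Q(s, a) ← Q(s, a) + α · (r + γ · max_a' Q(s', a') − Q(s, a))

with the max_a' Q(s', a') term taken to be 0 when s' is terminal, which is exactly what the isinstance(transition.next_state, NonTerminal) check implements.
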
Example 3
def returns(trace, γ, tolerance):
    '''Given an iterator of states and rewards, calculate the return of
    the first N states.

    Arguments:
    trace -- an iterator of transition steps (states and instantaneous rewards)
    γ -- the discount factor (0 < γ ≤ 1)
    tolerance -- a small value—we stop iterating once γᵏ ≤ tolerance

    '''
    trace = iter(trace)

    max_steps = round(math.log(tolerance) / math.log(γ)) if γ < 1 else None
    if max_steps is not None:
        trace = itertools.islice(trace, max_steps * 2)

    *transitions, last_transition = list(trace)

    return_steps = iterate.accumulate(
        reversed(transitions),
        func=lambda next, curr: curr.add_return(γ, next.return_),
        initial=last_transition.add_return(γ, 0))
    return_steps = reversed(list(return_steps))

    if max_steps is not None:
        return_steps = itertools.islice(return_steps, max_steps)

    return return_steps
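
A small numeric check of the truncation logic above (the γ and tolerance values are made up): max_steps is where γᵏ first drops to the tolerance, and the return itself is just the back-to-front discounted sum that add_return accumulates.

import math

γ, tolerance = 0.9, 1e-6
max_steps = round(math.log(tolerance) / math.log(γ))
print(max_steps)  # 131: 0.9**131 ≈ 1e-6

# Discounted return over a short, already-truncated reward sequence,
# folded back to front the way returns() chains add_return.
rewards = [1.0, 2.0, 3.0]
G = 0.0
for r in reversed(rewards):
    G = r + γ * G
print(G)  # 1 + 0.9*2 + 0.81*3 ≈ 5.23
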
Example 4
def td_prediction(
    transitions: Iterable[mp.TransitionStep[S]],
    approx_0: FunctionApprox[S],
    γ: float,
) -> Iterator[FunctionApprox[S]]:
    """Evaluate an MRP using TD(0) using the given sequence of
    transitions.

    Each value this function yields represents the approximated value
    function for the MRP after an additional transition.

    Arguments:
      transitions -- a sequence of transitions from an MRP which don't
                     have to be in order or from the same simulation
      approx_0 -- initial approximation of value function
      γ -- discount rate (0 < γ ≤ 1)

    """

    def step(
        v: FunctionApprox[S], transition: mp.TransitionStep[S]
    ) -> FunctionApprox[S]:
        return v.update(
            [(transition.state, transition.reward + γ * v(transition.next_state))]
        )

    return iterate.accumulate(transitions, step, initial=approx_0)
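
The corresponding TD(0) prediction update, with α again standing for the step size the FunctionApprox applies inside update, is

    V(s) ← V(s) + α · (r + γ · V(s') − V(s))

and each element the returned iterator yields is the value approximation after one such update.
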
Example 5
def iterate_updates(
        self: F,
        xy_seq_stream: Iterator[Iterable[Tuple[X, float]]]) -> Iterator[F]:
    '''Given a stream (Iterator) of data sets of (x, y) pairs, perform a
    series of incremental updates to the internal parameters (using the
    update method), one update per data set of (x, y) pairs in
    xy_seq_stream.
    '''
    return iterate.accumulate(xy_seq_stream,
                              lambda fa, xy: fa.update(xy),
                              initial=self)
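
A usage sketch of the same fold with a toy immutable "approximator" standing in for the real FunctionApprox; RunningMean and the example batches are made up purely for illustration:

import itertools
from dataclasses import dataclass
from typing import Iterable, Tuple

@dataclass(frozen=True)
class RunningMean:
    '''Keeps a running sum/count of the y targets; update returns a new
    object, mirroring the immutable-update style of FunctionApprox.'''
    total: float = 0.0
    count: int = 0

    def update(self, xy: Iterable[Tuple[str, float]]) -> "RunningMean":
        ys = [y for _, y in xy]
        return RunningMean(self.total + sum(ys), self.count + len(ys))

batches = [[("a", 1.0), ("b", 3.0)], [("c", 5.0)]]
# One accumulated approximation per batch, plus the initial one.
for fa in itertools.accumulate(batches,
                               lambda fa, xy: fa.update(xy),
                               initial=RunningMean()):
    print(fa.count, fa.total)
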
Example 6
def qlearning_control_fapprox(
        transitions: Iterable[TransitionStep[S, A]],
        actions: Callable[[S], Iterable[A]],
        approx_0: FunctionApprox[Tuple[S, A]],
        γ: float) -> Iterator[FunctionApprox[Tuple[S, A]]]:
    '''Q-Learning control with function approximation: yield one updated
    approximation of the q function per input transition.'''
    def step(q, transition):
        # Greedy bootstrap: the largest estimated q-value over the actions
        # available in the next state.
        next_return = max(
            q((transition.next_state, a))
            for a in actions(transition.next_state))
        return q.update([((transition.state, transition.action),
                          transition.reward + γ * next_return)])

    return iterate.accumulate(transitions, step, initial=approx_0)
Example 7
def sarsa_control_fapprox(transitions: Iterable[TransitionStep[S, A]],
                          actions: Callable[[S], Iterable[A]],
                          approx_0: FunctionApprox[Tuple[S, A]], γ: float,
                          eps: float) -> Iterator[FunctionApprox[Tuple[S, A]]]:
    def step(q, transition):
        next_actions = list(actions(transition.next_state))
        if np.random.random() > eps:
            # Exploit: bootstrap with the greedy q-value in the next state.
            next_return = max(
                q((transition.next_state, a)) for a in next_actions)
        else:
            # Explore: bootstrap with the q-value of a random next action.
            next_action = next_actions[np.random.randint(len(next_actions))]
            next_return = q((transition.next_state, next_action))
        return q.update([((transition.state, transition.action),
                          transition.reward + γ * next_return)])

    return iterate.accumulate(transitions, step, initial=approx_0)
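
The exploration branch in step is ε-greedy action selection. Isolated as a small stand-alone helper (the function name and the toy values in the usage line are illustrative assumptions, not part of the example above):

import numpy as np
from typing import Sequence, TypeVar

A = TypeVar('A')

def epsilon_greedy_action(q_values: Sequence[float],
                          actions: Sequence[A],
                          eps: float) -> A:
    # With probability eps pick a uniformly random action (explore),
    # otherwise pick an action with the largest estimated q-value (exploit).
    if np.random.random() < eps:
        return actions[np.random.randint(len(actions))]
    return actions[int(np.argmax(q_values))]

print(epsilon_greedy_action([0.1, 0.5, 0.2], ['left', 'right', 'stay'], eps=0.1))
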
Example 8
def batch_td_prediction(
        transitions: Iterable[mp.TransitionStep[S]],
        approx_0: ValueFunctionApprox[S],
        γ: float,
        convergence_tolerance: float = 1e-5) -> ValueFunctionApprox[S]:
    '''transitions must be a finite iterable: it is materialized and swept
    over repeatedly until the value approximation converges.'''
    def step(v: ValueFunctionApprox[S],
             tr_seq: Sequence[mp.TransitionStep[S]]) -> ValueFunctionApprox[S]:
        return v.update([(tr.state,
                          tr.reward + γ * extended_vf(v, tr.next_state))
                         for tr in tr_seq])

    def done(a: ValueFunctionApprox[S],
             b: ValueFunctionApprox[S],
             convergence_tolerance=convergence_tolerance) -> bool:
        return b.within(a, convergence_tolerance)

    return iterate.converged(
        iterate.accumulate(
            itertools.repeat(list(transitions)), step, initial=approx_0),
        done=done)
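
A minimal sketch of the same pattern with plain dicts: sweep repeatedly over the full (finite) batch of transitions, updating with a fixed step size, until two successive value tables agree within the tolerance. The two-state transitions, γ, α and the tolerance are all made-up assumptions, not part of the example above.

from typing import Dict, List, Tuple

transitions: List[Tuple[str, float, str]] = [("s0", 1.0, "s1"),
                                             ("s1", 0.0, "s0")]
γ, α, tol = 0.9, 0.1, 1e-5

v: Dict[str, float] = {"s0": 0.0, "s1": 0.0}
while True:
    # One sweep over the whole batch; targets use the previous sweep's values.
    v_new = dict(v)
    for s, r, s_next in transitions:
        v_new[s] = v_new[s] + α * (r + γ * v[s_next] - v_new[s])
    if max(abs(v_new[s] - v[s]) for s in v) < tol:
        v = v_new
        break
    v = v_new
print(v)  # approaches the fixed point v(s0) ≈ 5.26, v(s1) ≈ 4.74
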
Example 9
def evaluate_mrp_funapprox_bootstrap(
        transitions: Iterable[mp.TransitionStep[S]],
        approx_0: FunctionApprox[S],
        γ: float,
        n: int) -> Iterator[FunctionApprox[S]]:
    '''
    n-Step Bootstrapping Prediction
    for the Function Approximation case
    '''
    tolerance: float = γ**n  # so that roughly n rewards enter each return
    # Materialize the return steps once: they are both iterated over and
    # indexed into (n steps ahead) inside step.
    bootstrap_return_steps: Sequence[mp.ReturnStep] = list(
        returns(transitions, γ, tolerance))

    def step(v, indexed_return_step):
        index, return_step = indexed_return_step
        # Bootstrap from the value of the state reached n steps later,
        # if the trace extends that far; otherwise use the return alone.
        bootstrap = 0.
        if index + n < len(bootstrap_return_steps):
            step_n = bootstrap_return_steps[index + n]
            bootstrap = γ**n * v(step_n.next_state)
        return v.update([(return_step.state, return_step.return_ + bootstrap)])

    return iterate.accumulate(enumerate(bootstrap_return_steps),
                              step,
                              initial=approx_0)
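
For reference, the n-step target this approximates is G = R₁ + γ·R₂ + … + γⁿ⁻¹·Rₙ + γⁿ·V(Sₙ). A quick numeric check with made-up numbers (n = 2, γ = 0.9, two rewards and a bootstrap value of 10):

γ, rewards, v_boot = 0.9, [1.0, 2.0], 10.0
target = rewards[0] + γ * rewards[1] + γ**2 * v_boot
print(target)  # 1 + 1.8 + 8.1 ≈ 10.9
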