Example 1
import random


def policySARSA(mdp, epsilon, alpha, iterNum, maxWalkLen=100, echoSE=False):
    """
    Actor-critic SARSA with function approximation: the actor updates the
    softmax policy, the critic updates the action-value estimate.
    """

    InitParameter = 0.1

    if echoSE:
        squareErrors = []

    policy = SoftmaxPolicy(mdp)
    valuePolicy = Policy(mdp)

    for i in range(len(policy.parameters)):
        policy.parameters[i] = InitParameter
        valuePolicy.parameters[i] = 0.0

    for _ in range(iterNum):
        if echoSE:
            squareErrors.append(getSquareErrorPolicy(valuePolicy))

        state = random.choice(mdp.states)
        sFeature = mdp.getFeature(state)
        action = random.choice(mdp.actions)
        isTerminal = False

        count = 0
        while not isTerminal and count < maxWalkLen:
            isTerminal, nextState, reward, nextSFeature = mdp.transform(state, action)
            nextAction = policy.epsilonGreedy(nextSFeature, epsilon)

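            # Critic: move q(s, a) toward the one-step SARSA target
            # r + gamma * q(s', a').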
            valuePolicy.update(sFeature, action,
                               reward + mdp.gamma * valuePolicy.qFunc(nextSFeature, nextAction), alpha)
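            # Actor: update the softmax policy using the critic's current
            # estimate of q(s, a) as the score for the taken action.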
            policy.update(sFeature, action, valuePolicy.qFunc(sFeature, action), alpha)

            state = nextState
            sFeature = nextSFeature
            action = nextAction
            count += 1

    if echoSE:
        return policy, squareErrors
    else:
        return policy
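
A minimal usage sketch for policySARSA, assuming a hypothetical GridMDP
environment that exposes the interface the code above relies on (states,
actions, gamma, getFeature, transform):

    mdp = GridMDP()  # hypothetical environment, not defined in this listing

    policy, squareErrors = policySARSA(
        mdp, epsilon=0.2, alpha=0.01, iterNum=10000, echoSE=True)

    # One squared error is recorded per episode, so the list doubles as a
    # quick convergence check for the critic.
    for episode, err in enumerate(squareErrors):
        print(episode, err)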

def featureQLearning(mdp, epsilon, alpha, iterNum, maxWalkLen=100, echoSE=False):
    """
    Q-learning with function approximation:
    q(s, a) <- r + gamma * max_{a'} q(s', a')
    """

    InitParameter = 0.1

    if echoSE:
        squareErrors = []

    policy = Policy(mdp)

    for i in range(len(policy.parameters)):
        policy.parameters[i] = InitParameter

    for _ in range(iterNum):
        if echoSE:
            squareErrors.append(getSquareErrorPolicy(policy))

        state = random.choice(mdp.states)
        sFeature = mdp.getFeature(state)
        action = random.choice(mdp.actions)
        isTerminal = False

        count = 0
        while not isTerminal and count < maxWalkLen:
            isTerminal, nextState, reward, nextSFeature = mdp.transform(state, action)

            # Start from -inf so the max is correct even when every
            # Q-value is negative.
            maxQ = -float('inf')
            for nextAction in mdp.actions:
                q = policy.qFunc(nextSFeature, nextAction)
                if maxQ < q:
                    maxQ = q

            policy.update(sFeature, action, reward + mdp.gamma * maxQ, alpha)

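            # The behavior action for the next step is epsilon-greedy, but the
            # target above used the greedy max: this mismatch is what makes
            # Q-learning off-policy.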
            action = policy.epsilonGreedy(nextSFeature, epsilon)
            state = nextState
            sFeature = nextSFeature
            count += 1

    if echoSE:
        return policy, squareErrors
    else:
        return policy
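
featureQLearning differs from policySARSA only in its target: it bootstraps
from the greedy next action (the max over a') rather than the action actually
taken. A minimal sketch of a greedy rollout with the learned policy, reusing
the hypothetical GridMDP fixture from above and assuming epsilonGreedy
degenerates to a purely greedy choice at epsilon=0:

    qPolicy = featureQLearning(mdp, epsilon=0.2, alpha=0.01, iterNum=10000)

    # epsilon=0 is assumed to always pick the highest-scoring action, so the
    # walk follows the learned policy; cap it in case no terminal state is
    # reached.
    state = random.choice(mdp.states)
    sFeature = mdp.getFeature(state)
    isTerminal, steps = False, 0
    while not isTerminal and steps < 100:
        action = qPolicy.epsilonGreedy(sFeature, 0.0)
        isTerminal, state, reward, sFeature = mdp.transform(state, action)
        steps += 1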