def featureMCControl(mdp, epsilon, alpha, iterNum, maxWalkLen=100, echoSE=False):
    """
    qFunc = g_t
    """

    InitParameter = 0.1

    if echoSE:
        squareErrors = []

    # Linear epsilon-greedy policy; initialise every feature weight to InitParameter.
    policy = Policy(mdp)

    for i in range(len(policy.parameters)):
        policy.parameters[i] = InitParameter

    # Each iteration samples one full episode with the current epsilon-greedy
    # policy, then updates the weights toward the Monte Carlo returns.
    for _ in range(iterNum):
        if echoSE:
            squareErrors.append(getSquareErrorPolicy(policy))

        # Trajectory buffers for this episode.
        states, sFeatures, actions, rewards = [], [], [], []

        # Start the episode from a uniformly random state.
        state = random.choice(mdp.states)
        sFeature = mdp.getFeature(state)
        isTerminal = False

        count = 0
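        # Roll out until a terminal state is reached or the walk hits
        # maxWalkLen steps, acting epsilon-greedily on the state features.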
        while not isTerminal and count < maxWalkLen:
            action = policy.epsilonGreedy(sFeature, epsilon)
            isTerminal, nextState, reward, nextSFeature = mdp.transform(state, action)

            states.append(state)
            sFeatures.append(sFeature)
            rewards.append(reward)
            actions.append(action)

            state = nextState
            sFeature = nextSFeature
            count += 1

        # Backward pass: accumulate the episode's discounted return so that
        # after this loop g equals the return from the first step, G_0.
        g = 0.0
        for i in range(len(states) - 1, -1, -1):
            g *= mdp.gamma
            g += rewards[i]

        # Forward pass: for every step i, update toward its return G_i, then
        # peel off rewards[i] and one factor of gamma so g becomes G_{i+1}.
        for i in range(len(states)):
            policy.update(sFeatures[i], actions[i], g, alpha)

            g -= rewards[i]
            g /= mdp.gamma

    if echoSE:
        return policy, squareErrors
    else:
        return policy
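

# A minimal usage sketch (an assumption, not code from this module): it presumes
# an environment object exposing the interface used above (states, actions,
# gamma, getFeature, transform), e.g. a hypothetical GridMDP, with
# hyperparameters chosen only for illustration:
#
#     mdp = GridMDP()
#     mcPolicy, mcErrors = featureMCControl(mdp, epsilon=0.2, alpha=0.01,
#                                           iterNum=10000, echoSE=True)
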
def featureQLearning(mdp, epsilon, alpha, iterNum, maxWalkLen=100, echoSE=False):
    """
    qFunc = r + max_{a'}(\gamma * q(\hat{s'}, a'))
    """

    InitParameter = 0.1

    if echoSE:
        squareErrors = []

    # Linear epsilon-greedy policy; initialise every feature weight to InitParameter.
    policy = Policy(mdp)

    for i in range(len(policy.parameters)):
        policy.parameters[i] = InitParameter

    # Each iteration runs one walk of at most maxWalkLen steps, updating the
    # weights after every transition.
    for _ in range(iterNum):
        if echoSE:
            squareErrors.append(getSquareErrorPolicy(policy))

        # Start from a uniformly random state and a uniformly random first action.
        state = random.choice(mdp.states)
        sFeature = mdp.getFeature(state)
        action = random.choice(mdp.actions)
        isTerminal = False

        count = 0
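        # Interact for at most maxWalkLen steps or until a terminal state.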
        while not isTerminal and count < maxWalkLen:
            isTerminal, nextState, reward, nextSFeature = mdp.transform(state, action)

            # Greedy one-step lookahead: the largest current Q-estimate over
            # all actions in the next state (start from -inf so that negative
            # Q-values are handled correctly).
            maxQ = -float('inf')
            for nextAction in mdp.actions:
                q = policy.qFunc(nextSFeature, nextAction)
                if maxQ < q:
                    maxQ = q

            # Move qFunc(sFeature, action) toward the target r + gamma * maxQ.
            policy.update(sFeature, action, reward + mdp.gamma * maxQ, alpha)

            # Choose the next behaviour action epsilon-greedily; the learning
            # target above stays off-policy (it uses the greedy max).
            action = policy.epsilonGreedy(nextSFeature, epsilon)
            state = nextState
            sFeature = nextSFeature
            count += 1

    if echoSE:
        return policy, squareErrors
    else:
        return policy
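

# A minimal usage sketch (an assumption, not code from this module), reusing the
# hypothetical environment from the sketch above; passing epsilon=0.0 is assumed
# to make epsilonGreedy act fully greedily, and someState stands for any state
# of interest:
#
#     qPolicy = featureQLearning(mdp, epsilon=0.2, alpha=0.01, iterNum=10000)
#     greedyAction = qPolicy.epsilonGreedy(mdp.getFeature(someState), 0.0)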