from collections import defaultdict


def on_mc():
    """On-policy every-visit Monte Carlo control with an epsilon-greedy policy."""
    grid_size = 4
    env = Env(grid_size)
    policy = EspionGreedyPolicy(env.actions(), range(grid_size**2))
    Q = defaultdict(float)
    R = defaultdict(list)
    for i in range(5000):
        G = 0
        states = get_episode(env, policy)
        # Walk the episode backwards, accumulating the discounted return G
        # (gamma = 0.9) and averaging it into Q for every (state, action)
        # pair visited.
        for (s0, a, s1, r) in reversed(states):
            G = 0.9 * G + r
            R[(s0, a)].append(G)
            Q[(s0, a)] = sum(R[(s0, a)]) / len(R[(s0, a)])
        # Policy improvement: make the policy greedy w.r.t. Q in every
        # state visited during the episode.
        for (s0, a, s1, r) in reversed(states):
            mm = [(x, Q[(s0, x)]) for x in env.actions()]
            action = max(mm, key=lambda x: x[1])[0]
            policy.set_max(s0, action)
    Pi = {}
    for i in range(grid_size**2):
        Pi[i] = policy.get_m(i)
    for t in env.get_t():
        Pi[t] = 'ter'
    env.render(Pi)
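# Both Monte Carlo functions rely on a get_episode helper that is not shown
# in this listing. The sketch below is a hypothetical reconstruction, not the
# original: its contract is inferred from how on_mc() and off_mc() consume
# its return value (a full episode as (state, action, next_state, reward)
# tuples) and from the Env methods init()/step()/is_t() used by sarsa().
def get_episode(env, policy):
    episode = []
    s0 = env.init()
    while not env.is_t(s0):
        a = policy.get_a(s0)
        s1, r = env.step(a)  # assumes Env tracks the current state internally
        episode.append((s0, a, s1, r))
        s0 = s1
    return episode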
def off_mc():
    """Off-policy Monte Carlo control with weighted importance sampling."""
    env = Env(6)
    policy = RandomPolicy(env.actions())  # behaviour policy
    C = defaultdict(float)  # cumulative importance-sampling weights
    Q = defaultdict(float)
    Pi = {}  # greedy target policy
    for i in range(10000):
        G = 0
        W = 1.0
        states = get_episode(env, policy)
        for (s0, a, s1, r) in reversed(states):
            G = 0.9 * G + r
            C[(s0, a)] += W
            # Incremental weighted-importance-sampling update of Q.
            Q[(s0, a)] += W / C[(s0, a)] * (G - Q[(s0, a)])
            Pi[s0] = max([(x, Q[(s0, x)]) for x in env.actions()],
                         key=lambda x: x[1])[0]
            # Once the behaviour action deviates from the greedy target
            # policy, the weight for the rest of the episode prefix is 0,
            # so stop here.
            if a != Pi[s0]:
                break
            # The target policy is greedy (probability 1), so only the
            # behaviour probability appears in the importance ratio.
            W = W / policy.get_p(s0, a)
    for t in env.get_t():
        Pi[t] = 'ter'
    env.render(Pi)
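# Hypothetical sketch of the behaviour policy assumed by off_mc(); the real
# RandomPolicy is defined elsewhere in the project. The key requirement,
# visible in the code above, is that get_p(s, a) returns the probability the
# behaviour policy assigns to action a in state s (uniform here), since it
# is the denominator of the importance-sampling ratio.
import random


class RandomPolicySketch(object):
    def __init__(self, actions):
        self.actions = list(actions)

    def get_a(self, s):
        # Sample uniformly at random, independent of the state.
        return random.choice(self.actions)

    def get_p(self, s, a):
        return 1.0 / len(self.actions)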
def sarsa():
    """Tabular SARSA with step size 0.9 and discount 0.9."""
    grid_size = 4
    env = Env(grid_size)
    policy = EspionGreedyPolicy(env.actions(), range(grid_size**2))
    Q = defaultdict(float)
    for i in range(5000):
        s0 = env.init()
        if env.is_t(s0):
            continue
        a0 = policy.get_a(s0)
        while not env.is_t(s0):
            s, r = env.step(a0)
            a = policy.get_a(s)
            # SARSA update: bootstrap on the action actually taken next.
            Q[(s0, a0)] += 0.9 * (r + 0.9 * Q[(s, a)] - Q[(s0, a0)])
            # Greedify the policy in the state that was just updated,
            # before moving on to the next state-action pair.
            mm = [(x, Q[(s0, x)]) for x in env.actions()]
            action = max(mm, key=lambda x: x[1])[0]
            policy.set_max(s0, action)
            s0 = s
            a0 = a
    Pi = {}
    for i in range(grid_size**2):
        Pi[i] = policy.get_m(i)
    for t in env.get_t():
        Pi[t] = 'ter'
    env.render(Pi)
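# Hypothetical sketch of the epsilon-greedy policy used throughout this
# listing; the real EspionGreedyPolicy is defined elsewhere. From its usage
# above it keeps one greedy action per state (set_max / get_m) and explores
# with some probability eps in get_a(). The eps value and initialisation are
# assumptions for illustration.
import random


class EpsilonGreedyPolicySketch(object):
    def __init__(self, actions, states, eps=0.1):
        self.actions = list(actions)
        self.eps = eps
        self.greedy = dict((s, random.choice(self.actions)) for s in states)

    def get_a(self, s):
        # Explore with probability eps, otherwise act greedily.
        if random.random() < self.eps:
            return random.choice(self.actions)
        return self.greedy[s]

    def set_max(self, s, a):
        self.greedy[s] = a

    def get_m(self, s):
        return self.greedy[s]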
from sklearn.linear_model import SGDRegressor


class Sarsa(object):
    """SARSA with linear function approximation over one-hot state/action features."""

    def __init__(self, size=4):
        self.grid_size = size
        self.env = Env(self.grid_size)
        self.a_id = dict([(a, i) for i, a in enumerate(self.env.actions())])
        self.policy = EspionGreedyPolicy(self.env.actions(),
                                         range(self.grid_size**2))

    def get_f(self, s, a):
        # One-hot feature vector: grid_size**2 state features followed by
        # 4 action features, so the action bits must be offset past the
        # state bits.
        f = [0] * (self.grid_size**2 + 4)
        f[s] = 1
        f[self.grid_size**2 + self.a_id[a]] = 1
        return f

    def sarsa(self):
        policy = self.policy
        Q = SGDRegressor()
        # Prime the regressor with one sample so predict() can be called
        # before any real training has happened.
        f = self.get_f(1, 'left')
        Q.fit([f], [1])
        for i in range(500):
            s0 = self.env.init()
            if self.env.is_t(s0):
                continue
            a0 = policy.get_a(s0)
            while not self.env.is_t(s0):
                s, r = self.env.step(a0)
                a = policy.get_a(s)
                f0 = self.get_f(s0, a0)
                f = self.get_f(s, a)
                # Move the prediction for (s0, a0) toward the SARSA target,
                # then fit on f0 -- the features of the pair being updated.
                target = Q.predict([f0])[0] + 0.9 * (
                    r + 0.9 * Q.predict([f])[0] - Q.predict([f0])[0])
                Q.partial_fit([f0], [target])
                # Greedify the policy in the state that was just updated.
                mm = [(x, Q.predict([self.get_f(s0, x)])[0])
                      for x in self.env.actions()]
                action = max(mm, key=lambda x: x[1])[0]
                policy.set_max(s0, action)
                s0 = s
                a0 = a
        Pi = {}
        for i in range(self.grid_size**2):
            Pi[i] = policy.get_m(i)
        for t in self.env.get_t():
            Pi[t] = 'ter'
        self.env.render(Pi)
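# Usage sketch (an assumption, not part of the original listing): each call
# trains on the gridworld and renders the learned greedy policy, with 'ter'
# marking terminal cells.
if __name__ == '__main__':
    on_mc()
    off_mc()
    sarsa()
    Sarsa(4).sarsa()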