Example #1
import os
import cPickle
from time import time

import numpy as np

# NOTE: uct, create_env, RandomAgent and KerasAgent are project-local; the
# module paths below are assumptions and may need to be adjusted.
import uct
from environment import create_env
from agents import RandomAgent, KerasAgent


def run(env_name, version, act_rep, max_steps, rollout_agent_name,
        behavior_agent_name, eps_greedy, sim_steps, search_horizont,
        gamma=1., exploration=1., prune_tree=False, report_freq=100,
        n_runs=1, save_dir=None, save_freq=10, process=0):

    def save_data():
        # dump the collected frames, actions and search statistics to a pickle file,
        # then clear the buffers so memory does not grow between saves
        if save_dir is not None and len(frames) > 0:
            run_data = {
                'frames': frames,
                'actions': actions,
                'reward': total_reward,
                'action_visits': action_visits,
                'action_values': action_values,
                'rewards': rewards,
                'action_meanings': env.env.get_action_meanings(),
            }
            fname = os.path.join(save_dir, 'run_process_{}_run_{}_steps_{}.pkl'.format(process, n_run, step))
            with open(fname, 'wb') as f:
                cPickle.dump(run_data, f, -1)

            del actions[:]
            del frames[:]
            del action_visits[:]
            del action_values[:]
            del rewards[:]

    env = create_env(env_name, version, act_rep)
    uct.Node.n_actions = env.action_space.n

    # agent for rollouts
    if rollout_agent_name == 'random' or rollout_agent_name is None:
        rollout_agent = RandomAgent(env.action_space.n)
    else:
        rollout_agent = KerasAgent(rollout_agent_name)

    # agent used to select the action actually taken in the environment
    if behavior_agent_name == 'random':
        behavior_agent = RandomAgent(env.action_space.n)
    elif behavior_agent_name == 'uct' or behavior_agent_name is None:
        behavior_agent = 'uct'
    else:
        behavior_agent = KerasAgent(behavior_agent_name)

    if save_dir is not None:
        actions = []
        frames = []
        action_visits = []
        action_values = []
        rewards = []

    for n_run in xrange(n_runs):
        terminal = False

        env.reset()
        _frame = env.env._get_image()

        node = uct.Node(env.clone_state())

        total_reward = 0
        step = 0
        t_start = t0 = time()
        while not terminal:
            # choose uct action
            a_uct = uct.uct_action(env, rollout_agent, node, sim_steps, search_horizont, gamma, exploration)

            # choose action in environment
            if np.random.rand() < eps_greedy:
                a = env.action_space.sample()
            elif behavior_agent == 'uct':
                a = a_uct
            else:
                a = behavior_agent.choose_action(_frame)

            if save_dir is not None:
                actions.append(a_uct)
                frames.append(_frame)
                action_visits.append(node.a_visits)
                action_values.append(node.a_values)

            # do step in environment
            env.restore_state(node.state)
            frame, reward, terminal, _ = env.step(a)
            _frame = env.env._get_image()

            if save_dir is not None:
                rewards.append(reward)

            # reuse the child subtree for the chosen action if possible, otherwise build a fresh tree
            if prune_tree:
                if frame in node.childs[a]:
                    node = node.childs[a][frame]
                    node.parent = None
                else:
                    node = uct.Node(env.clone_state())
            else:
                node = uct.Node(env.clone_state())

            total_reward += reward
            step += 1

            # report progress
            if step % report_freq == 0:
                print 'process: {} run: {}, steps: {}, time: {:.2f}, total reward: {:.2f}'.\
                    format(process, n_run+1, step, time() - t0, total_reward)
                t0 = time()

            # save intermediate result
            if step % save_freq == 0:
                save_data()

            # stop the run once max_steps is exceeded (max_steps <= 0 disables the limit)
            if 0 < max_steps < step:
                break

        print '\nprocess: {}, run: {}, total steps: {}, total time: {:.2f}, total reward: {:.2f}'.\
            format(process, n_run+1, step, time() - t_start, total_reward)

        # save last chunk of data
        save_data()

    env.close()
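
# A minimal sketch of how run() might be invoked; the argument values below are
# illustrative assumptions, not taken from the original script.
if __name__ == '__main__':
    run(env_name='Breakout', version=0, act_rep=4, max_steps=1000,
        rollout_agent_name='random', behavior_agent_name='uct',
        eps_greedy=0.0, sim_steps=100, search_horizont=50,
        gamma=1., exploration=1., prune_tree=True,
        n_runs=1, save_dir=None)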

Example #2

from environment import Environment
from agents import RandomAgent, ValueApproxAgent

# multi-armed bandit environment with 5 arms; each arm pays out with a hidden probability
env = Environment(5)
agent = RandomAgent(env.action_space)

# baseline: pull arms uniformly at random for 1000 steps
total_rewards = 0
for i in range(1000):
    action = agent.choose_action()
    reward = env.try_arm(action)
    total_rewards += reward

print('Total Rewards From RandomAgent: ', total_rewards)

# value-learning agent: updates per-arm value estimates from the observed rewards
agent = ValueApproxAgent(env.action_space)
total_rewards = 0
for i in range(1000):
    action = agent.choose_action()
    reward = env.try_arm(action)
    print('action: ', action)
    agent.learn(action, reward)
    total_rewards += reward

print('Total Rewards From ValueApproxAgent: ', total_rewards)
# compare the learned per-arm value estimates with the true arm probabilities
print(agent.approx_values, env._probs)
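
# The agents and environment modules are not shown; below is a minimal sketch of
# what a ValueApproxAgent could look like, assuming env.action_space is the number
# of arms and that the agent keeps an incremental per-arm value estimate with
# epsilon-greedy action selection. Names and details here are assumptions.
import random


class ValueApproxAgentSketch(object):
    def __init__(self, n_actions, epsilon=0.1):
        self.n_actions = n_actions                  # number of bandit arms
        self.epsilon = epsilon                      # exploration rate
        self.approx_values = [0.0] * n_actions      # running value estimate per arm
        self.counts = [0] * n_actions               # number of pulls per arm

    def choose_action(self):
        # explore uniformly with probability epsilon, otherwise exploit the best estimate
        if random.random() < self.epsilon:
            return random.randrange(self.n_actions)
        return max(range(self.n_actions), key=lambda a: self.approx_values[a])

    def learn(self, action, reward):
        # incremental mean update: Q <- Q + (r - Q) / n
        self.counts[action] += 1
        self.approx_values[action] += (reward - self.approx_values[action]) / float(self.counts[action])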