import os
import cPickle  # Python 2; use `import pickle` under Python 3
from time import time

import numpy as np

import uct  # project-local UCT implementation
# NOTE: the module paths below are assumptions; adjust them to the project layout.
from agents import RandomAgent, KerasAgent
from environment import create_env


def run(env_name, version, act_rep, max_steps, rollout_agent_name, behavior_agent_name,
        eps_greedy, sim_steps, search_horizont, gamma=1., exploration=1., prune_tree=False,
        report_freq=100, n_runs=1, save_dir=None, save_freq=10, process=0):

    def save_data():
        if save_dir is not None and len(frames) > 0:
            run_data = {
                'frames': frames,
                'actions': actions,
                'reward': total_reward,
                'action_visits': action_visits,
                'action_values': action_values,
                'rewards': rewards,
                'action_meanings': env.env.get_action_meanings(),
            }
            fname = os.path.join(save_dir, 'run_process_{}_run_{}_steps_{}.pkl'.format(process, n_run, step))
            with open(fname, 'wb') as f:
                cPickle.dump(run_data, f, -1)
            # clear buffers in place so the closure keeps mutating the same lists
            del actions[:]
            del frames[:]
            del action_visits[:]
            del action_values[:]
            del rewards[:]

    env = create_env(env_name, version, act_rep)
    uct.Node.n_actions = env.action_space.n

    # agent for rollouts
    if rollout_agent_name == 'random' or rollout_agent_name is None:
        rollout_agent = RandomAgent(env.action_space.n)
    else:
        rollout_agent = KerasAgent(rollout_agent_name)

    # agent for action selection
    if behavior_agent_name == 'random':
        behavior_agent = RandomAgent(env.action_space.n)
    elif behavior_agent_name == 'uct' or behavior_agent_name is None:
        behavior_agent = 'uct'
    else:
        behavior_agent = KerasAgent(behavior_agent_name)

    if save_dir is not None:
        actions = []
        frames = []
        action_visits = []
        action_values = []
        rewards = []

    for n_run in xrange(n_runs):
        terminal = False
        env.reset()
        _frame = env.env._get_image()
        node = uct.Node(env.clone_state())
        total_reward = 0
        step = 0
        t_start = t0 = time()

        while not terminal:
            # choose uct action
            a_uct = uct.uct_action(env, rollout_agent, node, sim_steps, search_horizont, gamma, exploration)

            # choose action in environment
            if np.random.rand() < eps_greedy:
                a = env.action_space.sample()
            elif behavior_agent == 'uct':
                a = a_uct
            else:
                a = behavior_agent.choose_action(_frame)

            if save_dir is not None:
                actions.append(a_uct)
                frames.append(_frame)
                action_visits.append(node.a_visits)
                action_values.append(node.a_values)

            # do step in environment
            env.restore_state(node.state)
            frame, reward, terminal, _ = env.step(a)
            _frame = env.env._get_image()

            if save_dir is not None:
                rewards.append(reward)

            # create a new tree, or try to reuse the subtree of the old tree
            if prune_tree:
                if frame in node.childs[a]:
                    node = node.childs[a][frame]
                    node.parent = None
                else:
                    node = uct.Node(env.clone_state())
            else:
                node = uct.Node(env.clone_state())

            total_reward += reward
            step += 1

            # report progress
            if step % report_freq == 0:
                print 'process: {} run: {}, steps: {}, time: {:.2f}, total reward: {:.2f}'.\
                    format(process, n_run + 1, step, time() - t0, total_reward)
                t0 = time()

            # save intermediate result
            if step % save_freq == 0:
                save_data()

            if 0 < max_steps < step:
                break

        print '\nprocess: {}, run: {}, total steps: {}, total time: {:.2f}, total reward: {:.2f}'.\
            format(process, n_run + 1, step, time() - t_start, total_reward)

        # save last chunk of data
        save_data()

    env.close()
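
# Example invocation (a minimal sketch: the env name/version and the
# hyperparameter values below are illustrative assumptions, not values
# taken from this project):
if __name__ == '__main__':
    run(env_name='Breakout', version='v0', act_rep=4, max_steps=1000,
        rollout_agent_name='random', behavior_agent_name='uct',
        eps_greedy=0.0, sim_steps=100, search_horizont=50,
        save_dir=None)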
import numpy as np
import random

from environment import Environment
from agents import RandomAgent
from agents import ValueApproxAgent

# 5-armed bandit environment
env = Environment(5)

# baseline: random arm selection
agent = RandomAgent(env.action_space)
total_rewards = 0
for i in range(1000):
    action = agent.choose_action()
    reward = env.try_arm(action)
    total_rewards += reward
print('Total Rewards From RandomAgent: ', total_rewards)

# value-estimating agent that learns from observed rewards
agent = ValueApproxAgent(env.action_space)
total_rewards = 0
for i in range(1000):
    action = agent.choose_action()
    reward = env.try_arm(action)
    print('action: ', action)
    agent.learn(action, reward)
    total_rewards += reward
print('Total Rewards From ValueApproxAgent: ', total_rewards)

# compare learned value estimates against the true arm probabilities
print(agent.approx_values, env._probs)
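
# For reference, a minimal sketch of what an incremental-mean value agent
# could look like. This is an assumption for illustration only; the real
# agents.ValueApproxAgent may be implemented differently. It keeps a running
# sample-average estimate Q(a) per arm and acts epsilon-greedily, using the
# standard incremental update Q(a) += (r - Q(a)) / n.
class IncrementalValueAgent(object):
    """Epsilon-greedy bandit agent with sample-average value estimates (hypothetical)."""

    def __init__(self, n_arms, epsilon=0.1):
        self.epsilon = epsilon
        self.counts = np.zeros(n_arms)         # number of pulls per arm
        self.approx_values = np.zeros(n_arms)  # running mean reward per arm

    def choose_action(self):
        if random.random() < self.epsilon:
            return random.randrange(len(self.approx_values))  # explore
        return int(np.argmax(self.approx_values))             # exploit

    def learn(self, action, reward):
        self.counts[action] += 1
        # incremental mean: no need to store past rewards
        self.approx_values[action] += (reward - self.approx_values[action]) / self.counts[action]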