def get_traj(test_type, pa, env, episode_max_length, pg_resume=None, render=False):
    """
    Run agent-environment loop for one whole episode (trajectory)
    Return dictionary of results
    """

    if test_type == 'PG':  # load trained parameters
        tf.reset_default_graph()
        # pg_learner = pg_network.PGLearner(pa)
        rl = RL_brain.PolicyGradient(
            n_actions=pa.network_output_dim,
            network_input_width=pa.network_input_width,
            network_input_height=pa.network_input_height,
            n_features=pa.network_input_width * pa.network_input_height,
            learning_rate=0.02)  # initialize a policy-gradient agent
        rl.load_data(pg_resume)
        # net_handle = open(pg_resume, 'rb')
        # net_params = pickle.load(net_handle)
        # pg_learner.set_net_params(net_params)

    env.reset()
    rews = []

    ob = env.observe()  # get the current observation of the environment

    for _ in range(episode_max_length):
        # choose an action given the current observation
        if test_type == 'PG':
            a = rl.choose_action(ob)

        elif test_type == 'Tetris':
            a = other_agents.get_packer_action(env.machine, env.job_slot)

        elif test_type == 'SJF':
            a = other_agents.get_sjf_action(env.machine, env.job_slot)

        elif test_type == 'Random':
            a = other_agents.get_random_action(env.job_slot)

        elif test_type == 'packer':
            a = other_agents.get_packer_sjf_action(env.machine, env.job_slot, 0.8)

        # take the action and observe the state after executing it
        ob, rew, done, info = env.step(a, repeat=True)

        rews.append(rew)  # append every single-step reward to rews

        if done:
            break  # break out of the loop when the episode finishes
        if render:
            env.render()
        # env.render()

    # return the reward trajectory and the executed-job record for this episode
    return np.array(rews), info


def get_traj(test_type, pa, env, episode_max_length, pg_resume=None, render=False):
    """
    Run agent-environment loop for one whole episode (trajectory)
    Return dictionary of results
    """

    if test_type == 'PG':  # load trained parameters
        pg_learner = pg_network.PGLearner(pa)

        net_handle = open(pg_resume, 'rb')
        net_params = cPickle.load(net_handle)
        pg_learner.set_net_params(net_params)

    env.reset()
    rews = []

    ob = env.observe()

    for _ in xrange(episode_max_length):

        if test_type == 'PG':
            a = pg_learner.choose_action(ob)

        elif test_type == 'Tetris':
            a = other_agents.get_packer_action(env.machines, env.job_slot)

        elif test_type == 'SJF':
            a = other_agents.get_sjf_action(env.machines, env.job_slot)

        elif test_type == 'Random':
            a = other_agents.get_random_action(env.machines, env.job_slot)

        elif test_type == 'SJF2':
            a = other_agents.get_sjf_action_for_multiple_machines(env.machines, env.job_slot)

        elif test_type == 'Packer2':
            a = other_agents.get_packer_action_for_multiple_machines(env.machines, env.job_slot)

        elif test_type == 'Tetris2':
            a = other_agents.get_packer_sjf_action_for_multiple_machines(env.machines, env.job_slot, 0.3)

        elif test_type == 'Random2':
            a = other_agents.get_random_action_for_multiple_machines(env.machines, env.job_slot)

        ob, rew, done, info = env.step(a, repeat=True)

        rews.append(rew)

        if done:
            break
        if render:
            env.render()
        # env.render()

    return np.array(rews), info


def get_traj(test_type, pa, env, episode_max_length, pg_resume=None, render=False, q_agent=None):
    """
    Run agent-environment loop for one whole episode (trajectory)
    Return dictionary of results
    """

    if test_type == 'PG':  # load trained parameters
        pg_learner = pg_network.PGLearner(pa)

        net_handle = open(pg_resume, 'rb')
        net_params = cPickle.load(net_handle)
        pg_learner.set_net_params(net_params)

    # Q network
    elif test_type == 'Q':
        assert (q_agent is not None)

    env.reset()
    rews = []

    ob = env.observe()

    for _ in xrange(episode_max_length):

        if test_type == 'PG':
            a = pg_learner.choose_action(ob)

        elif test_type == 'Random':
            a = other_agents.get_random_action(env.machine)

        elif test_type == 'LLQ':
            a = other_agents.get_llq_action(env.machine)

        elif test_type == 'Q':
            state = np.array(list(np.array(ob).flat))
            a = q_agent.greedy_policy(state[np.newaxis, :])

        ob, rew, done, info = env.step(a, repeat=True)

        rews.append(rew)

        if done:
            break
        if render:
            env.render()
        # env.render()

    return np.array(rews), info


def get_traj(test_type, pa, env, episode_max_length, pg_resume=None, render=False):
    """
    Run agent-environment loop for one whole episode (trajectory)
    Return dictionary of results
    """

    if test_type == 'PG':  # load trained parameters
        # pg_learner = pg_network.PGLearner(pa)
        rl = Policy_Network.PolicyGradient(
            n_actions=pa.network_output_dim,
            n_features=pa.network_input_width * pa.network_input_height,
            learning_rate=0.02)
        rl.load_data(pg_resume)
        # net_handle = open(pg_resume, 'rb')
        # net_params = pickle.load(net_handle)
        # pg_learner.set_net_params(net_params)

    env.reset()
    rews = []

    ob = env.observe()

    for _ in range(episode_max_length):

        if test_type == 'PG':
            a = rl.choose_action(ob)

        elif test_type == 'Tetris':
            a = other_agents.get_packer_action(env.machine, env.job_slot)

        elif test_type == 'SJF':
            a = other_agents.get_sjf_action(env.machine, env.job_slot)

        elif test_type == 'Random':
            a = other_agents.get_random_action(env.job_slot)

        ob, rew, done, info = env.step(a, repeat=True)

        rews.append(rew)

        if done:
            break
        if render:
            env.render()
        # env.render()

    return np.array(rews), info


def get_traj_halluc(test_type, pa, env, episode_max_length, pg_resume=None, render=False):
    """
    Run agent-environment loop for one whole episode (trajectory)
    Return dictionary of results
    """

    if test_type == 'PG':  # load trained parameters
        pg_learner = pg_network.PGLearner(pa)

        net_handle = open(pg_resume, 'rb')
        net_params = cPickle.load(net_handle)
        pg_learner.set_net_params(net_params)

    env.reset()
    rews = []

    rnn_tmp = env.rnn  # stash the forecasting RNN so the deep copies below do not duplicate it

    for te in xrange(episode_max_length):
        env.rnn = None
        ori_env = copy.deepcopy(env)

        actions = []
        future = min(episode_max_length - te, pa.simu_len)
        rews_hals = np.zeros((pa.num_hal, future), dtype=float)

        if pa.rnn:
            rnn_tmp.forecast_from_history()

        # roll out pa.num_hal hallucinated futures from the current state
        for h in range(pa.num_hal):
            new_env = copy.deepcopy(ori_env)
            new_env.rnn = rnn_tmp

            if pa.rnn:
                new_env.replace_backlog_from_rnn()

            ob = new_env.observe()

            for th in range(future):

                if test_type == 'PG':
                    a = pg_learner.choose_action(ob)

                elif test_type == 'Tetris':
                    a = other_agents.get_packer_action(new_env.machine, new_env.job_slot)

                elif test_type == 'SJF':
                    a = other_agents.get_sjf_action(new_env.machine, new_env.job_slot)

                elif test_type == 'Random':
                    a = other_agents.get_random_action(new_env.job_slot)

                if th == 0:
                    actions.append(a)

                ob, rew, done, info = new_env.step(
                    a, repeat=True, forecasting=(pa.rnn == True))

                if done:
                    break

                rews_hals[h][th] = rew

        sum_rews = rews_hals.sum(axis=1, dtype=float)
        a_best = actions[np.argmax(sum_rews)]  # first action of the best-scoring rollout

        # apply the chosen action to a fresh copy of the real environment
        working_env = copy.deepcopy(ori_env)
        working_env.rnn = rnn_tmp

        if pa.rnn:
            ob, rew, done, info, new_job_list = working_env.step(
                a_best, repeat=True, return_raw_jobs=True)
            for new_job in new_job_list:
                working_env.rnn.update_history(new_job)
        else:
            ob, rew, done, info = working_env.step(a_best, repeat=True)

        rews.append(rew)

        if done:
            break
        if render:
            working_env.render()
        # env.render()

    env.rnn = rnn_tmp
    return np.array(rews), info
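

# A minimal evaluation-loop sketch showing how one of the get_traj variants above
# could be driven across several episodes and schedulers. The helper names
# (evaluate_schedulers, discounted_return), the default test_types tuple,
# num_episodes, and the pa.discount attribute are assumptions for illustration,
# not part of the original code; each get_traj variant already resets env itself.
import numpy as np


def discounted_return(rews, gamma=1.0):
    # (Optionally discounted) sum of the per-step rewards returned by get_traj.
    return float(np.sum(rews * (gamma ** np.arange(len(rews)))))


def evaluate_schedulers(pa, env, episode_max_length, pg_resume=None,
                        test_types=('SJF', 'Random', 'PG'), num_episodes=10):
    # Run each scheduler for a few episodes and report its mean episodic return.
    results = {}
    for test_type in test_types:
        returns = []
        for _ in range(num_episodes):
            rews, info = get_traj(test_type, pa, env, episode_max_length,
                                  pg_resume=pg_resume)
            returns.append(discounted_return(rews, gamma=getattr(pa, 'discount', 1.0)))
        results[test_type] = float(np.mean(returns))
    return results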