Example #1
def get_traj(test_type,
             pa,
             env,
             episode_max_length,
             pg_resume=None,
             render=False):
    """
    Run agent-environment loop for one whole episode (trajectory)
    Return dictionary of results
    """

    if test_type == 'PG':  # load trained parameters
        tf.reset_default_graph()
        # pg_learner = pg_network.PGLearner(pa)
        rl = RL_brain.PolicyGradient(
            n_actions=pa.network_output_dim,
            network_input_width=pa.network_input_width,
            network_input_height=pa.network_input_height,
            n_features=pa.network_input_width * pa.network_input_height,
            learning_rate=0.02)  # initialize a PG agent
        rl.load_data(pg_resume)

        # net_handle = open(pg_resume, 'rb')
        # net_params = pickle.load(net_handle)
        # pg_learner.set_net_params(net_params)

    env.reset()
    rews = []

    ob = env.observe()  # get the current observation of the environment

    for _ in range(episode_max_length):  # pick an action at each step given the current observation

        if test_type == 'PG':
            a = rl.choose_action(ob)

        elif test_type == 'Tetris':
            a = other_agents.get_packer_action(env.machine, env.job_slot)

        elif test_type == 'SJF':
            a = other_agents.get_sjf_action(env.machine, env.job_slot)

        elif test_type == 'Random':
            a = other_agents.get_random_action(env.job_slot)

        elif test_type == 'packer':
            a = other_agents.get_packer_sjf_action(env.machine, env.job_slot,
                                                   0.8)

        ob, rew, done, info = env.step(a, repeat=True)  # execute the action and get the next observation

        rews.append(rew)  # append the single-step reward to rews

        if done: break  # break out of the loop once this task (episode) is finished
        if render: env.render()
        # env.render()

    return np.array(rews), info  # return the reward trajectory and the executed-job info for this episode
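
A minimal usage sketch for this first variant follows; the helper name run_episodes, the attribute pa.episode_max_length, and the averaging loop are illustrative assumptions, not part of the original code.

# Hypothetical driver for Example #1 (a sketch, not the original API).
# Assumes pa exposes episode_max_length and that env and pg_resume are
# prepared the same way as in the snippet above.
import numpy as np

def run_episodes(pa, env, pg_resume, num_episodes=10):
    """Average the episode return of each test policy over several runs."""
    results = {}
    for test_type in ('PG', 'Tetris', 'SJF', 'Random', 'packer'):
        totals = []
        for _ in range(num_episodes):
            rews, _info = get_traj(test_type, pa, env,
                                   pa.episode_max_length,
                                   pg_resume=pg_resume)
            totals.append(rews.sum())
        results[test_type] = float(np.mean(totals))
    return results
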
Example #2
def get_traj(test_type, pa, env, episode_max_length, pg_resume=None, render=False):
    """
    Run agent-environment loop for one whole episode (trajectory)
    Return dictionary of results
    """

    if test_type == 'PG':  # load trained parameters

        pg_learner = pg_network.PGLearner(pa)

        with open(pg_resume, 'rb') as net_handle:
            net_params = cPickle.load(net_handle)
        pg_learner.set_net_params(net_params)

    env.reset()
    rews = []

    ob = env.observe()

    for _ in xrange(episode_max_length):

        if test_type == 'PG':
            a = pg_learner.choose_action(ob)

        elif test_type == 'Tetris':
            a = other_agents.get_packer_action(env.machines, env.job_slot)

        elif test_type == 'SJF':
            a = other_agents.get_sjf_action(env.machines, env.job_slot)

        elif test_type == 'Random':
            a = other_agents.get_random_action(env.machines, env.job_slot)

        elif test_type == 'SJF2':
            a = other_agents.get_sjf_action_for_multiple_machines(env.machines, env.job_slot)

        elif test_type == 'Packer2':
            a = other_agents.get_packer_action_for_multiple_machines(env.machines, env.job_slot)

        elif test_type == 'Tetris2':
            a = other_agents.get_packer_sjf_action_for_multiple_machines(env.machines, env.job_slot, 0.3)

        elif test_type == 'Random2':
            a = other_agents.get_random_action_for_multiple_machines(env.machines, env.job_slot)


        ob, rew, done, info = env.step(a, repeat=True)

        rews.append(rew)

        if done: break
        if render: env.render()
        # env.render()

    return np.array(rews), info
Example #3
def get_traj(test_type,
             pa,
             env,
             episode_max_length,
             pg_resume=None,
             render=False,
             q_agent=None):
    """
    Run agent-environment loop for one whole episode (trajectory)
    Return dictionary of results
    """

    if test_type == 'PG':  # load trained parameters

        pg_learner = pg_network.PGLearner(pa)

        with open(pg_resume, 'rb') as net_handle:
            net_params = cPickle.load(net_handle)
        pg_learner.set_net_params(net_params)

    # Q network
    elif test_type == 'Q':
        assert (q_agent is not None)

    env.reset()
    rews = []

    ob = env.observe()

    for _ in xrange(episode_max_length):

        if test_type == 'PG':
            a = pg_learner.choose_action(ob)

        elif test_type == 'Random':
            a = other_agents.get_random_action(env.machine)

        elif test_type == 'LLQ':
            a = other_agents.get_llq_action(env.machine)

        elif test_type == 'Q':
            state = np.array(list(np.array(ob).flat))
            a = q_agent.greedy_policy(state[np.newaxis, :])

        ob, rew, done, info = env.step(a, repeat=True)
        rews.append(rew)

        if done: break
        if render: env.render()
        # env.render()

    return np.array(rews), info
Example #4
def get_traj(test_type,
             pa,
             env,
             episode_max_length,
             pg_resume=None,
             render=False):
    """
    Run agent-environment loop for one whole episode (trajectory)
    Return dictionary of results
    """

    if test_type == 'PG':  # load trained parameters

        # pg_learner = pg_network.PGLearner(pa)
        rl = Policy_Network.PolicyGradient(n_actions=pa.network_output_dim,
                                           n_features=pa.network_input_width *
                                           pa.network_input_height,
                                           learning_rate=0.02)
        rl.load_data(pg_resume)

        # net_handle = open(pg_resume, 'rb')
        # net_params = pickle.load(net_handle)
        # pg_learner.set_net_params(net_params)

    env.reset()
    rews = []

    ob = env.observe()

    for _ in range(episode_max_length):

        if test_type == 'PG':
            a = rl.choose_action(ob)

        elif test_type == 'Tetris':
            a = other_agents.get_packer_action(env.machine, env.job_slot)

        elif test_type == 'SJF':
            a = other_agents.get_sjf_action(env.machine, env.job_slot)

        elif test_type == 'Random':
            a = other_agents.get_random_action(env.job_slot)

        ob, rew, done, info = env.step(a, repeat=True)

        rews.append(rew)

        if done: break
        if render: env.render()
        # env.render()

    return np.array(rews), info
Example #5
def get_traj_halluc(test_type,
                    pa,
                    env,
                    episode_max_length,
                    pg_resume=None,
                    render=False):
    """
    Run agent-environment loop for one whole episode (trajectory)
    Return dictionary of results
    """

    if test_type == 'PG':  # load trained parameters

        pg_learner = pg_network.PGLearner(pa)

        with open(pg_resume, 'rb') as net_handle:
            net_params = cPickle.load(net_handle)
        pg_learner.set_net_params(net_params)

    env.reset()
    rews = []

    rnn_tmp = env.rnn

    for te in xrange(episode_max_length):

        env.rnn = None
        ori_env = copy.deepcopy(env)
        actions = []
        future = min(episode_max_length - te, pa.simu_len)
        rews_hals = np.zeros((pa.num_hal, future), dtype=float)

        if pa.rnn:
            rnn_tmp.forecast_from_history()

        for h in range(pa.num_hal):
            new_env = copy.deepcopy(ori_env)
            new_env.rnn = rnn_tmp

            if pa.rnn:
                new_env.replace_backlog_from_rnn()

            ob = new_env.observe()

            for th in range(future):

                if test_type == 'PG':
                    a = pg_learner.choose_action(ob)

                elif test_type == 'Tetris':
                    a = other_agents.get_packer_action(new_env.machine,
                                                       new_env.job_slot)

                elif test_type == 'SJF':
                    a = other_agents.get_sjf_action(new_env.machine,
                                                    new_env.job_slot)

                elif test_type == 'Random':
                    a = other_agents.get_random_action(new_env.job_slot)

                if th == 0:
                    actions.append(a)

                ob, rew, done, info = new_env.step(
                    a, repeat=True, forecasting=(pa.rnn == True))

                if done: break

                rews_hals[h][th] = rew

        sum_rews = rews_hals.sum(axis=1, dtype=float)

        a_best = actions[np.argmax(sum_rews)]
        working_env = copy.deepcopy(ori_env)
        working_env.rnn = rnn_tmp

        if pa.rnn:
            ob, rew, done, info, new_job_list = working_env.step(
                a_best, repeat=True, return_raw_jobs=True)

            for new_job in new_job_list:
                working_env.rnn.update_history(new_job)

        else:
            ob, rew, done, info = working_env.step(a_best, repeat=True)

        rews.append(rew)

        if done: break
        if render: working_env.render()
        # env.render()

    env.rnn = rnn_tmp
    return np.array(rews), info
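
To close, a hedged sketch of how the hallucinated rollout above might be compared with a plain rollout on the same configuration; compare_rollouts and pa.episode_max_length are assumptions made for illustration only.

# Hypothetical comparison of a plain rollout with the hallucinated rollout
# of get_traj_halluc (a sketch; pa, env, and pg_resume are assumed to be
# configured as in the repository these examples come from).
import numpy as np

def compare_rollouts(pa, env, pg_resume):
    plain_rews, _ = get_traj('PG', pa, env, pa.episode_max_length,
                             pg_resume=pg_resume)
    halluc_rews, _ = get_traj_halluc('PG', pa, env, pa.episode_max_length,
                                     pg_resume=pg_resume)
    return float(np.sum(plain_rews)), float(np.sum(halluc_rews))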