Code example #1
File: train.py  Project: zchao520/Stick-A2C
def agent(agent_id, net_params_queue, exp_queue):
    env = ABREnv(agent_id)
    with tf.Session() as sess, open(
            SUMMARY_DIR + '/log_agent_' + str(agent_id), 'w') as log_file:
        actor = network.Network(sess,
                                state_dim=S_DIM,
                                action_dim=A_DIM,
                                learning_rate=ACTOR_LR_RATE)

        # initial synchronization of the network parameters from the coordinator
        actor_net_params = net_params_queue.get()
        actor.set_network_params(actor_net_params)

        time_stamp = 0

        for epoch in range(TRAIN_EPOCH):
            obs = env.reset()
            s_batch, a_batch, r_batch = [], [], []
            for step in range(TRAIN_SEQ_LEN):
                s_batch.append(obs)

                buffer_bound = actor.predict(
                    np.reshape(obs, (1, S_DIM[0], S_DIM[1])))
                obs, rew, done, info = env.step(buffer_bound)

                a_batch.append([buffer_bound])
                r_batch.append(rew)

                if done:
                    break
            v_batch = actor.compute_v(s_batch, a_batch, r_batch, done)
            exp_queue.put([s_batch, a_batch, v_batch])

            actor_net_params = net_params_queue.get()
            actor.set_network_params(actor_net_params)
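
All four variants follow the same worker pattern: each `agent` process pulls fresh actor parameters from `net_params_queue`, rolls out up to `TRAIN_SEQ_LEN` steps in its own `ABREnv`, and pushes the collected batch back through `exp_queue`. Below is a minimal sketch of how such workers might be launched; `NUM_AGENTS` and `central_agent` are assumptions for illustration, not part of the snippet above.

import multiprocessing as mp

NUM_AGENTS = 4  # hypothetical worker count

def main():
    # one parameter queue and one experience queue per worker
    net_params_queues = [mp.Queue(1) for _ in range(NUM_AGENTS)]
    exp_queues = [mp.Queue(1) for _ in range(NUM_AGENTS)]

    # the coordinator (central_agent, assumed) pushes updated network
    # parameters into net_params_queues and consumes the rollouts that
    # the workers put on exp_queues
    coordinator = mp.Process(target=central_agent,
                             args=(net_params_queues, exp_queues))
    coordinator.start()

    workers = [mp.Process(target=agent,
                          args=(i, net_params_queues[i], exp_queues[i]))
               for i in range(NUM_AGENTS)]
    for w in workers:
        w.start()

    coordinator.join()
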
Code example #2
File: train_sac.py  Project: youngboy52/Pensieve-PPO
def agent(agent_id, net_params_queue, exp_queue):
    env = ABREnv(agent_id)
    with tf.Session() as sess, open(
            SUMMARY_DIR + '/log_agent_' + str(agent_id), 'w') as log_file:
        actor = Network(sess,
                        state_dim=S_DIM,
                        action_dim=A_DIM,
                        learning_rate=ACTOR_LR_RATE,
                        name='hehe')

        # initial synchronization of the network parameters from the coordinator
        net_params = net_params_queue.get()
        actor.set_network_params(net_params)

        time_stamp = 0

        for epoch in range(TRAIN_EPOCH):
            obs = env.reset()
            s_batch, a_batch, r_batch, done_batch, entropy_batch = [], [], [], [], []
            for _ in range(TRAIN_SEQ_LEN):
                s_batch.append(obs)

                action_prob = actor.get_action_prob(
                    np.reshape(obs, (1, S_DIM[0], S_DIM[1])))

                action_cumsum = np.cumsum(action_prob)
                bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) /
                            float(RAND_RANGE)).argmax()

                entropy = -np.dot(action_prob, np.log(action_prob))
                obs, rew, done, info = env.step(bit_rate)

                action_vec = np.zeros(A_DIM)
                action_vec[bit_rate] = 1
                a_batch.append(action_vec)
                r_batch.append(rew)
                done_batch.append(done)
                entropy_batch.append(entropy)
                if done:
                    break
            # v_batch, td_target = actor.compute_v(s_batch, a_batch, r_batch, done)
            exp_queue.put(
                [s_batch, a_batch, r_batch, done_batch, entropy_batch])

            actor_net_params = net_params_queue.get()
            actor.set_network_params(actor_net_params)
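
This variant samples the bitrate by comparing a uniform random draw against the cumulative action distribution. A minimal sketch of that trick in isolation, assuming `RAND_RANGE` is simply a large integer as the snippet implies:

import numpy as np

RAND_RANGE = 1000  # assumed value; any sufficiently large integer works

def sample_bit_rate(action_prob):
    # draw u roughly uniform in (0, 1), then take the first index whose
    # cumulative probability exceeds u: index i is returned with
    # probability action_prob[i]
    action_cumsum = np.cumsum(action_prob)
    u = np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)
    return int((action_cumsum > u).argmax())

probs = np.array([0.1, 0.2, 0.3, 0.4])
counts = np.bincount([sample_bit_rate(probs) for _ in range(100000)],
                     minlength=len(probs))
print(counts / counts.sum())  # roughly [0.1, 0.2, 0.3, 0.4]
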
Code example #3
def agent(agent_id, net_params_queue, exp_queue):
    env = ABREnv(agent_id)
    with tf.Session() as sess, open(
            SUMMARY_DIR + '/log_agent_' + str(agent_id), 'w') as log_file:
        actor = network.Network(sess,
                                state_dim=S_DIM,
                                action_dim=A_DIM,
                                learning_rate=ACTOR_LR_RATE)

        # initial synchronization of the network parameters from the coordinator
        actor_net_params = net_params_queue.get()
        actor.set_network_params(actor_net_params)

        time_stamp = 0

        for epoch in range(TRAIN_EPOCH):
            obs = env.reset()
            s_batch, a_batch, p_batch, r_batch = [], [], [], []
            for step in range(TRAIN_SEQ_LEN):
                s_batch.append(obs)

                action_prob = actor.predict(
                    np.reshape(obs, (1, S_DIM[0], S_DIM[1])))

                #action_cumsum = np.cumsum(action_prob)
                #bit_rate = (action_cumsum > np.random.randint(
                #    1, RAND_RANGE) / float(RAND_RANGE)).argmax()
                # gumbel noise
                noise = np.random.gumbel(size=len(action_prob))
                bit_rate = np.argmax(np.log(action_prob) + noise)

                obs, rew, done, info = env.step(bit_rate)

                action_vec = np.zeros(A_DIM)
                action_vec[bit_rate] = 1
                a_batch.append(action_vec)
                r_batch.append(rew)
                p_batch.append(action_prob)
                if done:
                    break
            v_batch = actor.compute_v(s_batch, a_batch, r_batch, done)
            exp_queue.put([s_batch, a_batch, p_batch, v_batch])

            actor_net_params = net_params_queue.get()
            actor.set_network_params(actor_net_params)
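
Instead of the cumulative-sum sampling (left commented out above), this variant uses the Gumbel-max trick: adding i.i.d. Gumbel(0, 1) noise to the log-probabilities and taking the argmax yields an exact categorical sample rather than a greedy pick. A minimal sketch with a 1-D probability vector:

import numpy as np

def gumbel_max_sample(action_prob):
    # argmax(log p + Gumbel noise) selects index i with probability p[i]
    noise = np.random.gumbel(size=len(action_prob))
    return int(np.argmax(np.log(action_prob) + noise))

probs = np.array([0.1, 0.2, 0.3, 0.4])
counts = np.bincount([gumbel_max_sample(probs) for _ in range(100000)],
                     minlength=len(probs))
print(counts / counts.sum())  # roughly matches probs
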
Code example #4
File: train.py  Project: zchao520/Zwei
def agent(agent_id, net_params_queue, exp_queue):
    env = ABREnv(agent_id)
    with tf.Session() as sess, open(SUMMARY_DIR + '/log_agent_' + str(agent_id), 'w') as log_file:
        actor = network.Network(sess,
                                state_dim=S_DIM, action_dim=A_DIM,
                                learning_rate=ACTOR_LR_RATE)

        # initial synchronization of the network parameters from the coordinator
        actor_net_params = net_params_queue.get()
        actor.set_network_params(actor_net_params)

        time_stamp = 0
        obs = env.reset()
        # env.reset()
        for epoch in range(TRAIN_EPOCH):
            env.reset_trace()
            tmp_buffer = []
            for i in range(BATTLE_ROUND):
                obs = env.reset()
                s_batch, a_batch, p_batch, bitrate_batch, rebuffer_batch = [], [], [], [], []
                for step in range(TRAIN_SEQ_LEN):
                    s_batch.append(obs)
                    action_prob = actor.predict(
                        np.reshape(obs, (1, S_DIM[0], S_DIM[1])))
                    
                    action_cumsum = np.cumsum(action_prob)
                    bit_rate = (action_cumsum > np.random.randint(
                        1, RAND_RANGE) / float(RAND_RANGE)).argmax()
                    obs, rew, done, info = env.step(bit_rate)

                    action_vec = np.zeros(A_DIM)
                    action_vec[bit_rate] = 1
                    a_batch.append(action_vec)
                    p_batch.append(action_prob)

                    bitrate_batch.append(info['bitrate'])
                    rebuffer_batch.append(info['rebuffer'])
                    if done:
                        break
                tmp_buffer.append(
                    [s_batch, a_batch, p_batch, bitrate_batch, rebuffer_batch])
            s, a, p, g = [], [], [], []
            for i in range(BATTLE_ROUND):
                w_arr = []
                for j in range(BATTLE_ROUND):
                    if i != j:
                        tmp_agent_results = []
                        # i
                        s_batch, a_batch, p_batch, bitrate_batch, rebuffer_batch = tmp_buffer[i]
                        bit_rate_ = np.mean(bitrate_batch)
                        rebuffer_ = np.mean(rebuffer_batch)
                        smoothness_ = np.mean(np.abs(np.diff(bitrate_batch)))
                        tmp_agent_results.append([bit_rate_, rebuffer_, smoothness_])
                        # j
                        s_batch, a_batch, p_batch, bitrate_batch, rebuffer_batch = tmp_buffer[j]
                        bit_rate_ = np.mean(bitrate_batch)
                        rebuffer_ = np.mean(rebuffer_batch)
                        smoothness_ = np.mean(np.abs(np.diff(bitrate_batch)))
                        tmp_agent_results.append([bit_rate_, rebuffer_, smoothness_])
                        # battle
                        w_rate_imm = rules.rules(tmp_agent_results)[0]
                        w_arr.append(w_rate_imm)
                w_rate = np.sum(w_arr) / len(w_arr)
                s_batch, a_batch, p_batch, bitrate_batch, rebuffer_batch = tmp_buffer[i]
                # Policy invariance under reward 
                for s_, a_, p_ in zip(s_batch, a_batch, p_batch):
                    s.append(s_)
                    a.append(a_)
                    p.append(p_)
                    g.append([w_rate])
            exp_queue.put([s, a, p, g])

            actor_net_params = net_params_queue.get()
            actor.set_network_params(actor_net_params)
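
This Zwei-style variant replaces a learned value target with a pairwise win rate: each rollout's mean bitrate, rebuffering, and smoothness are "battled" against every other rollout via rules.rules, and the resulting win rate is attached as a constant return g to every step of that rollout. A minimal sketch of the aggregation follows, with a hypothetical compare() standing in for rules.rules, whose actual rule set lives in the project and is not shown here:

import numpy as np

def compare(pair):
    # pair = [[bitrate_i, rebuffer_i, smooth_i], [bitrate_j, rebuffer_j, smooth_j]]
    # hypothetical rule: higher QoE-style score wins; the real rules.rules
    # may apply a different comparison
    scores = [b - 4.3 * r - s for b, r, s in pair]  # placeholder weights
    return [1.0, 0.0] if scores[0] > scores[1] else [0.0, 1.0]

def win_rates(summaries):
    # summaries[k] = [mean_bitrate, mean_rebuffer, mean_smoothness] of rollout k
    n = len(summaries)
    rates = []
    for i in range(n):
        wins = [compare([summaries[i], summaries[j]])[0]
                for j in range(n) if j != i]
        rates.append(np.sum(wins) / len(wins))
    return rates
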