    def __init__(self, sess):
        self.sess = sess
        self.actor = a3c.ActorNetwork(self.sess,
                                      state_dim=S_INFO,
                                      action_dim=A_DIM,
                                      learning_rate=ACTOR_LR_RATE)
        self.critic = a3c.CriticNetwork(self.sess,
                                        state_dim=S_INFO,
                                        learning_rate=CRITIC_LR_RATE)

        self.summary_ops, self.summary_vars = a3c.build_summaries()

        self.sess.run(tf.global_variables_initializer())
        self.writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)
        self.saver = tf.train.Saver()

        # restore neural network parameters
        if NN_MODEL is not None:
            self.saver.restore(self.sess, NN_MODEL)
            print("Model restored.")

        self.epoch = 0
        self.i_episode = 0
        self.total_reward = 0.0

        self.s = env.reset()
Example 2
    def __init__(self, sess, a_dims, s_lengths, nn_model):
        '''
        Initialize the learner.

        :a_dims: array containing the dimension of each action space
        :s_lengths: array containing the history length of each metric
        :nn_model: path of a saved model to restore, or None to start from scratch
        '''
        if not os.path.exists(SUMMARY_DIR):
            os.makedirs(SUMMARY_DIR)

        self.sess = sess
        self.a_dims = a_dims
        self.s_lengths = s_lengths

        self.s_dims = len(self.s_lengths), max(self.s_lengths)

        self.actor = a3c.ActorNetwork(self.sess, self.a_dims, self.s_lengths,
                                      ACTOR_LR_RATE)
        self.critic = a3c.CriticNetwork(self.sess, self.a_dims, self.s_lengths,
                                        CRITIC_LR_RATE)

        self.summary_ops, self.summary_vars = a3c.build_summaries()

        self.sess.run(tf.global_variables_initializer())
        self.writer = tf.summary.FileWriter(SUMMARY_DIR, self.sess.graph)
        self.saver = tf.train.Saver()

        if nn_model is not None:
            self.saver.restore(self.sess, nn_model)
            logging.info('Model restored.')

        self.entropy_record = list()
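A short numpy-only illustration of the state shape computed above (s_dims = (len(s_lengths), max(s_lengths))). How a3c actually consumes the per-metric histories is not shown in this snippet, so the zero-padded layout below is only an assumption:

import numpy as np

s_lengths = [8, 8, 6, 1]                        # assumed per-metric history lengths
s_dims = (len(s_lengths), max(s_lengths))       # -> (4, 8), as in the constructor above

state = np.zeros(s_dims)
metrics = [np.arange(l, dtype=float) for l in s_lengths]   # fake per-metric histories
for row, metric in enumerate(metrics):
    state[row, :len(metric)] = metric           # left-align each metric, zero-pad the rest
print(state.shape)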
Example 3
    def __init__(self, checkpoint):
        self.sess = tf.Session()

        self.actor = a3c.ActorNetwork(self.sess,
                                      state_dim=[S_INFO, S_LEN],
                                      action_dim=A_DIM,
                                      learning_rate=ACTOR_LR_RATE)

        summary_ops, summary_vars = a3c.build_summaries()

        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()  # save neural net parameters

        # restore neural net parameters
        self.saver.restore(self.sess, checkpoint)
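For reference, a minimal tf.train.Saver save/restore round trip (assuming TensorFlow 1.x), mirroring how the class above loads `checkpoint` into a fresh session before predicting; the variable and path are made up for the sketch:

import tensorflow as tf

v = tf.get_variable('v', initializer=tf.constant(42.0))
saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    path = saver.save(sess, '/tmp/nn_model_demo.ckpt')   # hypothetical checkpoint path

with tf.Session() as sess:
    saver.restore(sess, path)   # no initializer needed; values come from the checkpoint
    print(sess.run(v))          # 42.0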
Example 4
def central_agent(net_params_queues, exp_queues):

    assert len(net_params_queues) == NUM_AGENTS
    assert len(exp_queues) == NUM_AGENTS

    logging.basicConfig(filename=LOG_FILE + '_central',
                        filemode='w',
                        level=logging.INFO)

    with tf.Session() as sess, open(LOG_FILE + '_test', 'w') as test_log_file:

        actor = a3c.ActorNetwork(sess,
                                 state_dim=[S_INFO, S_LEN], action_dim=A_DIM,
                                 learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess,
                                   state_dim=[S_INFO, S_LEN],
                                   learning_rate=CRITIC_LR_RATE)

        summary_ops, summary_vars = a3c.build_summaries()

        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)  # training monitor
        saver = tf.train.Saver(max_to_keep=50000)  # save neural net parameters

        # restore neural net parameters
        nn_model = NN_MODEL
        if nn_model is not None:  # nn_model is the path to file
            saver.restore(sess, nn_model)
            print("Model restored.")

        epoch = 0

        # assemble experiences from agents, compute the gradients
        while epoch <= num_epochs:
            # synchronize the network parameters of work agent
            actor_net_params = actor.get_network_params()
            critic_net_params = critic.get_network_params()
            for i in range(NUM_AGENTS):
                net_params_queues[i].put([actor_net_params, critic_net_params])
                # Note: this is synchronous version of the parallel training,
                # which is easier to understand and probe. The framework can be
                # fairly easily modified to support asynchronous training.
                # Some practices of asynchronous training (lock-free SGD at
                # its core) are nicely explained in the following two papers:
                # https://arxiv.org/abs/1602.01783
                # https://arxiv.org/abs/1106.5730

            # record average reward and td loss change
            # in the experiences from the agents
            total_batch_len = 0.0
            total_reward = 0.0
            total_td_loss = 0.0
            total_entropy = 0.0
            total_agents = 0.0 

            # assemble experiences from the agents
            actor_gradient_batch = []
            critic_gradient_batch = []

            for i in range(NUM_AGENTS):
                s_batch, a_batch, r_batch, terminal, info = exp_queues[i].get()

                actor_gradient, critic_gradient, td_batch = \
                    a3c.compute_gradients(
                        s_batch=np.stack(s_batch, axis=0),
                        a_batch=np.vstack(a_batch),
                        r_batch=np.vstack(r_batch),
                        terminal=terminal, actor=actor, critic=critic)

                actor_gradient_batch.append(actor_gradient)
                critic_gradient_batch.append(critic_gradient)

                total_reward += np.sum(r_batch)
                total_td_loss += np.sum(td_batch)
                total_batch_len += len(r_batch)
                total_agents += 1.0
                total_entropy += np.sum(info['entropy'])

            # compute aggregated gradient
            assert NUM_AGENTS == len(actor_gradient_batch)
            assert len(actor_gradient_batch) == len(critic_gradient_batch)
            # assembled_actor_gradient = actor_gradient_batch[0]
            # assembled_critic_gradient = critic_gradient_batch[0]
            # for i in range(len(actor_gradient_batch) - 1):
            #     for j in range(len(assembled_actor_gradient)):
            #             assembled_actor_gradient[j] += actor_gradient_batch[i][j]
            #             assembled_critic_gradient[j] += critic_gradient_batch[i][j]
            # actor.apply_gradients(assembled_actor_gradient)
            # critic.apply_gradients(assembled_critic_gradient)
            for i in range(len(actor_gradient_batch)):
                actor.apply_gradients(actor_gradient_batch[i])
                critic.apply_gradients(critic_gradient_batch[i])

            # log training information
            epoch += 1
            avg_reward = total_reward / total_agents
            avg_td_loss = total_td_loss / total_batch_len
            avg_entropy = total_entropy / total_batch_len

            logging.info('Epoch: ' + str(epoch) +
                         ' TD_loss: ' + str(avg_td_loss) +
                         ' Avg_reward: ' + str(avg_reward) +
                         ' Avg_entropy: ' + str(avg_entropy))

            summary_str = sess.run(summary_ops, feed_dict={
                summary_vars[0]: avg_td_loss,
                summary_vars[1]: avg_reward,
                summary_vars[2]: avg_entropy
            })

            writer.add_summary(summary_str, epoch)
            writer.flush()

            if epoch % MODEL_SAVE_INTERVAL == 0:
                # Save the neural net parameters to disk.
                save_path = saver.save(sess, SUMMARY_DIR + "/nn_model_ep_" +
                                       str(epoch) + ".ckpt")
                logging.info("Model saved in file: " + save_path)
                testing(
                    epoch,
                    SUMMARY_DIR + "/nn_model_ep_" + str(epoch) + ".ckpt", 
                    test_log_file
                )
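The synchronous parameter-broadcast / experience-gather loop in central_agent can be sketched on its own with multiprocessing queues. This is a minimal sketch with a fake worker and fake experience; the real agents would run the environment and a3c rollouts instead:

import multiprocessing as mp
import numpy as np

NUM_AGENTS = 2  # assumed; the real value comes from the training script

def worker(net_params_queue, exp_queue):
    params = net_params_queue.get()                 # block until the central agent broadcasts
    # ... roll out an episode with these params; here we just fake a tiny experience batch
    exp_queue.put(([np.zeros((6, 8))], [0], [1.0], True))

if __name__ == '__main__':
    net_params_queues = [mp.Queue(1) for _ in range(NUM_AGENTS)]
    exp_queues = [mp.Queue(1) for _ in range(NUM_AGENTS)]
    workers = [mp.Process(target=worker, args=(net_params_queues[i], exp_queues[i]))
               for i in range(NUM_AGENTS)]
    for w in workers:
        w.start()
    for q in net_params_queues:                     # synchronous broadcast of the latest params
        q.put(['actor_params', 'critic_params'])
    batches = [q.get() for q in exp_queues]         # gather one experience batch per agent
    for w in workers:
        w.join()
    print('collected', len(batches), 'batches')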
Example 5
def main():
    # utility_offset = -math.log(VIDEO_BIT_RATE[0])  # so utilities[0] = 0
    # utilities = [math.log(b) + utility_offset for b in VIDEO_BIT_RATE]

    np.random.seed(RANDOM_SEED)

    assert len(VIDEO_BIT_RATE) == A_DIM

    all_cooked_time, all_cooked_bw, _ = load_trace.load_trace()
    load_trace.plot_bandwidth(all_cooked_time, all_cooked_bw, _)
    if not os.path.exists(SUMMARY_DIR):
        os.makedirs(SUMMARY_DIR)

    net_env = env.Environment(all_cooked_time=all_cooked_time,
                              all_cooked_bw=all_cooked_bw)

    with tf.Session() as sess, open(LOG_FILE, 'w') as log_file:

        actor = a3c.ActorNetwork(sess,
                                 state_dim=[S_INFO, S_LEN],
                                 action_dim=A_DIM,
                                 learning_rate=ACTOR_LR_RATE)

        critic = a3c.CriticNetwork(sess,
                                   state_dim=[S_INFO, S_LEN],
                                   learning_rate=CRITIC_LR_RATE)

        summary_ops, summary_vars = a3c.build_summaries()

        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(SUMMARY_DIR,
                                       sess.graph)  # training monitor
        saver = tf.train.Saver()  # save neural net parameters

        # restore neural net parameters
        nn_model = NN_MODEL
        if nn_model is not None:  # nn_model is the path to file
            saver.restore(sess, nn_model)
            print("Model restored.")

        epoch = 0
        time_stamp = 0

        last_bit_rate = DEFAULT_QUALITY
        bit_rate = DEFAULT_QUALITY

        action_vec = np.zeros(A_DIM)
        action_vec[bit_rate] = 1

        s_batch = [np.zeros((S_INFO, S_LEN))]
        a_batch = [action_vec]
        r_batch = []
        entropy_record = []

        actor_gradient_batch = []
        critic_gradient_batch = []

        while True:  # serve video forever
            # the action is from the last decision
            # this is to make the framework similar to the real
            delay, sleep_time, buffer_size, rebuf, \
            video_chunk_size, next_video_chunk_sizes, \
            end_of_video, video_chunk_counter, throughput, video_chunk_remain = \
                net_env.get_video_chunk(bit_rate)
            #print(net_env.get_video_chunk(bit_rate))
            time_stamp += delay  # in ms
            time_stamp += sleep_time  # in ms

            # reward is video quality - rebuffer penalty - smooth penalty
            reward = VIDEO_BIT_RATE[bit_rate] / M_IN_K \
                     - REBUF_PENALTY * rebuf \
                     - SMOOTH_PENALTY * np.abs(VIDEO_BIT_RATE[bit_rate] -
                                               VIDEO_BIT_RATE[last_bit_rate]) / M_IN_K
            r_batch.append(reward)

            last_bit_rate = bit_rate

            # retrieve previous state
            if len(s_batch) == 0:
                state = [np.zeros((S_INFO, S_LEN))]
            else:
                state = np.array(s_batch[-1], copy=True)
            # print(state)

            # dequeue history record
            state = np.roll(state, -1, axis=1)

            # this should be S_INFO number of terms
            state[0, -1] = VIDEO_BIT_RATE[bit_rate] / float(
                np.max(VIDEO_BIT_RATE))  # last quality
            state[1, -1] = buffer_size / BUFFER_NORM_FACTOR  # 10 sec
            state[2, -1] = float(video_chunk_size) / float(
                delay) / M_IN_K  # kilo byte / ms
            state[3, -1] = float(delay) / M_IN_K / BUFFER_NORM_FACTOR  # 10 sec
            state[4, :A_DIM] = np.array(
                next_video_chunk_sizes) / M_IN_K / M_IN_K  # mega byte
            state[5, -1] = np.minimum(
                video_chunk_remain,
                CHUNK_TIL_VIDEO_END_CAP) / float(CHUNK_TIL_VIDEO_END_CAP)

            # print('state',state)
            action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN)))
            action_cumsum = np.cumsum(action_prob)
            rand = np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)
            print(action_cumsum, action_cumsum > rand,
                  (action_cumsum > rand).argmax())
            # print(action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE))
            # print(action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()

            #compute Vp and map bitrate
            # bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()

            Vp_index = (action_cumsum > np.random.randint(1, RAND_RANGE) /
                        float(RAND_RANGE)).argmax()
            Vp = BUFFER_PARAMETER[Vp_index]
            # Note: we need to discretize the probability into 1/RAND_RANGE steps,
            # because there is an intrinsic discrepancy in passing single state and batch states

            config = {
                'buffer_size': env.BUFFER_THRESH,
                'gp': GP,
                'Vp': Vp,
                'abr_osc': False,
                'abr_basic': False,
                'no_ibr': False
            }
            bola = get_bitrate.Bola(config=config)
            bit_rate = bola.get_quality(
                Vp, buffer_size * env.MILLISECONDS_IN_SECOND, last_bit_rate,
                throughput)

            # information available before making the decision
            print(
                '[%d]: download time %.2fms, thrput=%.2f, chunk size %d, buffer=%.2fs, bitrate=%d'
                % (video_chunk_counter, delay, throughput, video_chunk_size,
                   buffer_size, last_bit_rate))

            entropy_record.append(a3c.compute_entropy(action_prob[0]))

            # log time_stamp, bit_rate, buffer_size, reward
            log_file.write(
                str(time_stamp) + '\t' + str(VIDEO_BIT_RATE[bit_rate]) + '\t' +
                str(buffer_size) + '\t' + str(rebuf) + '\t' +
                str(video_chunk_size) + '\t' + str(delay) + '\t' +
                str(reward) + '\n')
            log_file.flush()

            if len(r_batch) >= TRAIN_SEQ_LEN or end_of_video:  # do training once

                actor_gradient, critic_gradient, td_batch = \
                    a3c.compute_gradients(s_batch=np.stack(s_batch[1:], axis=0),  # ignore the first chunk
                                          a_batch=np.vstack(a_batch[1:]),  # since we don't have the
                                          r_batch=np.vstack(r_batch[1:]),  # control over it
                                          terminal=end_of_video, actor=actor, critic=critic)
                td_loss = np.mean(td_batch)

                actor_gradient_batch.append(actor_gradient)
                critic_gradient_batch.append(critic_gradient)

                print("====")
                print("Epoch", epoch)
                print("TD_loss", td_loss, "Avg_reward", np.mean(r_batch),
                      "Avg_entropy", np.mean(entropy_record))
                print("====")

                summary_str = sess.run(summary_ops,
                                       feed_dict={
                                           summary_vars[0]: td_loss,
                                           summary_vars[1]: np.mean(r_batch),
                                           summary_vars[2]: np.mean(entropy_record)
                                       })

                writer.add_summary(summary_str, epoch)
                writer.flush()

                entropy_record = []

                if len(actor_gradient_batch) >= GRADIENT_BATCH_SIZE:

                    assert len(actor_gradient_batch) == len(
                        critic_gradient_batch)
                    # assembled_actor_gradient = actor_gradient_batch[0]
                    # assembled_critic_gradient = critic_gradient_batch[0]
                    # assert len(actor_gradient_batch) == len(critic_gradient_batch)
                    # for i in xrange(len(actor_gradient_batch) - 1):
                    #     for j in xrange(len(actor_gradient)):
                    #         assembled_actor_gradient[j] += actor_gradient_batch[i][j]
                    #         assembled_critic_gradient[j] += critic_gradient_batch[i][j]
                    # actor.apply_gradients(assembled_actor_gradient)
                    # critic.apply_gradients(assembled_critic_gradient)

                    for i in range(len(actor_gradient_batch)):
                        actor.apply_gradients(actor_gradient_batch[i])
                        critic.apply_gradients(critic_gradient_batch[i])

                    actor_gradient_batch = []
                    critic_gradient_batch = []

                    epoch += 1
                    if epoch % MODEL_SAVE_INTERVAL == 0:
                        # Save the neural net parameters to disk.
                        save_path = saver.save(
                            sess, SUMMARY_DIR + "/nn_model_ep_" + str(epoch) +
                            ".ckpt")
                        print("Model saved in file: %s" % save_path)

                del s_batch[:]
                del a_batch[:]
                del r_batch[:]

            if end_of_video:
                last_bit_rate = DEFAULT_QUALITY
                bit_rate = DEFAULT_QUALITY  # use the default action here

                action_vec = np.zeros(A_DIM)
                action_vec[bit_rate] = 1

                s_batch.append(np.zeros((S_INFO, S_LEN)))
                a_batch.append(action_vec)

            else:
                s_batch.append(state)

                action_vec = np.zeros(A_DIM)
                # print(bit_rate)
                action_vec[bit_rate] = 1
                a_batch.append(action_vec)
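Two numpy idioms used repeatedly above, shown in isolation: the np.roll sliding state window and the cumulative-sum trick for sampling an action index from the policy output (all constants are assumed):

import numpy as np

S_INFO, S_LEN, A_DIM, RAND_RANGE = 6, 8, 6, 1000   # assumed values

state = np.zeros((S_INFO, S_LEN))
state = np.roll(state, -1, axis=1)   # shift history left: the oldest column falls off ...
state[0, -1] = 0.75                  # ... and the newest observation goes in the last column

action_prob = np.full(A_DIM, 1.0 / A_DIM)          # stand-in for actor.predict(...) output
action_cumsum = np.cumsum(action_prob)
rand = np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)
action = (action_cumsum > rand).argmax()           # first index whose cumulative mass exceeds rand
print(action)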
Example 6
def central_agent(net_params_queues, exp_queues):

    assert len(net_params_queues) == NUM_AGENTS
    assert len(exp_queues) == NUM_AGENTS

    with tf.Session() as sess, open(SUMMARY_DIR + '/log_central',
                                    'w') as log_file:

        actor = a3c.ActorNetwork(sess,
                                 state_dim=S_DIM,
                                 action_dim=A_DIM,
                                 learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess,
                                   state_dim=S_DIM,
                                   learning_rate=CRITIC_LR_RATE)

        summary_ops, summary_vars = a3c.build_summaries()

        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(SUMMARY_DIR,
                                       sess.graph)  # training monitor
        saver = tf.train.Saver()  # save neural net parameters

        # restore neural net parameters
        nn_model = NN_MODEL
        if nn_model is not None:  # nn_model is the path to file
            saver.restore(sess, nn_model)
            print("Model restored.")

        # assemble experiences from agents, compute the gradients
        for ep in range(TRAIN_EPOCH):
            # synchronize the network parameters of work agent
            actor_net_params = actor.get_network_params()
            critic_net_params = critic.get_network_params()
            for i in range(NUM_AGENTS):
                net_params_queues[i].put([actor_net_params, critic_net_params])

            # record average reward and td loss change
            # in the experiences from the agents
            total_batch_len = 0.0
            total_reward = 0.0
            total_td_loss = 0.0
            total_agents = 0.0

            # assemble experiences from the agents
            actor_gradient_batch = []
            critic_gradient_batch = []

            for i in range(NUM_AGENTS):
                s_batch, a_batch, r_batch, terminal = exp_queues[i].get()

                actor_gradient, critic_gradient, td_batch = \
                    a3c.compute_gradients(
                        # s_batch=np.vstack(s_batch),
                        s_batch=np.stack(s_batch, axis=0),
                        a_batch=np.vstack(a_batch),
                        r_batch=np.vstack(r_batch),
                        terminal=terminal, actor=actor, critic=critic)

                actor_gradient_batch.append(actor_gradient)
                critic_gradient_batch.append(critic_gradient)

                total_reward += np.sum(r_batch)
                total_td_loss += np.sum(td_batch)
                total_batch_len += len(r_batch)
                total_agents += 1.0

            # compute aggregated gradient
            assert NUM_AGENTS == len(actor_gradient_batch)
            assert len(actor_gradient_batch) == len(critic_gradient_batch)

            for i in range(len(actor_gradient_batch)):
                actor.apply_gradients(actor_gradient_batch[i])
                critic.apply_gradients(critic_gradient_batch[i])

            # log training information
            avg_reward = total_reward / total_agents
            avg_td_loss = total_td_loss / total_batch_len

            log_file.write('Epoch: ' + str(ep) + ' TD_loss: ' +
                           str(avg_td_loss) + ' Avg_reward: ' +
                           str(avg_reward) + '\n')
            log_file.flush()

            summary_str = sess.run(summary_ops,
                                   feed_dict={
                                       summary_vars[0]: avg_td_loss,
                                       summary_vars[1]: avg_reward
                                   })

            writer.add_summary(summary_str, ep)
            writer.flush()

            if ep % MODEL_SAVE_INTERVAL == 0:
                # Save the neural net parameters to disk.
                save_path = saver.save(
                    sess, MODEL_DIR + "/nn_model_ep_" + str(ep) + ".ckpt")
Example 7
def central_agent(net_params_queues, exp_queues):

    assert len(net_params_queues) == NUM_AGENTS
    assert len(exp_queues) == NUM_AGENTS

    logging.basicConfig(filename=LOG_FILE + '_central',
                        filemode='w',
                        level=logging.INFO)

    with tf.Session() as sess, open(LOG_FILE + '_test', 'wb') as test_log_file:

        actor = a3c.ActorNetwork(sess,
                                 state_dim=[S_INFO, S_LEN],
                                 action_dim=A_DIM,
                                 learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess,
                                   state_dim=[S_INFO, S_LEN],
                                   learning_rate=CRITIC_LR_RATE)

        summary_ops, summary_vars = a3c.build_summaries()

        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(SUMMARY_DIR,
                                       sess.graph)  # training monitor
        saver = tf.train.Saver(max_to_keep=10000)  # save neural net parameters

        # restore neural net parameters
        nn_model = NN_MODEL
        if nn_model == "None":
            nn_model = None
        if nn_model is not None:  # nn_model is the checkpoint file name
            epoch = int(nn_model.replace("nn_model_ep_", "").split(".ckpt")[0])
            saver.restore(sess, MODEL_DIR + nn_model)
            print("Model restored.")
        else:
            epoch = 0

        # assemble experiences from agents, compute the gradients
        while True:
            # synchronize the network parameters of work agent
            actor_net_params = actor.get_network_params()
            critic_net_params = critic.get_network_params()
            for i in xrange(NUM_AGENTS):
                net_params_queues[i].put([actor_net_params, critic_net_params])

            # record average reward and td loss change
            # in the experiences from the agents
            total_batch_len = 0.0
            total_reward = 0.0
            total_td_loss = 0.0
            total_entropy = 0.0
            total_agents = 0.0

            # assemble experiences from the agents
            actor_gradient_batch = []
            critic_gradient_batch = []

            for i in xrange(NUM_AGENTS):
                s_batch, a_batch, r_batch, terminal, info = exp_queues[i].get()

                actor_gradient, critic_gradient, td_batch = \
                    a3c.compute_gradients(
                        s_batch=np.stack(s_batch, axis=0),
                        a_batch=np.vstack(a_batch),
                        r_batch=np.vstack(r_batch),
                        terminal=terminal, actor=actor, critic=critic)

                for i in xrange(len(actor_gradient)):
                    assert np.any(np.isnan(actor_gradient[i])) == False

                actor_gradient_batch.append(actor_gradient)
                critic_gradient_batch.append(critic_gradient)

                total_reward += np.sum(r_batch)
                total_td_loss += np.sum(td_batch)
                total_batch_len += len(r_batch)
                total_agents += 1.0
                total_entropy += np.sum(info['entropy'])

            # compute aggregated gradient
            assert NUM_AGENTS == len(actor_gradient_batch)
            assert len(actor_gradient_batch) == len(critic_gradient_batch)
            # assembled_actor_gradient = actor_gradient_batch[0]
            # assembled_critic_gradient = critic_gradient_batch[0]
            # for i in xrange(len(actor_gradient_batch) - 1):
            #     for j in xrange(len(assembled_actor_gradient)):
            #             assembled_actor_gradient[j] += actor_gradient_batch[i][j]
            #             assembled_critic_gradient[j] += critic_gradient_batch[i][j]
            # actor.apply_gradients(assembled_actor_gradient)
            # critic.apply_gradients(assembled_critic_gradient)
            for i in xrange(len(actor_gradient_batch)):
                actor.apply_gradients(actor_gradient_batch[i])
                critic.apply_gradients(critic_gradient_batch[i])

            # log training information
            epoch += 1
            avg_reward = total_reward / total_agents
            avg_td_loss = total_td_loss / total_batch_len
            avg_entropy = total_entropy / total_batch_len

            logging.info('Epoch: ' + str(epoch) + ' TD_loss: ' +
                         str(avg_td_loss) + ' Avg_reward: ' + str(avg_reward) +
                         ' Avg_entropy: ' + str(avg_entropy))

            summary_str = sess.run(summary_ops,
                                   feed_dict={
                                       summary_vars[0]: avg_td_loss,
                                       summary_vars[1]: avg_reward,
                                       summary_vars[2]: avg_entropy
                                   })

            writer.add_summary(summary_str, epoch)
            writer.flush()

            if epoch % MODEL_SAVE_INTERVAL == 0:
                # Save the neural net parameters to disk.
                save_path = saver.save(
                    sess, MODEL_DIR + "nn_model_ep_" + str(epoch) + ".ckpt")
                logging.info("Model saved in file: " + save_path)
                testing(epoch,
                        MODEL_DIR + "nn_model_ep_" + str(epoch) + ".ckpt",
                        test_log_file)
Example 8
def central_agent(net_params_queues, exp_queues):  # the arguments are two lists, each holding 16 (process?) queues

    # open a Session() {
    #     build the neural networks
    #     build a tf.summary (apparently used to monitor data for visualization)
    #     initialize the network parameters and load any saved model
    #     loop {
    #         put the network parameters into the Queue, once per child agent
    #         initialize variables and the batch[] containers
    #         get the batch[] data sent back by the child agents from the Queues,
    #         combine it, and run a gradient-descent optimizer step
    #         write the data to file
    #         every fixed number of iterations, save the network again
    #     }
    # }
    assert len(net_params_queues) == NUM_AGENTS
    assert len(exp_queues) == NUM_AGENTS

    logging.basicConfig(filename=LOG_FILE + '_central',
                        filemode='w',
                        level=logging.INFO)  # create the log file?

    with tf.Session() as sess, open(LOG_FILE + '_test', 'wb') as test_log_file:

        # build the actor network; arguments: the TensorFlow Session, [number of input metrics, bandwidth history length], number of outputs (bitrate levels), learning rate
        actor = a3c.ActorNetwork(sess,
                                 state_dim=[S_INFO, S_LEN],
                                 action_dim=A_DIM,
                                 learning_rate=ACTOR_LR_RATE)
        # build the critic network; arguments: the TensorFlow Session, [number of input metrics, bandwidth history length], learning rate
        critic = a3c.CriticNetwork(sess,
                                   state_dim=[S_INFO, S_LEN],
                                   learning_rate=CRITIC_LR_RATE)

        summary_ops, summary_vars = a3c.build_summaries()  # what is being summarized?

        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(SUMMARY_DIR,
                                       sess.graph)  # training monitor
        saver = tf.train.Saver()  # save neural net parameters

        # restore neural net parameters
        nn_model = NN_MODEL
        if nn_model is not None:  # nn_model is the path to file
            saver.restore(sess, nn_model)
            print("Model restored.")

        epoch = 0

        # assemble experiences from agents, compute the gradients
        while True:
            # synchronize the network parameters of the worker agents
            actor_net_params = actor.get_network_params()
            critic_net_params = critic.get_network_params()
            for i in xrange(NUM_AGENTS):  # 0-15
                net_params_queues[i].put([actor_net_params, critic_net_params
                                          ])  # put the parameters into the queue of each worker process
                # Note: this is synchronous version of the parallel training,
                # which is easier to understand and probe. The framework can be
                # fairly easily modified to support asynchronous training.
                # Some practices of asynchronous training (lock-free SGD at
                # its core) are nicely explained in the following two papers:
                # https://arxiv.org/abs/1602.01783
                # https://arxiv.org/abs/1106.5730

            # record average reward and td loss change
            # in the experiences from the agents
            total_batch_len = 0.0
            total_reward = 0.0
            total_td_loss = 0.0
            total_entropy = 0.0
            total_agents = 0.0

            # assemble experiences from the agents
            actor_gradient_batch = []
            critic_gradient_batch = []

            for i in xrange(NUM_AGENTS):  # 0-15
                s_batch, a_batch, r_batch, terminal, info = exp_queues[i].get(
                )  # pull the data sent back by each worker process out of its queue?

                actor_gradient, critic_gradient, td_batch = \
                    a3c.compute_gradients(  # compute the gradients?
                        s_batch=np.stack(s_batch, axis=0),
                        a_batch=np.vstack(a_batch),
                        r_batch=np.vstack(r_batch),
                        terminal=terminal, actor=actor, critic=critic)

                actor_gradient_batch.append(actor_gradient)
                critic_gradient_batch.append(critic_gradient)

                total_reward += np.sum(r_batch)
                total_td_loss += np.sum(td_batch)
                total_batch_len += len(r_batch)
                total_agents += 1.0
                total_entropy += np.sum(info['entropy'])  # take the entropy values out of the info dict

            # compute aggregated gradient
            assert NUM_AGENTS == len(actor_gradient_batch)
            assert len(actor_gradient_batch) == len(critic_gradient_batch)
            # assembled_actor_gradient = actor_gradient_batch[0]
            # assembled_critic_gradient = critic_gradient_batch[0]
            # for i in xrange(len(actor_gradient_batch) - 1):
            #     for j in xrange(len(assembled_actor_gradient)):
            #             assembled_actor_gradient[j] += actor_gradient_batch[i][j]
            #             assembled_critic_gradient[j] += critic_gradient_batch[i][j]
            # actor.apply_gradients(assembled_actor_gradient)
            # critic.apply_gradients(assembled_critic_gradient)
            for i in xrange(len(actor_gradient_batch)):
                actor.apply_gradients(actor_gradient_batch[i])
                critic.apply_gradients(critic_gradient_batch[i])

            # log training information
            epoch += 1
            avg_reward = total_reward / total_agents
            avg_td_loss = total_td_loss / total_batch_len
            avg_entropy = total_entropy / total_batch_len

            logging.info('Epoch: ' + str(epoch) + ' TD_loss: ' +
                         str(avg_td_loss) + ' Avg_reward: ' + str(avg_reward) +
                         ' Avg_entropy: ' + str(avg_entropy))  # write to the log

            summary_str = sess.run(summary_ops,
                                   feed_dict={
                                       summary_vars[0]: avg_td_loss,
                                       summary_vars[1]: avg_reward,
                                       summary_vars[2]: avg_entropy
                                   })

            writer.add_summary(summary_str, epoch)
            writer.flush()

            if epoch % MODEL_SAVE_INTERVAL == 0:
                # Save the neural net parameters to disk.
                save_path = saver.save(
                    sess, SUMMARY_DIR + "/nn_model_ep_" + str(epoch) + ".ckpt")
                logging.info("Model saved in file: " + save_path)
                testing(epoch,
                        SUMMARY_DIR + "/nn_model_ep_" + str(epoch) + ".ckpt",
                        test_log_file)  # run a test?
Example 9
def run(port=8333, log_file_path=LOG_FILE):

    np.random.seed(RANDOM_SEED)
    with tf.Session() as sess, open(log_file_path, 'wb') as log_file:

        actor = a3c.ActorNetwork(sess,
                                 state_dim=S_DIM,
                                 action_dim=A_DIM,
                                 learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess,
                                   state_dim=S_DIM,
                                   learning_rate=CRITIC_LR_RATE)

        summary_ops, summary_vars = a3c.build_summaries()
        sess.run(tf.initialize_all_variables())
        writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)
        saver = tf.train.Saver()  # save neural net parameters

        #restore neural net parameters
        nn_model = NN_MODEL
        if nn_model is not None:  # nn_model is the path to file
            saver.restore(sess, nn_model)
            print("Model restored.")

        init_action = np.zeros(A_DIM)
        # by default we simply use the first lambda
        init_action[DEFAULT_LAMBDA] = 1

        s_batch = [np.zeros(S_DIM)]
        a_batch = [init_action]
        r_batch = []
        entropy_record = []  #this is for training

        actor_gradient_batch = []  #this is for training
        critic_gradient_batch = []  #this is for training

        last_lambda = DEFAULT_LAMBDA
        epoch = 0
        end_of_training = False
        # Create a TCP/IP socket
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

        # Bind the socket to the port
        server_address = ('localhost', port)
        print >> sys.stderr, 'starting up on %s port %s' % server_address
        sock.bind(server_address)

        # Listen for incoming connections
        sock.listen(5)
        count = 0

        while True:
            # Wait for a connection
            print >> sys.stderr, 'waiting for a connection'
            connection, addr = sock.accept()
            print 'Connected with ' + addr[0] + ':' + str(addr[1])

            # Receive the json file
            # json file format:
            # 'reward': float
            # 'state': array = '{"state": ["1", "3", "4", ...]}'
            #numBytes = sys.getsizeof(int)
            #print ("size to receive: " + str(numBytes))
            size = connection.recv(4)
            size = struct.unpack('!i', size)[0]
            print >> sys.stderr, 'received "%s"' % size
            data = connection.recv(size)
            jsonData = json.loads(data)
            print jsonData

            #to receive reward
            reward = float(jsonData['reward'])
            if (count > 0):
                r_batch.append(reward)
            else:
                r_batch.append(0.0)

            count = count + 1
            #to receive state
            stateArray = jsonData['state']
            state = np.array(stateArray)
            print(state)
            #to compute action
            action_prob = actor.predict(np.reshape(state, (1, S_DIM)))
            print("action_prob: ")
            print(action_prob)
            action_cumsum = np.cumsum(action_prob)
            print("action_cumsum: ")
            print(action_cumsum)
            print("comparison: ")
            print(action_cumsum >
                  np.random.randint(1, RAND_RANGE) / float(RAND_RANGE))
            selectedLambda = action_prob.argmax()
            #selectedLambda = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()
            print >> sys.stderr, 'selectedLambda "%s"' % selectedLambda
            #to update entropy
            entropy_record.append(a3c.compute_entropy(action_prob[0]))  #TODO

            #to update and apply gradient
            if len(r_batch) >= TRAIN_SEQ_LEN:
                actor_gradient, critic_gradient, td_batch = \
                   a3c.compute_gradients(s_batch=np.stack(s_batch[1:], axis=0),
                                        a_batch=np.vstack(a_batch[1:]),
                                        r_batch=np.vstack(r_batch[1:]),
                                        terminal=end_of_training, actor=actor, critic=critic)
                td_loss = np.mean(td_batch)

                print("td_loss: ")
                print(td_loss)
                print("actor_gradient: ")
                print(actor_gradient)
                print("critic_gradient: ")
                print(critic_gradient)

                actor_gradient_batch.append(actor_gradient)
                critic_gradient_batch.append(critic_gradient)

                entropy_record = []
                print("len(actor_gradient_batch) = ")
                print len(actor_gradient_batch)
                if len(actor_gradient_batch) >= GRADIENT_BATCH_SIZE:
                    print("GRADIENT_BATCH_SIZE reached")
                    assert len(actor_gradient_batch) == len(
                        critic_gradient_batch)
                    for i in xrange(len(actor_gradient_batch)):
                        print("###################" + str(i) +
                              "###################")
                        print(actor_gradient_batch[i])
                        print(critic_gradient_batch[i])
                        actor.apply_gradients(actor_gradient_batch[i])
                        critic.apply_gradients(critic_gradient_batch[i])

                    actor_gradient_batch = []
                    critic_gradient_batch = []

                    avg_reward = np.mean(r_batch)
                    summary_str = sess.run(summary_ops,
                                           feed_dict={
                                               summary_vars[0]: td_loss,
                                               summary_vars[1]: avg_reward
                                           })

                    writer.add_summary(summary_str, epoch)
                    writer.flush()
                    log_file.write(
                        str(datetime.datetime.now().strftime(
                            '%Y-%m-%d %H:%M:%S')) + '\t' + str(epoch) + '\t' +
                        str(avg_reward) + '\t' + str(td_loss) + '\n')
                    log_file.flush()

                    epoch += 1
                    if epoch % MODEL_SAVE_INTERVAL == 0:
                        # save the neural net parameters to disk.
                        save_path = saver.save(
                            sess, "./nn_model_ep_" + str(epoch) + ".ckpt")
                        print("Model saved in file: %s" % save_path)

                    if epoch == MAX_EPOCH:
                        end_of_training = True

                del s_batch[:]
                del a_batch[:]
                del r_batch[:]

            s_batch.append(state)
            action_vec = np.zeros(A_DIM)
            action_vec[selectedLambda] = 1
            a_batch.append(action_vec)

            #to send back action
            print >> sys.stderr, 'sending data back to the client'
            connection.sendall(struct.pack('!i', selectedLambda))
            last_lambda = selectedLambda
            connection.close()

        sock.close()
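The wire format run() expects is a 4-byte big-endian length header followed by a JSON payload; below is a self-contained round trip of that framing, with no sockets involved (the field values are made up):

import json
import struct

payload = json.dumps({'reward': 1.5, 'state': [1, 3, 4]}).encode()
message = struct.pack('!i', len(payload)) + payload        # what a client would send

size = struct.unpack('!i', message[:4])[0]                 # what run() reads first
data = json.loads(message[4:4 + size].decode())            # then the JSON body
print(size, data['reward'], data['state'])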