def agent(agent_id, net_params_queue, exp_queue):
    env = ABREnv(agent_id)
    with tf.Session() as sess, open(
            SUMMARY_DIR + '/log_agent_' + str(agent_id), 'w') as log_file:
        actor = network.Network(sess,
                                state_dim=S_DIM, action_dim=A_DIM,
                                learning_rate=ACTOR_LR_RATE)

        # initial synchronization of the network parameters from the coordinator
        actor_net_params = net_params_queue.get()
        actor.set_network_params(actor_net_params)

        time_stamp = 0
        for epoch in range(TRAIN_EPOCH):
            obs = env.reset()
            s_batch, a_batch, r_batch = [], [], []
            for step in range(TRAIN_SEQ_LEN):
                s_batch.append(obs)

                buffer_bound = actor.predict(
                    np.reshape(obs, (1, S_DIM[0], S_DIM[1])))

                obs, rew, done, info = env.step(buffer_bound)

                a_batch.append([buffer_bound])
                r_batch.append(rew)
                if done:
                    break

            v_batch = actor.compute_v(s_batch, a_batch, r_batch, done)
            exp_queue.put([s_batch, a_batch, v_batch])

            actor_net_params = net_params_queue.get()
            actor.set_network_params(actor_net_params)
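# The net_params_queue / exp_queue pair above is only the worker side of a
# multi-process training loop; the coordinator that feeds it is not part of
# this excerpt. A minimal sketch of the wiring, assuming a hypothetical
# central_agent() that broadcasts parameters and collects the
# [s_batch, a_batch, v_batch] lists (NUM_AGENTS and central_agent are
# assumptions, not from the excerpt):

import multiprocessing as mp

NUM_AGENTS = 4  # assumed number of parallel workers


def central_agent(net_params_queues, exp_queues):
    """Hypothetical coordinator: owns the master network (not shown), pushes its
    parameters to every worker, then pulls one experience batch from every
    worker per epoch and applies the gradient update."""
    raise NotImplementedError  # depends on the repo's network.Network / training step


def main():
    # one queue pair per worker: parameters flow down, experience flows up
    net_params_queues = [mp.Queue(1) for _ in range(NUM_AGENTS)]
    exp_queues = [mp.Queue(1) for _ in range(NUM_AGENTS)]

    coordinator = mp.Process(target=central_agent,
                             args=(net_params_queues, exp_queues))
    coordinator.start()

    workers = [mp.Process(target=agent,
                          args=(i, net_params_queues[i], exp_queues[i]))
               for i in range(NUM_AGENTS)]
    for w in workers:
        w.start()
    coordinator.join()


if __name__ == '__main__':
    main()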
def agent(agent_id, net_params_queue, exp_queue):
    env = ABREnv(agent_id)
    with tf.Session() as sess, open(
            SUMMARY_DIR + '/log_agent_' + str(agent_id), 'w') as log_file:
        actor = Network(sess,
                        state_dim=S_DIM, action_dim=A_DIM,
                        learning_rate=ACTOR_LR_RATE, name='hehe')

        # initial synchronization of the network parameters from the coordinator
        net_params = net_params_queue.get()
        actor.set_network_params(net_params)

        time_stamp = 0
        for epoch in range(TRAIN_EPOCH):
            obs = env.reset()
            s_batch, a_batch, r_batch, done_batch, entropy_batch = [], [], [], [], []
            for _ in range(TRAIN_SEQ_LEN):
                s_batch.append(obs)

                action_prob = actor.get_action_prob(
                    np.reshape(obs, (1, S_DIM[0], S_DIM[1])))
                action_cumsum = np.cumsum(action_prob)
                bit_rate = (action_cumsum > np.random.randint(
                    1, RAND_RANGE) / float(RAND_RANGE)).argmax()
                entropy = -np.dot(action_prob, np.log(action_prob))

                obs, rew, done, info = env.step(bit_rate)

                action_vec = np.zeros(A_DIM)
                action_vec[bit_rate] = 1
                a_batch.append(action_vec)
                r_batch.append(rew)
                done_batch.append(done)
                entropy_batch.append(entropy)
                if done:
                    break

            # v_batch, td_target = actor.compute_v(s_batch, a_batch, r_batch, done)
            exp_queue.put(
                [s_batch, a_batch, r_batch, done_batch, entropy_batch])

            actor_net_params = net_params_queue.get()
            actor.set_network_params(actor_net_params)
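# For reference: the cumulative-sum expression above is an inverse-CDF draw
# from action_prob, and the entropy term is largest for a uniform policy
# (log A_DIM) and zero for a deterministic one. A quick standalone check with
# toy numbers (not taken from the training code):

import numpy as np

A_DIM = 6
RAND_RANGE = 1000
p = np.array([0.05, 0.10, 0.15, 0.20, 0.25, 0.25])  # toy action distribution

# empirical frequencies of the inverse-CDF draw approach p
draws = [(np.cumsum(p) > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()
         for _ in range(100000)]
print(np.bincount(draws, minlength=A_DIM) / 100000.0)   # ~= p

# entropy of the policy, as computed in the rollout loop
print(-np.dot(p, np.log(p)))                             # toy policy
print(-np.dot(np.ones(A_DIM) / A_DIM,
              np.log(np.ones(A_DIM) / A_DIM)))           # == log(6), uniform policy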
def agent(agent_id, net_params_queue, exp_queue):
    env = ABREnv(agent_id)
    with tf.Session() as sess, open(
            SUMMARY_DIR + '/log_agent_' + str(agent_id), 'w') as log_file:
        actor = network.Network(sess,
                                state_dim=S_DIM, action_dim=A_DIM,
                                learning_rate=ACTOR_LR_RATE)

        # initial synchronization of the network parameters from the coordinator
        actor_net_params = net_params_queue.get()
        actor.set_network_params(actor_net_params)

        time_stamp = 0
        for epoch in range(TRAIN_EPOCH):
            obs = env.reset()
            s_batch, a_batch, p_batch, r_batch = [], [], [], []
            for step in range(TRAIN_SEQ_LEN):
                s_batch.append(obs)

                action_prob = actor.predict(
                    np.reshape(obs, (1, S_DIM[0], S_DIM[1])))

                # action_cumsum = np.cumsum(action_prob)
                # bit_rate = (action_cumsum > np.random.randint(
                #     1, RAND_RANGE) / float(RAND_RANGE)).argmax()

                # gumbel noise
                noise = np.random.gumbel(size=len(action_prob))
                bit_rate = np.argmax(np.log(action_prob) + noise)

                obs, rew, done, info = env.step(bit_rate)

                action_vec = np.zeros(A_DIM)
                action_vec[bit_rate] = 1
                a_batch.append(action_vec)
                r_batch.append(rew)
                p_batch.append(action_prob)
                if done:
                    break

            v_batch = actor.compute_v(s_batch, a_batch, r_batch, done)
            exp_queue.put([s_batch, a_batch, p_batch, v_batch])

            actor_net_params = net_params_queue.get()
            actor.set_network_params(actor_net_params)
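# The Gumbel-max trick used above (argmax of log-probabilities plus Gumbel
# noise) samples from the same categorical distribution as the commented-out
# cumulative-sum method. A small sanity check with toy probabilities (not
# taken from the training code):

import numpy as np

p = np.array([0.1, 0.2, 0.3, 0.4])  # toy action distribution
n = 100000

# Gumbel-max: argmax(log p + g) with g ~ Gumbel(0, 1), drawn n times at once
gumbel_draws = np.argmax(np.log(p) + np.random.gumbel(size=(n, len(p))), axis=1)

# direct categorical sampling for comparison
direct_draws = np.random.choice(len(p), size=n, p=p)

print(np.bincount(gumbel_draws, minlength=len(p)) / float(n))  # ~= p
print(np.bincount(direct_draws, minlength=len(p)) / float(n))  # ~= p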
def agent(agent_id, net_params_queue, exp_queue):
    env = ABREnv(agent_id)
    with tf.Session() as sess, open(
            SUMMARY_DIR + '/log_agent_' + str(agent_id), 'w') as log_file:
        actor = network.Network(sess,
                                state_dim=S_DIM, action_dim=A_DIM,
                                learning_rate=ACTOR_LR_RATE)

        # initial synchronization of the network parameters from the coordinator
        actor_net_params = net_params_queue.get()
        actor.set_network_params(actor_net_params)

        time_stamp = 0
        obs = env.reset()
        # env.reset()
        for epoch in range(TRAIN_EPOCH):
            env.reset_trace()

            # roll out BATTLE_ROUND trajectories on the same trace
            tmp_buffer = []
            for i in range(BATTLE_ROUND):
                obs = env.reset()
                s_batch, a_batch, p_batch, bitrate_batch, rebuffer_batch = [], [], [], [], []
                for step in range(TRAIN_SEQ_LEN):
                    s_batch.append(obs)

                    action_prob = actor.predict(
                        np.reshape(obs, (1, S_DIM[0], S_DIM[1])))
                    action_cumsum = np.cumsum(action_prob)
                    bit_rate = (action_cumsum > np.random.randint(
                        1, RAND_RANGE) / float(RAND_RANGE)).argmax()

                    obs, rew, done, info = env.step(bit_rate)

                    action_vec = np.zeros(A_DIM)
                    action_vec[bit_rate] = 1
                    a_batch.append(action_vec)
                    p_batch.append(action_prob)
                    bitrate_batch.append(info['bitrate'])
                    rebuffer_batch.append(info['rebuffer'])
                    if done:
                        break
                tmp_buffer.append(
                    [s_batch, a_batch, p_batch, bitrate_batch, rebuffer_batch])

            # let every rollout battle against every other rollout
            s, a, p, g = [], [], [], []
            for i in range(BATTLE_ROUND):
                w_arr = []
                for j in range(BATTLE_ROUND):
                    if i != j:
                        tmp_agent_results = []
                        # i
                        s_batch, a_batch, p_batch, bitrate_batch, rebuffer_batch = tmp_buffer[i]
                        bit_rate_ = np.mean(bitrate_batch)
                        rebuffer_ = np.mean(rebuffer_batch)
                        smoothness_ = np.mean(np.abs(np.diff(bitrate_batch)))
                        tmp_agent_results.append(
                            [bit_rate_, rebuffer_, smoothness_])
                        # j
                        s_batch, a_batch, p_batch, bitrate_batch, rebuffer_batch = tmp_buffer[j]
                        bit_rate_ = np.mean(bitrate_batch)
                        rebuffer_ = np.mean(rebuffer_batch)
                        smoothness_ = np.mean(np.abs(np.diff(bitrate_batch)))
                        tmp_agent_results.append(
                            [bit_rate_, rebuffer_, smoothness_])
                        # battle
                        w_rate_imm = rules.rules(tmp_agent_results)[0]
                        w_arr.append(w_rate_imm)
                w_rate = np.sum(w_arr) / len(w_arr)

                s_batch, a_batch, p_batch, bitrate_batch, rebuffer_batch = tmp_buffer[i]
                # Policy invariance under reward transformations:
                # every step of rollout i shares the same win-rate return
                for s_, a_, p_ in zip(s_batch, a_batch, p_batch):
                    s.append(s_)
                    a.append(a_)
                    p.append(p_)
                    g.append([w_rate])

            exp_queue.put([s, a, p, g])

            actor_net_params = net_params_queue.get()
            actor.set_network_params(actor_net_params)
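# rules.rules is defined outside this excerpt, so its exact scoring is not
# shown here. A purely hypothetical stand-in, which compares two
# [mean_bitrate, mean_rebuffer, mean_smoothness] summaries with a linear QoE
# score and returns the win rate of the first one (the penalty weights are
# assumptions, not taken from the real rules module), might look like:

def rules(agent_results, rebuf_penalty=4.3, smooth_penalty=1.0):
    # hypothetical comparator, not the repo's rules.rules
    scores = [bitrate - rebuf_penalty * rebuffer - smooth_penalty * smoothness
              for bitrate, rebuffer, smoothness in agent_results]
    if scores[0] > scores[1]:
        return [1.0, 0.0]   # first rollout wins the battle
    if scores[0] < scores[1]:
        return [0.0, 1.0]   # second rollout wins
    return [0.5, 0.5]       # tie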