def run_fourier_sarsa_experiments(transfer_episodes, transfer_epsilon, params):
    statistics = {"errors": [], "stopping_points": [], "utilities": []}

    filename = RESULTS_DIRECTORY + "fourier-sarsa-transfer-[{}]-[{}]-[{}].json".format(
        params["alpha"], params["epsilon"], params["order"])

    print("Training on {} with [increment = {}]".format(
        PROBLEM_DIRECTORY + PROBLEM_FILES[0][0], PROBLEM_FILES[0][1]))
    metareasoning_env = env.Environment(
        PROBLEM_DIRECTORY + PROBLEM_FILES[0][0], ALPHA, BETA, PROBLEM_FILES[0][1])
    prakhar = fourier_agent.Agent(metareasoning_env, params)
    prakhar.run_sarsa(statistics)

    for problem_file in PROBLEM_FILES[1:]:
        problem_file_path = PROBLEM_DIRECTORY + problem_file[0]
        increment = problem_file[1]

        params["episodes"] = transfer_episodes
        params["epsilon"] = transfer_epsilon

        print("Shifting to {} with [increment = {}]".format(
            problem_file_path, increment))
        metareasoning_env = env.Environment(problem_file_path, ALPHA, BETA, increment)
        prakhar = fourier_agent.Agent(
            metareasoning_env, params,
            prakhar.function_approximator.weights,
            prakhar.function_approximator.action_value_function)
        prakhar.run_sarsa(statistics)

    utils.save(filename, statistics)

    return utils.get_results(statistics["errors"], WINDOW_SIZE, PLOT_WINDOW_SIZE)
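# Hedged usage sketch (not part of the original module): the hyperparameter values
# below are illustrative, and RESULTS_DIRECTORY, PROBLEM_DIRECTORY, PROBLEM_FILES,
# ALPHA, BETA, WINDOW_SIZE and PLOT_WINDOW_SIZE are assumed to be module-level constants.
if __name__ == "__main__":
    example_params = {"alpha": 0.001, "epsilon": 0.1, "order": 3, "episodes": 5000}
    results = run_fourier_sarsa_experiments(transfer_episodes=1000,
                                            transfer_epsilon=0.05,
                                            params=example_params)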
def main():
    np.random.seed(42)
    os.system('rm ' + TEST_LOG_PATH)

    ta_q = Tabular_Q()

    all_cooked_time, all_cooked_bw, _ = load_trace.load_trace()

    epoch = 0
    time_stamp = 0

    net_env = env.Environment(all_cooked_time=all_cooked_time,
                              all_cooked_bw=all_cooked_bw)

    last_bit_rate = DEFAULT_QUALITY
    bit_rate = DEFAULT_QUALITY
    state = [0, 0, 0, 0]

    while True:
        delay, sleep_time, buffer_size, rebuf, \
            video_chunk_size, next_video_chunk_sizes, \
            end_of_video, video_chunk_remain = \
            net_env.get_video_chunk(bit_rate)

        time_stamp += delay  # in ms
        time_stamp += sleep_time  # in ms

        reward = VIDEO_BIT_RATE[bit_rate] / M_IN_K \
            - REBUF_PENALTY * rebuf \
            - SMOOTH_PENALTY * np.abs(VIDEO_BIT_RATE[bit_rate] -
                                      VIDEO_BIT_RATE[last_bit_rate]) / M_IN_K

        epoch += 1

        bw = float(video_chunk_size) / float(delay) / M_IN_K * BITS_IN_BYTE  # Mbit/sec
        bw = min(int(bw / D_BW) * D_BW, BW_MAX)
        bf = min(int(buffer_size / D_BF) * D_BF, BF_MAX)
        br = bit_rate
        c = min(video_chunk_remain, N_CHUNK - 1)

        next_state = [bw, bf, br, c]

        ta_q.train_q(state, bit_rate, reward, next_state, end_of_video)

        state = next_state
        last_bit_rate = bit_rate
        bit_rate = ta_q.get_q_action(state)

        if end_of_video:
            last_bit_rate = DEFAULT_QUALITY
            bit_rate = DEFAULT_QUALITY
            state = [0, 0, 0, 0]

        if epoch % TEST_INTERVAL == 0:
            testing(ta_q, epoch)
            np.save(TEST_LOG_PATH + '_q_table.npy', ta_q.q_table)
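# Hedged sketch (not the original implementation): `Tabular_Q` above is only used
# through `train_q`, `get_q_action`, and `q_table`. A minimal dict-backed class with
# that interface could look like the following; the class name, default constants,
# and the one-step Q-learning update are assumptions for illustration only.
class TabularQSketch(object):
    def __init__(self, n_actions=6, lr=0.1, gamma=0.99, epsilon=0.1):
        self.q_table = {}  # maps (state tuple, action) -> estimated return
        self.n_actions, self.lr, self.gamma, self.epsilon = n_actions, lr, gamma, epsilon

    def _q(self, state, action):
        return self.q_table.get((tuple(state), action), 0.0)

    def train_q(self, state, action, reward, next_state, done):
        # one-step Q-learning target; terminal transitions bootstrap nothing
        target = reward if done else reward + self.gamma * max(
            self._q(next_state, a) for a in range(self.n_actions))
        old = self._q(state, action)
        self.q_table[(tuple(state), action)] = old + self.lr * (target - old)

    def get_q_action(self, state):
        if np.random.rand() < self.epsilon:  # epsilon-greedy exploration
            return np.random.randint(self.n_actions)
        return int(np.argmax([self._q(state, a) for a in range(self.n_actions)]))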
def main():
    agents = []
    for i in range(NUM_AGENTS):
        actions = ['defect', 'cooperate']
        localRewards = [
            np.random.randint(0, high=100),
            np.random.randint(0, high=100)
        ]
        agent = env.CitizenAgent(actions, localRewards, i)
        agents.append(agent)

    leader = env.LeaderAgent(agents)
    testEnv = env.Environment(agents, leader, globalRewardFunc)

    for _ in range(NUM_EPOCHS):
        for _ in range(NUM_STEPS):
            actions = testEnv.getActions()
            globalReward = testEnv.getRewards(actions)
            testEnv.updateQ(globalReward)
        leader.penalize()
        testEnv.printAgents()

    # test stage
    actions = testEnv.getActions()
    globalReward = testEnv.getRewards(actions)
    print("Global Reward: " + str(globalReward))
    return 0
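# `globalRewardFunc` is referenced above but not defined in this snippet. A minimal
# hedged placeholder, assuming it maps the list of chosen actions to a scalar reward
# (here, the fraction of cooperating agents); the real definition may differ.
def globalRewardFunc(actions):
    return sum(1 for a in actions if a == 'cooperate') / float(len(actions))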
def setUp(self):
    self.src = source.StringSource()
    self.lex = lexer.Lexer(self.src)
    self.environment = env.Environment()
    self.parser = pars.Parser(self.lex, self.environment)
    self.program = interpreter.Interpreter(self.environment, self.parser)
    self.output = io.StringIO()
    sys.stdout = self.output
def main():
    src = source.StreamSource()
    fo = open("ThirdExample", "r", encoding='utf-8', newline='\n')
    environment = env.Environment()
    src.set_data(fo)
    lex = lexer.Lexer(src)
    parser = pars.Parser(lex, environment)
    inter = interpreter.Interpreter(environment, parser)
    inter.interpret()
def execute_file(file_name, existing_env=None):
    if existing_env is None:
        existing_env = environment.Environment()
    file_name = existing_env.set_correct_directory(file_name)
    if not existing_env.is_already_imported(file_name):
        existing_env.add_import(file_name)
        try:
            execute_program(file_to_str(file_name), existing_env)
        except IOError:
            print('Could not read file: ' + file_name)
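# Hedged usage sketch (not part of the original module): running several files against
# one shared environment so that definitions from earlier files remain visible. The
# file names are illustrative.
shared_env = environment.Environment()
for name in ['library_module', 'main_program']:
    execute_file(name, shared_env)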
def __init__(self):
    self._init_hyperparameters()

    self.env = env.Environment()
    self.obs_dim = self.env.observation_shape
    self.act_dim = self.env.action_space.shape[0]

    self.actor = self.create_model(self.act_dim)
    self.actor.compile(optimizer=Adam(learning_rate=self.lr))
    self.critic = self.create_model(1)
    self.critic.compile(optimizer=Adam(learning_rate=self.lr))

    self.con_mat = tf.eye(self.act_dim) * 0.500
def main():
    params = {
        "episodes": 2000,
        "batch_size": 10,
        "gamma": 1,
        "learning_rate": 0.01
    }

    metareasoning_env = env.Environment('problems/test.json', 200, 0.3, 1)
    agent = Agent(params, metareasoning_env)

    statistics = {"stopping_points": [], "utilities": []}
    agent.run_reinforce(statistics)
def __init__(self, random_seed=RANDOM_SEED):
    # seed with the constructor argument (defaults to RANDOM_SEED)
    np.random.seed(random_seed)

    self.action_space = spaces.Discrete(A_DIM)
    self.observation_space = spaces.Box(0, 10.0, [S_INFO, S_LEN], dtype=np.float32)

    all_cooked_time, all_cooked_bw, _ = load_trace.load_trace()
    self.net_env = env.Environment(all_cooked_time=all_cooked_time,
                                   all_cooked_bw=all_cooked_bw)

    self.last_bit_rate = DEFAULT_QUALITY
    self.state = np.zeros((S_INFO, S_LEN))
    self.reset()
def repl(existing_env=None, get_input=None):
    """
    Read-eval-print loop.
    Whole programs can be run by using the ':program' directive, ending with ':end'.
    Use the 'this' keyword to see the current environment frames.
    """
    if existing_env is None:
        env = environment.Environment()
    else:
        env = existing_env
    if get_input is None:
        get_input = raw_input
    brace_matcher = prepare_program.BraceMatcher()
    while True:
        brace_matcher.reset()
        expr = get_input('Capacita> ')
        expr = expr.strip()
        if expr == 'exit()':
            break
        elif len(expr) == 0:
            continue
        elif expr == ':program':
            prgm = store_program(get_input)
            execute_program(prgm)
        elif expr == ':code':
            prgm = store_program(get_input)
            execute_program(prgm, env)
        elif expr.startswith('when ') or is_clause_opener(expr) or \
                not brace_matcher.match_line(expr).is_complete():
            prgm = store_code_block(get_input, expr, brace_matcher)
            if prgm.rstrip('\n').count('\n') == 0 and \
                    not line_manager.is_statement(prgm):
                print_evaluated_expr(prgm, env)
            else:
                execute_program(prgm, env)
        elif expr == 'this':
            print(env.frames)
        else:
            # Since expr could contain semicolon-separated lines of code,
            # extract all the lines:
            line_mgr, _ = convert_program_to_lines(expr)
            line_mgr.classify_statements()
            if len(line_mgr) > 1:
                leading_lines = line_mgr[:-1]
                execution.execute_lines(leading_lines, env)
            last_expr_data = line_mgr.get_line_data(-1)
            last_expr = last_expr_data.line
            if last_expr_data.is_statement:
                execution.execute_statement(last_expr_data, False, env)
            else:
                print_evaluated_expr(last_expr, env)
def run_fourier_q_learning_experiments(params):
    statistics = {"errors": [], "stopping_points": [], "utilities": []}

    filename = RESULTS_DIRECTORY + "fourier-q-[{}]-[{}]-[{}]-{}".format(
        params["alpha"], params["epsilon"], params["order"], PROBLEM_FILE)

    metareasoning_env = env.Environment(PROBLEM_FILE_PATH, ALPHA, BETA, INCREMENT)
    prakhar = fourier_agent.Agent(params, metareasoning_env)
    prakhar.run_q_learning(statistics)

    utils.save(filename, statistics)

    return utils.get_results(statistics["errors"], WINDOW_SIZE, PLOT_WINDOW_SIZE)
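# Hedged example call (values are illustrative; RESULTS_DIRECTORY, PROBLEM_FILE,
# PROBLEM_FILE_PATH, ALPHA, BETA, INCREMENT, WINDOW_SIZE and PLOT_WINDOW_SIZE are
# assumed to be module-level constants):
q_results = run_fourier_q_learning_experiments(
    {"alpha": 0.001, "epsilon": 0.1, "order": 3, "episodes": 5000})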
def __init__(self, random_seed=RANDOM_SEED):
    # seed with the constructor argument (defaults to RANDOM_SEED)
    np.random.seed(random_seed)

    self.action_space = spaces.Box(low=0., high=60., shape=(2, ), dtype=np.float32)
    self.observation_space = spaces.Box(0, 10.0, (S_LEN * S_INFO, ), dtype=np.float32)

    all_cooked_time, all_cooked_bw, _ = load_trace.load_trace()
    self.net_env = env.Environment(all_cooked_time=all_cooked_time,
                                   all_cooked_bw=all_cooked_bw)

    self.last_bit_rate = DEFAULT_QUALITY
    self.buffer_size = 0.
    self.state = np.zeros((S_INFO, S_LEN))
    self.reset()
def test():
    metareasoning_env = env.Environment(PROBLEM_FILE_PATH, ALPHA, BETA, INCREMENT)

    quality, time = metareasoning_env.reset()
    qualities = [quality]
    utility = utils.get_time_dependent_utility(quality, time, ALPHA, BETA)
    utilities = [utility]

    while True:
        (quality, time), _, is_episode_done = metareasoning_env.step(
            metareasoning_env.CONTINUE_ACTION)

        qualities.append(quality)
        utilities.append(
            utils.get_time_dependent_utility(quality, time, ALPHA, BETA))

        if is_episode_done:
            break

    plt.figure(figsize=(7, 3))
    plt.rcParams["font.family"] = "Times New Roman"
    plt.rcParams["font.size"] = 14
    plt.rcParams["grid.linestyle"] = "-"
    plt.xlabel("Steps")
    plt.ylabel("Utilities")
    plt.grid(True)

    axis = plt.gca()
    axis.spines["top"].set_visible(False)
    # axis.set_xlim([0, 2 * utilities.index(max(utilities))])
    # axis.set_ylim([utilities[0], 1.05 * max(utilities)])

    plt.plot(range(len(utilities)), utilities, color="r")
    plt.tight_layout()
    plt.show()
def main():
    """Main function - includes tests and runs the REPL."""
    argc = len(sys.argv)
    if argc > 1:
        first_arg = sys.argv[1]
        if first_arg == '--test':
            env = environment.Environment()
            execution.execute_statement('x = 3', env)
            execution.execute_statement('x+=7', env)
            execution.execute_statement('y=9.23', env)
            env.new_frame()
            execution.execute_statement('x = 5', env)
            print(env.frames)
            execution.execute_statement('z="hello world"', env)
            execution.execute_statement('z +="!!!"', env)
            execution.execute_statement('a= `gelatin`', env)
            print(env.frames)
            ast = AST("3*4+5 ^ 7")
            print(ast.parse())
            print(ast.collapse_indices(ast.build_indices()))
            ast = AST("18+15*9:3+10")
            print(ast.parse())
            print(ast.collapse_indices(ast.build_indices()))
            print(execution.evaluate_expression('1+2+3+4', environment.Environment()))
            print(execution.evaluate_expression('45+7*8', environment.Environment()))
            print(execution.evaluate_expression('3.2+18^2-7', environment.Environment()))
            print(execution.evaluate_expression('1:2 + 1:3 + 1:5', environment.Environment()))
            print(execution.evaluate_expression('2:3 + 3^3 - 1:5', environment.Environment()))
            print(execution.evaluate_expression('1234', environment.Environment()))
            ast = AST("3 + 1 == 4")
            print(ast.parse())
            ast = AST("3 + 1 > 4")
            print(ast.parse())
            ast = AST("18:1 != 18.2")
            print(ast.parse())
            ast = AST("x = 4")
            print(ast.parse())
            ast = AST("y = 3 > 4")
            print(ast.parse())
            env2 = environment.Environment()
            execution.execute_statement('x = 3+5*4', env2)
            execution.execute_statement('y = x + 19 - 3*6', env2)
            print(env2.frames)
        elif first_arg == '--test2':
            ast = AST('x = "ice cream, eggs, and milk" + "...alpha or beta"')
            print(ast.parse())
            ast = AST('y = f(1 + 1, 2 + 2, 3 + 3) - g((9+7)*2, 128/(2+2))')
            print(ast.parse())
            ast = AST('z = f("ice cream", "eggs and milk") * g("alpha or beta", 3:8, "gamma or delta")')
            print(ast.parse())
            ast = AST('makeList(1,2,3) + makeList(4,5,6)')
            print(ast.parse())
            ast = AST('[max(16, 25), max(36, max(49, 64))]')
            print(ast.parse())
            ast = AST('[concat_lists([10], [20]), concat_lists([30], [40])]')
            print(ast.parse())
        elif first_arg == '--test3':
            ast = AST('[1, 2, 3]')
            print(ast.split_list_elems())
            ast = AST('[f(2), f(3), f(4)]')
            print(ast.split_list_elems())
            ast = AST('[f(2, 3), f(3, 4, 5), f(4, 1)]')
            print(ast.split_list_elems())
            ast = AST('1 + 2 * 3')
            print(ast.split_list_elems())
            print(ast.parse())
        elif first_arg == '--test4':
            ast = AST('x.length()')
            print(ast.parse())
            ast = AST('[1,2,3].length()')
            print(ast.parse())
            ast = AST('3.01')
            print(ast.parse())
            ast = AST('3.1')
            print(ast.parse())
        elif first_arg == '--test5':
            env = environment.Environment()
            env.new_type(['Number'], 'ComplexNumber')
            c = {'$type': 'ComplexNumber', 'real': 1, 'imag': 2}
            print(env.value_is_a(c, 'ComplexNumber'))
            print(env.value_is_a(c, 'Number'))
            print(env.value_is_a(c, 'Int'))
            print("")
            env.new_type(['Object'], 'Food')
            env.new_type(['Food'], 'Pizza')
            env.new_type(['Food'], 'Dessert')
            env.new_type(['Dessert'], 'ChocolateItem')
            env.new_type(['Pizza'], 'PepperoniPizza')
            env.new_type(['Pizza', 'ChocolateItem'], 'ChocolatePizza')
            pepperoni_pizza = {'$type': 'PepperoniPizza'}
            chocolate_pizza = {'$type': 'ChocolatePizza'}
            print(env.value_is_a(pepperoni_pizza, 'PepperoniPizza'))
            print(env.value_is_a(pepperoni_pizza, 'Pizza'))
            print(env.value_is_a(pepperoni_pizza, 'Food'))
            print(env.value_is_a(pepperoni_pizza, 'Dessert'))
            print(env.value_is_a(pepperoni_pizza, 'ChocolateItem'))
            print("")
            print(env.value_is_a(chocolate_pizza, 'PepperoniPizza'))
            print(env.value_is_a(chocolate_pizza, 'Pizza'))
            print(env.value_is_a(chocolate_pizza, 'Food'))
            print(env.value_is_a(chocolate_pizza, 'Dessert'))
            print(env.value_is_a(chocolate_pizza, 'ChocolateItem'))
            print("")
            env.new_type(['ChocolatePizza'], 'HugeChocolatePizza')
            huge_chocolate_pizza = {'$type': 'HugeChocolatePizza'}
            print(env.value_is_a(huge_chocolate_pizza, 'PepperoniPizza'))
            print(env.value_is_a(huge_chocolate_pizza, 'Pizza'))
            print(env.value_is_a(huge_chocolate_pizza, 'Food'))
            print(env.value_is_a(huge_chocolate_pizza, 'Dessert'))
            print(env.value_is_a(huge_chocolate_pizza, 'ChocolateItem'))
            print(env.value_is_a(huge_chocolate_pizza, 'ChocolatePizza'))
            print("")
        elif first_arg == '--test6':
            ast = AST('{1, 2 | 3, 4}')
            print(ast.parse())
        elif first_arg == '--test7':
            ast = AST('throw "something"')
            print(ast.parse())
        elif first_arg == '--test8':
            ast = AST('true and not false')
            print(ast.parse())
            print(ast.collapse_indices(ast.build_indices()))
        elif first_arg == '--test9':
            sample = """
            x = 5 // comment
            // comment
            /* multi
            line comment */y = 6
            z = "https://example.com"
            """
            print(prepare_program.preprocess(sample))
        elif first_arg == '--test10':
            ast = AST('-3.0e5 + 186e-20 * 1e-6 / 28.8e+6 + 34.4e+99')
            print(ast.parse())
            ast = AST('-3.0E5 + 186E-20 * 1E-6 / 28.8e+6 + 34.4E+99')
            print(ast.parse())
        elif first_arg == '--test11':
            print(execution.is_assignment_statement('a = 5'))
            print(execution.is_assignment_statement('a=5==6'))
            print(execution.is_assignment_statement('not (5==6) and (8>=7)'))
            print(execution.is_assignment_statement('z='))
        elif first_arg == '--test12':
            lines = [
                'sub this + that', 'func Int x + this', 'func x + this',
                'func this * y', 'func Int -this', 'sub -this',
                'sub not this', 'sub Boolean not this', 'sub this-b',
                'sub b-this', 'func Int-this', 'func Int- this',
                'sub Int - this'
            ]
            print(prepare_program.replace_op_overload_syntax(lines))
        elif first_arg == '--test-tree-merge':
            tests.test_tree_merge()
        elif first_arg == '--test-all':
            tests.test_all('capacita_programs')
        elif first_arg == '--test-all-fast':
            tests.test_all('capacita_programs', has_delay=False)
        elif first_arg == '--test-repl':
            tests.test_all('capacita_programs', has_delay=True, use_repl=True)
        elif first_arg == '--test-repl-fast':
            tests.test_all('capacita_programs', has_delay=False, use_repl=True)
        elif first_arg == '--test-file' and argc > 2:
            if argc == 4 and sys.argv[2] == '--repl':
                tests.test_file(sys.argv[3], use_repl=True)
            else:
                tests.test_file(sys.argv[2], use_repl=False)
        else:
            # Run a program from a text file:
            file_name = first_arg
            execute_file(file_name)
        exit()
    repl()
def agent(agent_id, net_params_queue, exp_queue):
    net_env = env.Environment(random_seed=agent_id,
                              fixed_env=False,
                              trace_folder=TRAIN_TRACES)

    with tf.Session() as sess, open(LOG_FILE + '_agent_' + str(agent_id), 'wb') as log_file:
        actor = a3c.ActorNetwork(sess,
                                 state_dim=[S_INFO, S_LEN],
                                 action_dim=A_DIM,
                                 learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess,
                                   state_dim=[S_INFO, S_LEN],
                                   learning_rate=CRITIC_LR_RATE)

        # initial synchronization of the network parameters from the coordinator
        actor_net_params, critic_net_params = net_params_queue.get()
        actor.set_network_params(actor_net_params)
        critic.set_network_params(critic_net_params)

        mask = net_env.video_masks[net_env.video_idx]

        last_bit_rate = DEFAULT_QUALITY
        bit_rate = DEFAULT_QUALITY

        action = bitrate_to_action(bit_rate, mask)
        last_action = action

        action_vec = np.zeros(np.sum(mask))
        action_vec[bit_rate] = 1

        s_batch = [np.zeros((S_INFO, S_LEN))]
        a_batch = [action_vec]
        r_batch = []
        entropy_record = []

        time_stamp = 0
        while True:  # experience video streaming forever
            # the action is from the last decision
            # this is to make the framework similar to the real
            delay, sleep_time, buffer_size, \
                rebuf, video_chunk_size, end_of_video, \
                video_chunk_remain, video_num_chunks, \
                next_video_chunk_size, mask = \
                net_env.get_video_chunk(bit_rate)

            time_stamp += delay  # in ms
            time_stamp += sleep_time  # in ms

            reward = VIDEO_BIT_RATE[action] / M_IN_K \
                - REBUF_PENALTY * rebuf \
                - SMOOTH_PENALTY * np.abs(VIDEO_BIT_RATE[action] -
                                          VIDEO_BIT_RATE[last_action]) / M_IN_K

            r_batch.append(reward)

            last_bit_rate = bit_rate
            last_action = action

            # retrieve previous state
            if len(s_batch) == 0:
                state = [np.zeros((S_INFO, S_LEN))]
            else:
                state = np.array(s_batch[-1], copy=True)

            # dequeue history record
            state = np.roll(state, -1, axis=1)

            # this should be S_INFO number of terms
            state[0, -1] = VIDEO_BIT_RATE[action] / float(np.max(VIDEO_BIT_RATE))  # last quality
            state[1, -1] = buffer_size / BUFFER_NORM_FACTOR
            state[2, -1] = float(video_chunk_size) / float(delay) / M_IN_K  # kilo byte / ms
            state[3, -1] = float(delay) / M_IN_K
            state[4, -1] = video_chunk_remain / float(video_num_chunks)

            state[5, :] = -1
            nxt_chnk_cnt = 0
            for i in xrange(A_DIM):
                if mask[i] == 1:
                    state[5, i] = next_video_chunk_size[nxt_chnk_cnt] / M_IN_B
                    nxt_chnk_cnt += 1
            assert (nxt_chnk_cnt) == np.sum(mask)
            state[6, -A_DIM:] = mask

            # compute action probability vector
            action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN)))

            # the action probability should correspond to number of bit rates
            assert len(action_prob[0]) == np.sum(mask)

            action_cumsum = np.cumsum(action_prob)
            bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()
            # Note: we need to discretize the probability into 1/RAND_RANGE steps,
            # because there is an intrinsic discrepancy in passing single state and batch states

            action = bitrate_to_action(bit_rate, mask)

            entropy_record.append(a3c.compute_entropy(action_prob[0]))

            # log time_stamp, bit_rate, buffer_size, reward
            log_file.write(str(time_stamp) + '\t' + str(VIDEO_BIT_RATE[action]) + '\t' +
                           str(buffer_size) + '\t' + str(rebuf) + '\t' +
                           str(video_chunk_size) + '\t' + str(delay) + '\t' +
                           str(reward) + '\n')
            log_file.flush()

            # report experience to the coordinator
            if len(r_batch) >= TRAIN_SEQ_LEN or end_of_video:
                exp_queue.put([s_batch[1:],  # ignore the first chuck
                               a_batch[1:],  # since we don't have the
                               r_batch[1:],  # control over it
                               end_of_video,
                               {'entropy': entropy_record}])

                # synchronize the network parameters from the coordinator
                actor_net_params, critic_net_params = net_params_queue.get()
                actor.set_network_params(actor_net_params)
                critic.set_network_params(critic_net_params)

                del s_batch[:]
                del a_batch[:]
                del r_batch[:]
                del entropy_record[:]

                log_file.write('\n')  # so that in the log we know where video ends

            # store the state and action into batches
            if end_of_video:
                last_bit_rate = DEFAULT_QUALITY
                bit_rate = DEFAULT_QUALITY  # use the default action here

                action = bitrate_to_action(bit_rate, mask)
                last_action = action

                action_vec = np.zeros(np.sum(mask))
                action_vec[bit_rate] = 1

                s_batch.append(np.zeros((S_INFO, S_LEN)))
                a_batch.append(action_vec)
            else:
                s_batch.append(state)

                action_vec = np.zeros(np.sum(mask))
                action_vec[bit_rate] = 1
                a_batch.append(action_vec)
import json
from collections import defaultdict

import git

import env
import log
from base import ProcessName, CommitBuilder, iter_tree, iter_process_names
from repos import pygit2_get

logger = log.get_logger(__name__)

env = env.Environment()


class Index(object):
    name = None
    key_fields = ()
    unique = False
    value_cls = tuple

    # Overridden in subclasses using the constructor
    # This provides a kind of borg pattern where all instances of
    # the class have the same changes data
    changes = None

    def __init__(self, repo):
        if self.__class__.changes is None:
            self.reset()
        self.repo = repo
        self.pygit2_repo = pygit2_get(repo)
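# Hedged illustration (not from the original module): a hypothetical subclass showing
# how the class attributes above are intended to be overridden; the index name and
# key fields below are invented for the example.
class ExampleIndex(Index):
    name = "example"
    key_fields = ("status",)
    unique = False
    value_cls = tuple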
def main():
    np.random.seed(RANDOM_SEED)

    assert len(VIDEO_BIT_RATE) == A_DIM

    all_cooked_time, all_cooked_bw, all_file_names = load_trace.load_trace(TEST_TRACES)

    net_env = env.Environment(all_cooked_time=all_cooked_time,
                              all_cooked_bw=all_cooked_bw)

    log_path = LOG_FILE + '_' + all_file_names[net_env.trace_idx]
    log_file = open(log_path, 'w')

    actor = a2c_torch.ActorNet(s_dim=[S_INFO, S_LEN], a_dim=A_DIM, lr=ACTOR_LR_RATE)
    critic = a2c_torch.CriticNet(s_dim=[S_INFO, S_LEN], lr=CRITIC_LR_RATE)

    # restore neural net parameters
    if NN_MODEL is not None:  # NN_MODEL is the path to file
        # saver.restore(sess, NN_MODEL)
        print(NN_MODEL)
        actor.load_state_dict(torch.load(NN_MODEL))
        print("Testing model restored.")

    time_stamp = 0

    last_bit_rate = DEFAULT_QUALITY
    bit_rate = DEFAULT_QUALITY

    action_vec = np.zeros(A_DIM)
    action_vec[bit_rate] = 1

    s_batch = [np.zeros((S_INFO, S_LEN))]
    a_batch = [action_vec]
    r_batch = []
    entropy_record = []

    video_count = 0

    while True:  # serve video forever
        # the action is from the last decision
        # this is to make the framework similar to the real
        delay, sleep_time, buffer_size, rebuf, \
            video_chunk_size, next_video_chunk_sizes, \
            end_of_video, video_chunk_remain = \
            net_env.get_video_chunk(bit_rate)

        time_stamp += delay  # in ms
        time_stamp += sleep_time  # in ms

        # reward is video quality - rebuffer penalty - smoothness
        reward = VIDEO_BIT_RATE[bit_rate] / M_IN_K \
            - REBUF_PENALTY * rebuf \
            - SMOOTH_PENALTY * np.abs(VIDEO_BIT_RATE[bit_rate] -
                                      VIDEO_BIT_RATE[last_bit_rate]) / M_IN_K

        r_batch.append(reward)

        last_bit_rate = bit_rate

        # log time_stamp, bit_rate, buffer_size, reward
        log_file.write(str(time_stamp / M_IN_K) + '\t' + str(VIDEO_BIT_RATE[bit_rate]) + '\t' +
                       str(buffer_size) + '\t' + str(rebuf) + '\t' +
                       str(video_chunk_size) + '\t' + str(delay) + '\t' + str(reward) + '\n')
        log_file.flush()

        # retrieve previous state
        if len(s_batch) == 0:
            state = [np.zeros((S_INFO, S_LEN))]
        else:
            state = np.array(s_batch[-1], copy=True)

        # dequeue history record
        state = np.roll(state, -1, axis=1)

        # this should be S_INFO number of terms
        state[0, -1] = VIDEO_BIT_RATE[bit_rate] / float(np.max(VIDEO_BIT_RATE))  # last quality
        state[1, -1] = buffer_size / BUFFER_NORM_FACTOR  # 10 sec
        state[2, -1] = float(video_chunk_size) / float(delay) / M_IN_K  # kilo byte / ms
        state[3, -1] = float(delay) / M_IN_K / BUFFER_NORM_FACTOR  # 10 sec
        state[4, :A_DIM] = np.array(next_video_chunk_sizes) / M_IN_K / M_IN_K  # mega byte
        state[5, -1] = np.minimum(video_chunk_remain,
                                  CHUNK_TIL_VIDEO_END_CAP) / float(CHUNK_TIL_VIDEO_END_CAP)

        _, _, action_prob = actor.get_actor_out(
            convert_torch(np.reshape(state, (1, S_INFO, S_LEN))))
        action_prob = action_prob.numpy()
        action_cumsum = np.cumsum(action_prob)
        bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()

        # action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN)))
        # action_cumsum = np.cumsum(action_prob)
        # print('action:', action_cumsum)
        # bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()
        # bit_rate = np.argmax(action_prob)
        # Note: we need to discretize the probability into 1/RAND_RANGE steps,
        # because there is an intrinsic discrepancy in passing single state and batch states

        s_batch.append(state)

        entropy_record.append(a2c_torch.compute_entropy(action_prob[0]))

        if end_of_video:
            log_file.write('\n')
            log_file.close()

            last_bit_rate = DEFAULT_QUALITY
            bit_rate = DEFAULT_QUALITY  # use the default action here

            del s_batch[:]
            del a_batch[:]
            del r_batch[:]

            action_vec = np.zeros(A_DIM)
            action_vec[bit_rate] = 1

            s_batch.append(np.zeros((S_INFO, S_LEN)))
            a_batch.append(action_vec)
            entropy_record = []

            video_count += 1

            if video_count >= len(all_file_names):
                break

            log_path = LOG_FILE + '_' + all_file_names[net_env.trace_idx]
            log_file = open(log_path, 'w')
def local_train(index, args, global_model, actor_optimizer, critic_optimizer, save=False):
    torch.manual_seed(614 + index)
    if save:
        start_time = timeit.default_timer()
    writer = SummaryWriter(args.log_path)

    all_cooked_time, all_cooked_bw, _ = load_trace.load_trace(args.train_traces)
    net_env = env.Environment(all_cooked_time=all_cooked_time,
                              all_cooked_bw=all_cooked_bw,
                              random_seed=index)

    local_model = a3c.ActorCritic(state_dim=[S_INFO, S_LEN],
                                  action_dim=A_DIM,
                                  learning_rate=[ACTOR_LR_RATE, CRITIC_LR_RATE],
                                  islstm=args.islstm)
    # local_model = a3c.A3C(state_dim=[S_INFO, S_LEN],
    #                       action_dim=A_DIM,
    #                       learning_rate=[ACTOR_LR_RATE, CRITIC_LR_RATE])
    local_model.train()
    local_model._initialize_weights()
    if args.use_gpu:
        local_model.cuda()

    done = True
    curr_step = 0
    curr_episode = 0
    last_bit_rate = DEFAULT_QUALITY
    bit_rate = DEFAULT_QUALITY
    time_stamp = 0

    interval_aloss = 0
    interval_closs = 0
    interval_entropy = 0
    interval_reward = []
    sum_reward = 0
    count_reware = 0

    while True:
        curr_episode += 1
        local_model.load_state_dict(global_model.state_dict())
        state = torch.zeros(S_INFO, S_LEN)
        if done:
            cx = torch.zeros(1, 128)
            hx = torch.zeros(1, 128)
        else:
            cx = cx.detach()
            hx = hx.detach()
        if args.use_gpu:
            state = state.cuda()

        log_policies = []
        values = []
        rewards = []
        entropies = []

        # One video
        while True:
            delay, sleep_time, buffer_size, rebuf, \
                video_chunk_size, next_video_chunk_sizes, \
                end_of_video, video_chunk_remain = \
                net_env.get_video_chunk(bit_rate)

            time_stamp += delay  # in ms
            time_stamp += sleep_time  # in ms

            # -- linear reward --
            # reward is video quality - rebuffer penalty - smoothness
            reward = VIDEO_BIT_RATE[bit_rate] / M_IN_K \
                - REBUF_PENALTY * rebuf \
                - SMOOTH_PENALTY * np.abs(VIDEO_BIT_RATE[bit_rate] -
                                          VIDEO_BIT_RATE[last_bit_rate]) / M_IN_K

            # -- log scale reward --
            # log_bit_rate = np.log(VIDEO_BIT_RATE[bit_rate] / float(VIDEO_BIT_RATE[-1]))
            # log_last_bit_rate = np.log(VIDEO_BIT_RATE[last_bit_rate] / float(VIDEO_BIT_RATE[-1]))
            # reward = log_bit_rate \
            #     - REBUF_PENALTY * rebuf \
            #     - SMOOTH_PENALTY * np.abs(log_bit_rate - log_last_bit_rate)

            # -- HD reward --
            # reward = HD_REWARD[bit_rate] \
            #     - REBUF_PENALTY * rebuf \
            #     - SMOOTH_PENALTY * np.abs(HD_REWARD[bit_rate] - HD_REWARD[last_bit_rate])

            last_bit_rate = bit_rate

            state = torch.roll(state, -1)

            # Fill in the state vector with normalization
            state[0, -1] = torch.Tensor([VIDEO_BIT_RATE[last_bit_rate] / float(max(VIDEO_BIT_RATE))])  # last quality
            state[1, -1] = torch.Tensor([buffer_size / BUFFER_NORM_FACTOR])  # buffer size
            state[2, -1] = torch.Tensor([float(video_chunk_size) / float(delay) / M_IN_K])  # kilo byte / ms
            state[3, -1] = torch.Tensor([float(delay) / M_IN_K / BUFFER_NORM_FACTOR])  # /10 sec
            state[4, :A_DIM] = torch.Tensor([next_video_chunk_sizes]) / M_IN_K / M_IN_K  # mega byte
            # remaining chunk number
            state[5, -1] = torch.Tensor([min(video_chunk_remain, CHUNK_TIL_VIDEO_END_CAP) / float(CHUNK_TIL_VIDEO_END_CAP)])

            if args.islstm == 0:
                logits, value = local_model(state.unsqueeze(dim=0))
            else:
                logits, value, hx, cx = local_model((state.unsqueeze(dim=0), hx, cx))
            # print(f"index {index}, state {state}, logits {logits}, value {value}", sep="\n")
            # print(state, logits)
            try:
                cate = Categorical(logits)
                bit_rate = cate.sample().item()
            except Exception as e:
                print(e)
                print(f"walking into an error of all null distribution in step {curr_step}")
                print(logits, state)
                exit()

            policy = logits
            log_policy = torch.log(logits)
            entropy = (policy * log_policy).sum(1, keepdim=True)

            if curr_step > args.num_global_steps:
                done = True
            curr_step += 1
            values.append(value)
            rewards.append(reward)
            log_policies.append(log_policy[0, bit_rate])
            entropies.append(entropy)

            if end_of_video:
                last_bit_rate = DEFAULT_QUALITY
                bit_rate = DEFAULT_QUALITY  # use the default action here
                break

        score = torch.zeros((1, 1), dtype=torch.float)
        if args.use_gpu:
            score = score.cuda()
        if not done:
            _, score = local_model(state.unsqueeze(dim=0))

        gae = torch.zeros((1, 1), dtype=torch.float)
        if args.use_gpu:
            gae = gae.cuda()
        actor_loss = 0
        critic_loss = 0
        entropy_loss = 0
        next_value = score

        # for value, log_policy, reward, entropy in list(zip(values, log_policies, rewards, entropies))[::-1]:
        #     gae = gae * args.gamma * args.tau
        #     gae = gae + reward + args.gamma * next_value.detach() - value.detach()
        #     next_value = value
        #     actor_loss = actor_loss + log_policy * gae
        #     score = score * args.gamma + reward
        #     critic_loss = critic_loss + (score - value) ** 2 / 2
        #     entropy_loss = entropy_loss + entropy

        for value, log_policy, reward, entropy in list(zip(values, log_policies, rewards, entropies))[::-1]:
            gae = gae * args.gamma * args.tau
            gae = gae + reward + args.gamma * next_value.detach() - value.detach()
            next_value = value
            actor_loss = actor_loss + log_policy * gae
            score = score * args.gamma + reward
            critic_loss = critic_loss + (score - value) ** 2 / 2
            entropy_loss = entropy_loss + entropy

        entropy_loss = args.beta * entropy_loss
        actor_loss = -actor_loss + args.beta * entropy_loss
        # total_loss = -actor_loss + critic_loss - entropy_loss
        writer.add_scalar("Train_{}/Loss".format(index), actor_loss, critic_loss, curr_episode)

        actor_optimizer.zero_grad()
        critic_optimizer.zero_grad()
        actor_loss.backward()
        critic_loss.backward()
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(global_model.parameters(), args.max_grad_norm)
        # total_loss.backward()
        # (-critic_loss).backward()
        # (actor_loss + args.beta * entropy_loss).backward()

        for local_param, global_param in zip(local_model.parameters(),
                                             global_model.parameters()):
            if global_param.grad is not None:
                break
            global_param._grad = local_param.grad

        actor_optimizer.step()
        critic_optimizer.step()

        interval_aloss += actor_loss.data.item()
        interval_closs += critic_loss.data.item()
        interval_entropy += entropy_loss.data.item()
        interval_reward.append(np.sum(rewards))

        if curr_episode % print_interval == 0:
            print("---------")
            print(f"Process {index}, episode {curr_episode}\n" +
                  f"actor_loss [{interval_aloss/print_interval:4f}] "
                  f"critic_loss [{interval_closs/print_interval:4f}] "
                  f"entropy [{interval_entropy/print_interval:4f}]\n"
                  f"reward [{interval_reward}]")
            if save and curr_episode % args.save_interval == 0 and curr_episode > 0:
                torch.save(global_model.state_dict(),
                           f"{args.saved_path}/a3c_{curr_episode}_reward_{sum_reward/count_reware:4f}.pkl")
            sum_reward += np.sum(interval_reward)
            count_reware += 1
            interval_aloss = 0
            interval_closs = 0
            interval_entropy = 0
            interval_reward = []

        if curr_episode == int(args.num_global_steps / args.num_local_steps):
            print("Training process {} terminated".format(index))
            if save:
                end_time = timeit.default_timer()
                print('The code runs for %.2f s ' % (end_time - start_time))
            return
                  c_ent=opts["c_ent"])

# OPTIONAL, LOAD AGENT
if "load" in opts.keys():
    model.load(path=opts["load"], ext="_last")

# INITIALIZE ROSNODE
rospy.init_node("training_node", anonymous=True)
rate = rospy.Rate(100)
rospy.sleep(1.0)
robot = torobo_wrapper.Torobo()
rospy.sleep(1.0)

# INITIALIZE ENVIRONMENT
world = env.Environment(robot=robot, objects=opts["objects"], rng_ranges=opts["ranges"])
rospy.sleep(0.5)
world.initialize()

print("=" * 10 + "POLICY NETWORK" + "=" * 10)
print(model.policy)
print("=" * 10 + "VALUE NETWORK" + "=" * 10)
print(model.value)
print("Training starts...")

reward_history = []
temp_history = []
it = 0
update_count = 0
while update_count < opts["episode"]:
random_seed = 2
video_count = 0
FPS = 25
frame_time_len = 0.04

# init the environment
# setting one:
#     1, all_cooked_time : timestamp
#     2, all_cooked_bw : throughput
#     3, all_cooked_rtt : rtt
#     4, agent_id : random_seed
#     5, logfile_path : logfile_path
#     6, VIDEO_SIZE_FILE : Video Size File Path
#     7, Debug Setting : Debug
net_env = env.Environment(all_cooked_time=all_cooked_time,
                          all_cooked_bw=all_cooked_bw,
                          random_seed=random_seed,
                          logfile_path=LogFile_Path,
                          VIDEO_SIZE_FILE=video_size_file,
                          Debug=DEBUG)

BIT_RATE = [500.0, 850.0, 1200.0, 1850.0]  # kbps
TARGET_BUFFER = [2.0, 3.0]  # seconds

# ABR setting
RESEVOIR = 0.5
CUSHION = 2

cnt = 0

# default setting
last_bit_rate = 0
bit_rate = 0
target_buffer = 0

# QOE setting
def agent(agent_id, all_cooked_time, all_cooked_bw, net_params_queue, exp_queue, epoch_queue):
    net_env = env.Environment(all_cooked_time=all_cooked_time,
                              all_cooked_bw=all_cooked_bw,
                              random_seed=agent_id)

    with tf.Session() as sess, open(LOG_FILE + '_agent_' + str(agent_id), 'wb') as log_file:
        actor = a3c.ActorNetwork(sess,
                                 state_dim=[S_INFO, S_LEN],
                                 action_dim=A_DIM,
                                 learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess,
                                   state_dim=[S_INFO, S_LEN],
                                   learning_rate=CRITIC_LR_RATE)

        # 1. initial synchronization of the latest network parameters from the coordinator
        actor_net_params, critic_net_params = net_params_queue.get()
        epoch_num = epoch_queue.get()
        actor.set_network_params(actor_net_params)
        critic.set_network_params(critic_net_params)

        last_bit_rate = DEFAULT_QUALITY
        bit_rate = DEFAULT_QUALITY

        action_vec = np.zeros(A_DIM)  # initialize the action vector over the A_DIM actions
        action_vec[bit_rate] = 1

        s_batch = [np.zeros((S_INFO, S_LEN))]
        a_batch = [action_vec]
        r_batch = []
        entropy_record = []

        time_stamp = 0
        while True:  # experience video streaming forever
            # interact with the environment; the action is from the last decision
            # this is to make the framework similar to the real
            # delay, sleep_time, buffer_size, rebuf, \
            #     video_chunk_size, next_video_chunk_sizes, \
            #     end_of_video, video_chunk_remain = \
            #     net_env.get_video_chunk(bit_rate)

            assert bit_rate >= 0
            assert bit_rate < A_DIM

            bitrate_send_last, lossrate_recv_last, bitrate_real_recovery, \
                bitrate_send_last_probe, lossrate_recv_last_probe, bitrate_real_recovery_probe, \
                end_of_video \
                = net_env.action_dispatch_and_report_svr(VIDEO_BIT_RATE[bit_rate])

            time_stamp += 2

            # -- linear reward --
            # reward is video quality - rebuffer penalty - smoothness
            # print '1', net_env.netbw
            # print '2', bitrate_send_last_probe * (1 - lossrate_recv_last_probe)
            x_funtion_top = (bitrate_send_last_probe * (1 - lossrate_recv_last_probe) -
                             VIDEO_BIT_RATE[bit_rate]) / M_IN_K
            reward = -x_funtion_top * x_funtion_top  # 0.1 0.2 ... 1.1 1.2

            r_batch.append(reward)

            last_bit_rate = bit_rate

            # retrieve previous state
            if len(s_batch) == 0:
                state = [np.zeros((S_INFO, S_LEN))]
            else:
                state = np.array(s_batch[-1], copy=True)

            # dequeue history record
            # state = np.roll(state, -1, axis=1)
            # state[0, -1] = bitrate_send_last / 1000.0  # last quality
            # state[1, -1] = lossrate_recv_last  # packet loss rate, e.g. 0.1 0.2 0.3 0.4
            # state[2, -1] = bitrate_real_recovery / 1000.0  # kilo byte / ms

            state = np.roll(state, -1, axis=1)
            state[0, -1] = bitrate_send_last_probe / 1000.0  # last quality
            state[1, -1] = lossrate_recv_last_probe  # packet loss rate, e.g. 0.1 0.2 0.3 0.4
            state[2, -1] = bitrate_real_recovery_probe / 1000.0  # kilo byte / ms
            state[3, :A_DIM] = np.array(VIDEO_BIT_RATE[:]) / 1000.0  # kilo byte / ms
            state[4, -1] = bitrate_send_last / 1000.0  # kilo byte / ms
            # print state[3, :A_DIM]

            # ================== Predict BandWidth =========================

            # compute action probability vector
            action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN)))
            action_cumsum = np.cumsum(action_prob)
            bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()
            # Note: we need to discretize the probability into 1/RAND_RANGE steps,
            # because there is an intrinsic discrepancy in passing single state and batch states

            entropy_record.append(a3c.compute_entropy(action_prob[0]))

            # log time_stamp, bit_rate, buffer_size, reward
            log_file.write(str(time_stamp) + '\t' + str(VIDEO_BIT_RATE[bit_rate]) + '\t' +
                           str(bitrate_send_last) + '\t' + str(lossrate_recv_last) + '\t' +
                           str(bitrate_real_recovery) + '\t' + str(reward) + '\n')
            log_file.flush()

            # report experience to the coordinator
            if len(r_batch) >= TRAIN_SEQ_LEN or end_of_video:
                exp_queue.put([s_batch[1:],  # ignore the first chuck
                               a_batch[1:],  # since we don't have the
                               r_batch[1:],  # control over it
                               end_of_video,
                               {'entropy': entropy_record}])

                # synchronize the network parameters from the coordinator
                actor_net_params, critic_net_params = net_params_queue.get()
                actor.set_network_params(actor_net_params)
                critic.set_network_params(critic_net_params)
                epoch_num = epoch_queue.get()

                del s_batch[:]
                del a_batch[:]
                del r_batch[:]
                del entropy_record[:]

                log_file.write('\n')  # so that in the log we know where video ends

            # store the state and action into batches
            if end_of_video:
                last_bit_rate = DEFAULT_QUALITY
                bit_rate = DEFAULT_QUALITY  # use the default action here

                action_vec = np.zeros(A_DIM)
                action_vec[bit_rate] = 1

                s_batch.append(np.zeros((S_INFO, S_LEN)))
                a_batch.append(action_vec)
            else:
                s_batch.append(state)

                action_vec = np.zeros(A_DIM)
                action_vec[bit_rate] = 1
                a_batch.append(action_vec)
def main():
    np.random.seed(RANDOM_SEED)

    assert len(VIDEO_BIT_RATE) == A_DIM

    all_cooked_time, all_cooked_bw, _ = load_trace.load_trace()

    if not os.path.exists(SUMMARY_DIR):
        os.makedirs(SUMMARY_DIR)

    net_env = env.Environment(all_cooked_time=all_cooked_time,
                              all_cooked_bw=all_cooked_bw)

    with tf.Session() as sess, open(LOG_FILE, 'wb') as log_file:
        actor = a3c.ActorNetwork(sess,
                                 state_dim=[S_INFO, S_LEN],
                                 action_dim=A_DIM,
                                 learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess,
                                   state_dim=[S_INFO, S_LEN],
                                   learning_rate=CRITIC_LR_RATE)

        summary_ops, summary_vars = a3c.build_summaries()

        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)  # training monitor
        saver = tf.train.Saver()  # save neural net parameters

        # restore neural net parameters
        nn_model = NN_MODEL
        if nn_model is not None:  # nn_model is the path to file
            saver.restore(sess, nn_model)
            print("Model restored.")

        epoch = 0
        time_stamp = 0

        last_bit_rate = DEFAULT_QUALITY
        bit_rate = DEFAULT_QUALITY

        action_vec = np.zeros(A_DIM)
        action_vec[bit_rate] = 1

        s_batch = [np.zeros((S_INFO, S_LEN))]
        a_batch = [action_vec]
        r_batch = []
        entropy_record = []

        actor_gradient_batch = []
        critic_gradient_batch = []

        while True:  # serve video forever
            # the action is from the last decision
            # this is to make the framework similar to the real
            delay, sleep_time, buffer_size, rebuf, \
                video_chunk_size, next_video_chunk_sizes, \
                end_of_video, video_chunk_remain = \
                net_env.get_video_chunk(bit_rate)

            time_stamp += delay  # in ms
            time_stamp += sleep_time  # in ms

            # reward is video quality - rebuffer penalty - smooth penalty
            reward = VIDEO_BIT_RATE[bit_rate] / M_IN_K \
                - REBUF_PENALTY * rebuf \
                - SMOOTH_PENALTY * np.abs(VIDEO_BIT_RATE[bit_rate] -
                                          VIDEO_BIT_RATE[last_bit_rate]) / M_IN_K

            r_batch.append(reward)

            last_bit_rate = bit_rate

            # retrieve previous state
            if len(s_batch) == 0:
                state = [np.zeros((S_INFO, S_LEN))]
            else:
                state = np.array(s_batch[-1], copy=True)

            # dequeue history record
            state = np.roll(state, -1, axis=1)

            # this should be S_INFO number of terms
            state[0, -1] = VIDEO_BIT_RATE[bit_rate] / float(np.max(VIDEO_BIT_RATE))  # last quality
            state[1, -1] = buffer_size / BUFFER_NORM_FACTOR  # 10 sec
            state[2, -1] = float(video_chunk_size) / float(delay) / M_IN_K  # kilo byte / ms
            state[3, -1] = float(delay) / M_IN_K / BUFFER_NORM_FACTOR  # 10 sec
            state[4, :A_DIM] = np.array(next_video_chunk_sizes) / M_IN_K / M_IN_K  # mega byte
            state[5, -1] = np.minimum(video_chunk_remain,
                                      CHUNK_TIL_VIDEO_END_CAP) / float(CHUNK_TIL_VIDEO_END_CAP)

            action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN)))
            action_cumsum = np.cumsum(action_prob)
            bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()
            # Note: we need to discretize the probability into 1/RAND_RANGE steps,
            # because there is an intrinsic discrepancy in passing single state and batch states

            entropy_record.append(a3c.compute_entropy(action_prob[0]))

            # log time_stamp, bit_rate, buffer_size, reward
            log_file.write(str(time_stamp) + '\t' + str(VIDEO_BIT_RATE[bit_rate]) + '\t' +
                           str(buffer_size) + '\t' + str(rebuf) + '\t' +
                           str(video_chunk_size) + '\t' + str(delay) + '\t' +
                           str(reward) + '\n')
            log_file.flush()

            if len(r_batch) >= TRAIN_SEQ_LEN or end_of_video:  # do training once
                actor_gradient, critic_gradient, td_batch = \
                    a3c.compute_gradients(s_batch=np.stack(s_batch[1:], axis=0),  # ignore the first chuck
                                          a_batch=np.vstack(a_batch[1:]),  # since we don't have the
                                          r_batch=np.vstack(r_batch[1:]),  # control over it
                                          terminal=end_of_video,
                                          actor=actor, critic=critic)
                td_loss = np.mean(td_batch)

                actor_gradient_batch.append(actor_gradient)
                critic_gradient_batch.append(critic_gradient)

                print "===="
                print "Epoch", epoch
                print "TD_loss", td_loss, "Avg_reward", np.mean(r_batch), "Avg_entropy", np.mean(entropy_record)
                print "===="

                summary_str = sess.run(summary_ops, feed_dict={
                    summary_vars[0]: td_loss,
                    summary_vars[1]: np.mean(r_batch),
                    summary_vars[2]: np.mean(entropy_record)
                })

                writer.add_summary(summary_str, epoch)
                writer.flush()

                entropy_record = []

                if len(actor_gradient_batch) >= GRADIENT_BATCH_SIZE:
                    assert len(actor_gradient_batch) == len(critic_gradient_batch)
                    # assembled_actor_gradient = actor_gradient_batch[0]
                    # assembled_critic_gradient = critic_gradient_batch[0]
                    # assert len(actor_gradient_batch) == len(critic_gradient_batch)
                    # for i in xrange(len(actor_gradient_batch) - 1):
                    #     for j in xrange(len(actor_gradient)):
                    #         assembled_actor_gradient[j] += actor_gradient_batch[i][j]
                    #         assembled_critic_gradient[j] += critic_gradient_batch[i][j]
                    # actor.apply_gradients(assembled_actor_gradient)
                    # critic.apply_gradients(assembled_critic_gradient)

                    for i in xrange(len(actor_gradient_batch)):
                        actor.apply_gradients(actor_gradient_batch[i])
                        critic.apply_gradients(critic_gradient_batch[i])

                    actor_gradient_batch = []
                    critic_gradient_batch = []

                    epoch += 1
                    if epoch % MODEL_SAVE_INTERVAL == 0:
                        # Save the neural net parameters to disk.
                        save_path = saver.save(sess, SUMMARY_DIR + "/nn_model_ep_" +
                                               str(epoch) + ".ckpt")
                        print("Model saved in file: %s" % save_path)

                del s_batch[:]
                del a_batch[:]
                del r_batch[:]

            if end_of_video:
                last_bit_rate = DEFAULT_QUALITY
                bit_rate = DEFAULT_QUALITY  # use the default action here

                action_vec = np.zeros(A_DIM)
                action_vec[bit_rate] = 1

                s_batch.append(np.zeros((S_INFO, S_LEN)))
                a_batch.append(action_vec)
            else:
                s_batch.append(state)

                action_vec = np.zeros(A_DIM)
                action_vec[bit_rate] = 1
                a_batch.append(action_vec)
import env
import numpy as np

e = env.Environment(8, 8)


# 4(a)
def valueIteration(e, pe, l):
    pi = dict()
    V = np.zeros((e.L, e.W, 12))
    threshold = 1e-4
    while True:
        delta = 0
        for s in e.getStates():
            vmax = -np.Inf
            for a in e.getActions():
                v = env.expectation(e, s, a, V, pe, l)
                if v > vmax:
                    vmax = v
                    pi[s] = a
            delta = max(delta, np.abs(V[s] - vmax))
            V[s] = vmax
        print(delta)
        if delta < threshold:
            break
    return pi, V


# 4(b)
pistar, V = valueIteration(e, 0, 0.9)
tr = env.gen_traj(e, pistar, (1, 6, 6), 0)

# 4(c)
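# Hedged usage note: the returned `pistar` maps each state produced by `e.getStates()`
# to its greedy action, so a single state (such as the trajectory start used above)
# can be queried directly, e.g.:
# a_star = pistar[(1, 6, 6)]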
parser.add_argument('--mode', '-m')
parser.add_argument('--model')
# number of training episodes
parser.add_argument('--eps')
parser.add_argument('--render')
args = parser.parse_args()

if args.tensorboard:
    writer = SummaryWriter()
    write_proc = subprocess.Popen(['tensorboard', '--logdir', '{}'.format(args.tensorboard)])

env = env.Environment(args.env)

if args.alg == 'DQN':
    agent = agent.DQNAgent(env, args.mode, args.model, writer)

try:
    if args.mode == 'train':
        agent.train(int(args.eps), args.render)
    elif args.mode == 'play':
        agent.play(int(args.eps))
except KeyboardInterrupt:
    print('PROCESS KILLED BY USER')
finally:
    env.close()
    if args.tensorboard:
        write_proc.terminate()
def agent(agent_id, all_cooked_time, all_cooked_bw, net_params_queue, exp_queue):
    net_env = env.Environment(time=all_cooked_time,
                              bandwidth=all_cooked_bw,
                              random_seed=agent_id)

    with tf.Session() as sess, open(LOG_FILE + '_agent_' + str(agent_id), 'wb') as log_file:
        actor = a3c.ActorNetwork(sess,
                                 state_dim=[S_INFO, S_LEN],
                                 action_dim=A_DIM,
                                 learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess,
                                   state_dim=[S_INFO, S_LEN],
                                   learning_rate=CRITIC_LR_RATE)

        # initial synchronization of the network parameters from the coordinator
        actor_net_params, critic_net_params = net_params_queue.get()
        actor.set_network_params(actor_net_params)
        critic.set_network_params(critic_net_params)

        last_bit_rate = DEFAULT_QUALITY
        bit_rate = DEFAULT_QUALITY

        action_vec = np.zeros(A_DIM)
        action_vec[bit_rate] = 1

        s_batch = [np.zeros((S_INFO, S_LEN))]
        a_batch = [action_vec]
        r_batch = []
        entropy_record = []

        # need to initialize, and get before simulation step
        track_index = []
        hm = head_movement.move_prediction()

        time_stamp = 0
        while True:  # experience video streaming forever
            # the action is from the last decision
            # this is to make the framework similar to the real
            # xgw 20180918: need to modify here
            estimate_track_index = hm.get_head_movement_prediction()
            # actual_track_index = hm.get_head_movement_current()
            actual_track_index = [2, 3, 5, 6]

            delay, rebuf, buffer_size, sleep_time, video_chunk_size, end_of_video = \
                net_env.get_video_chunk(bit_rate, estimate_track_index)

            time_stamp += delay  # in ms
            time_stamp += sleep_time  # in ms

            # -- linear reward --
            # reward is video quality - rebuffer penalty - smoothness
            # xgw 20180918: need to modify the reward and add the quality consistency
            # in the viewport, as well as the buffer; the consistency of quality in the
            # viewport is really the head-movement prediction error, so it is not certain
            # whether "quality consistency" should be added here, and it is unclear how
            # to model the qp as the first input
            reward = VIDEO_BIT_RATE[bit_rate] / M_IN_K \
                - REBUF_PENALTY * rebuf \
                - SMOOTH_PENALTY * np.abs(VIDEO_BIT_RATE[bit_rate] -
                                          VIDEO_BIT_RATE[last_bit_rate]) / M_IN_K

            # bit_rate_log_reward = np.log((bit_rate + 1) / A_DIM) * BIT_RATE_REWARD_PARAMETER
            # smooth_p = np.exp(np.abs(last_bit_rate - bit_rate) / A_DIM) * SMOOTH_PENALTY
            # reward = bit_rate - REBUF_PENALTY * rebuf - smooth_p

            r_batch.append(reward)

            last_bit_rate = bit_rate

            # retrieve previous state
            if len(s_batch) == 0:
                state = [np.zeros((S_INFO, S_LEN))]
            else:
                state = np.array(s_batch[-1], copy=True)

            # dequeue history record
            state = np.roll(state, -1, axis=1)

            # this should be S_INFO number of terms
            # state[0, -1] = VIDEO_BIT_RATE[bit_rate] / float(np.max(VIDEO_BIT_RATE))  # last quality
            # state[1, -1] = buffer_size / BUFFER_NORM_FACTOR  # 6 sec
            # state[2, -1] = float(video_chunk_size) / float(delay) / M_IN_K  # kilo byte / ms
            # state[3, -1] = float(delay) / M_IN_K / BUFFER_NORM_FACTOR  # 10 sec
            # state[4, :A_DIM] = np.array(next_video_chunk_sizes) / M_IN_K / M_IN_K  # mega byte
            # state[5, -1] = np.minimum(video_chunk_remain, CHUNK_TIL_VIDEO_END_CAP) / float(CHUNK_TIL_VIDEO_END_CAP)
            state[0, -1] = float(video_chunk_size) / float(delay) / M_IN_K  # kilo byte / ms
            state[1, -1] = buffer_size / BUFFER_NORM_FACTOR  # 6 sec
            state[2, :4] = np.array(actual_track_index)
            state[3, -1] = VIDEO_BIT_RATE[bit_rate] / float(np.max(VIDEO_BIT_RATE))  # last chunk's bitrate

            # compute action probability vector
            action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN)))
            action_cumsum = np.cumsum(action_prob)
            bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()
            # Note: we need to discretize the probability into 1/RAND_RANGE steps,
            # because there is an intrinsic discrepancy in passing single state and batch states

            entropy_record.append(a3c.compute_entropy(action_prob[0]))

            # log time_stamp, bit_rate, buffer_size, reward
            log_file.write('time_stamp: ' + str(time_stamp) + '\t' +
                           'VIDEO_BIT_RATE: ' + str(VIDEO_BIT_RATE[bit_rate]) + '\t' +
                           'buffer_size: ' + str(buffer_size) + '\t' +
                           'rebuf: ' + str(rebuf) + '\t' +
                           'video_chunk_size: ' + str(video_chunk_size) + '\t' +
                           'delay: ' + str(delay) + '\t' +
                           'avg throughput: ' + str(video_chunk_size / delay) + '\t' +
                           'reward: ' + str(reward) + '\n')
            log_file.flush()

            # report experience to the coordinator
            if len(r_batch) >= TRAIN_SEQ_LEN or end_of_video:
                exp_queue.put([s_batch[1:],  # ignore the first chuck
                               a_batch[1:],  # since we don't have the
                               r_batch[1:],  # control over it
                               end_of_video,
                               {'entropy': entropy_record}])

                # synchronize the network parameters from the coordinator
                actor_net_params, critic_net_params = net_params_queue.get()
                actor.set_network_params(actor_net_params)
                critic.set_network_params(critic_net_params)

                del s_batch[:]
                del a_batch[:]
                del r_batch[:]
                del entropy_record[:]

                log_file.write('\n')  # so that in the log we know where video ends

            # store the state and action into batches
            if end_of_video:
                last_bit_rate = DEFAULT_QUALITY
                bit_rate = DEFAULT_QUALITY  # use the default action here

                action_vec = np.zeros(A_DIM)
                action_vec[bit_rate] = 1

                s_batch.append(np.zeros((S_INFO, S_LEN)))
                a_batch.append(action_vec)
            else:
                s_batch.append(state)

                action_vec = np.zeros(A_DIM)
                action_vec[bit_rate] = 1
                a_batch.append(action_vec)
parser = argparse.ArgumentParser("Record states.")
parser.add_argument("-s", help="state file", type=str, required=True)
parser.add_argument("-o", help="output folder", type=str, required=True)
args = parser.parse_args()

if not os.path.exists(args.o):
    os.makedirs(args.o)

# INITIALIZE ROSNODE
rospy.init_node("test_node", anonymous=True)
rate = rospy.Rate(100)
rospy.sleep(1.0)
robot = torobo_wrapper.Torobo()
rospy.sleep(1.0)

# INITIALIZE ENVIRONMENT
objects = ["target_plate", "small_cube"]
random_ranges = {
    "target_plate": np.array([[0.32, 0.52], [0.30, 0.50], [1.125, 1.125]]),
    "small_cube": np.array([[0.32, 0.52], [0.0, 0.15], [1.155, 1.155]]),
}
world = env.Environment(robot=robot, objects=objects, rng_ranges=random_ranges)
rospy.sleep(0.5)

states = torch.load(args.s)

for i, s in enumerate(states):
    s = s.tolist()
    world.set_model_state("target_plate", s[:3], s[3:7])
    world.set_model_state("small_cube", s[7:10], s[10:14])
    os.system("import -window Gazebo %s/%d.jpg" % (args.o, i))
def agent(agent_id, all_cooked_time, all_cooked_bw, net_params_queue, exp_queue):
    net_env = env.Environment(all_cooked_time=all_cooked_time,
                              all_cooked_bw=all_cooked_bw,
                              random_seed=agent_id)

    with tf.Session() as sess, open(LOG_FILE + '_agent_' + str(agent_id), 'w') as log_file:
        actor = a3c.ActorNetwork(sess,
                                 state_dim=[S_INFO, S_LEN],
                                 action_dim=A_DIM,
                                 learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess,
                                   state_dim=[S_INFO, S_LEN],
                                   learning_rate=CRITIC_LR_RATE)

        # initial synchronization of the network parameters from the coordinator
        actor_net_params, critic_net_params = net_params_queue.get()
        actor.set_network_params(actor_net_params)
        critic.set_network_params(critic_net_params)

        last_bit_rate = DEFAULT_QUALITY
        bit_rate = DEFAULT_QUALITY

        action_vec = np.zeros(A_DIM)
        action_vec[bit_rate] = 1

        s_batch = [np.zeros((S_INFO, S_LEN))]
        a_batch = [action_vec]
        r_batch = []
        entropy_record = []

        time_stamp = 0
        while True:  # experience video streaming forever
            # the action is from the last decision
            # this is to make the framework similar to the real
            delay, sleep_time, buffer_size, rebuf, \
                video_chunk_size, next_video_chunk_sizes, \
                end_of_video, video_chunk_remain = \
                net_env.get_video_chunk(bit_rate)

            time_stamp += delay  # in ms
            time_stamp += sleep_time  # in ms

            # -- linear reward --
            # reward is video quality - rebuffer penalty - smoothness
            # reward = \
            #     VIDEO_BIT_RATE[bit_rate] / M_IN_K \
            #     - REBUF_PENALTY * rebuf \
            #     - SMOOTH_PENALTY * np.abs(VIDEO_BIT_RATE[bit_rate] -
            #                               VIDEO_BIT_RATE[last_bit_rate]) / M_IN_K

            # -- log scale reward --
            # log_bit_rate = np.log(VIDEO_BIT_RATE[bit_rate] / float(VIDEO_BIT_RATE[-1]))
            # log_last_bit_rate = np.log(VIDEO_BIT_RATE[last_bit_rate] / float(VIDEO_BIT_RATE[-1]))
            # reward = log_bit_rate \
            #     - REBUF_PENALTY * rebuf \
            #     - SMOOTH_PENALTY * np.abs(log_bit_rate - log_last_bit_rate)

            # -- HD reward --
            reward = HD_REWARD[bit_rate] \
                - REBUF_PENALTY * rebuf \
                - SMOOTH_PENALTY * np.abs(HD_REWARD[bit_rate] - HD_REWARD[last_bit_rate])

            r_batch.append(reward)

            last_bit_rate = bit_rate

            # retrieve previous state
            if len(s_batch) == 0:
                state = [np.zeros((S_INFO, S_LEN))]
            else:
                state = np.array(s_batch[-1], copy=True)

            # dequeue history record
            state = np.roll(state, -1, axis=1)

            # this should be S_INFO number of terms
            state[0, -1] = VIDEO_BIT_RATE[bit_rate] / float(np.max(VIDEO_BIT_RATE))  # last quality
            state[1, -1] = buffer_size / BUFFER_NORM_FACTOR  # 10 sec
            state[2, -1] = float(video_chunk_size) / float(delay) / M_IN_K  # kilo byte / ms
            state[3, -1] = float(delay) / M_IN_K / BUFFER_NORM_FACTOR  # 10 sec
            state[4, :A_DIM] = np.array(next_video_chunk_sizes) / M_IN_K / M_IN_K  # mega byte
            state[5, -1] = np.minimum(video_chunk_remain,
                                      CHUNK_TIL_VIDEO_END_CAP) / float(CHUNK_TIL_VIDEO_END_CAP)

            # compute action probability vector
            action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN)))
            action_cumsum = np.cumsum(action_prob)
            bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()
            # Note: we need to discretize the probability into 1/RAND_RANGE steps,
            # because there is an intrinsic discrepancy in passing single state and batch states

            entropy_record.append(a3c.compute_entropy(action_prob[0]))

            # log time_stamp, bit_rate, buffer_size, reward
            log_file.write(str(time_stamp) + '\t' + str(VIDEO_BIT_RATE[bit_rate]) + '\t' +
                           str(buffer_size) + '\t' + str(rebuf) + '\t' +
                           str(video_chunk_size) + '\t' + str(delay) + '\t' +
                           str(reward) + '\n')
            log_file.flush()

            # report experience to the coordinator
            if len(r_batch) >= TRAIN_SEQ_LEN or end_of_video:
                exp_queue.put([s_batch[1:],  # ignore the first chuck
                               a_batch[1:],  # since we don't have the
                               r_batch[1:],  # control over it
                               end_of_video,
                               {'entropy': entropy_record}])

                # synchronize the network parameters from the coordinator
                actor_net_params, critic_net_params = net_params_queue.get()
                actor.set_network_params(actor_net_params)
                critic.set_network_params(critic_net_params)

                del s_batch[:]
                del a_batch[:]
                del r_batch[:]
                del entropy_record[:]

                log_file.write('\n')  # so that in the log we know where video ends

            # store the state and action into batches
            if end_of_video:
                last_bit_rate = DEFAULT_QUALITY
                bit_rate = DEFAULT_QUALITY  # use the default action here

                action_vec = np.zeros(A_DIM)
                action_vec[bit_rate] = 1

                s_batch.append(np.zeros((S_INFO, S_LEN)))
                a_batch.append(action_vec)
            else:
                s_batch.append(state)

                action_vec = np.zeros(A_DIM)
                action_vec[bit_rate] = 1
                a_batch.append(action_vec)
def agent(agent_id, all_cooked_time, all_cooked_bw, net_params_queue, exp_queue, model_type):
    torch.set_num_threads(1)

    net_env = env.Environment(all_cooked_time=all_cooked_time,
                              all_cooked_bw=all_cooked_bw,
                              random_seed=agent_id)

    with open(LOG_FILE + '_agent_' + str(agent_id), 'w') as log_file:
        net = A3C(NO_CENTRAL, model_type, [S_INFO, S_LEN], A_DIM, ACTOR_LR_RATE, CRITIC_LR_RATE)

        # initial synchronization of the network parameters from the coordinator
        time_stamp = 0
        for epoch in range(TOTALEPOCH):
            actor_net_params = net_params_queue.get()
            net.hardUpdateActorNetwork(actor_net_params)

            last_bit_rate = DEFAULT_QUALITY
            bit_rate = DEFAULT_QUALITY

            s_batch = []
            a_batch = []
            r_batch = []
            entropy_record = []

            state = torch.zeros((1, S_INFO, S_LEN))

            # the action is from the last decision
            # this is to make the framework similar to the real
            delay, sleep_time, buffer_size, rebuf, \
                video_chunk_size, next_video_chunk_sizes, \
                end_of_video, video_chunk_remain = \
                net_env.get_video_chunk(bit_rate)

            time_stamp += delay  # in ms
            time_stamp += sleep_time  # in ms

            while not end_of_video and len(s_batch) < TRAIN_SEQ_LEN:
                last_bit_rate = bit_rate

                state = state.clone().detach()
                state = torch.roll(state, -1, dims=-1)

                state[0, 0, -1] = VIDEO_BIT_RATE[bit_rate] / float(np.max(VIDEO_BIT_RATE))  # last quality
                state[0, 1, -1] = buffer_size / BUFFER_NORM_FACTOR  # 10 sec
                state[0, 2, -1] = float(video_chunk_size) / float(delay) / M_IN_K  # kilo byte / ms
                state[0, 3, -1] = float(delay) / M_IN_K / BUFFER_NORM_FACTOR  # 10 sec
                state[0, 4, :A_DIM] = torch.tensor(next_video_chunk_sizes) / M_IN_K / M_IN_K  # mega byte
                state[0, 5, -1] = min(video_chunk_remain,
                                      CHUNK_TIL_VIDEO_END_CAP) / float(CHUNK_TIL_VIDEO_END_CAP)

                bit_rate = net.actionSelect(state)
                # Note: we need to discretize the probability into 1/RAND_RANGE steps,
                # because there is an intrinsic discrepancy in passing single state and batch states
                delay, sleep_time, buffer_size, rebuf, \
                    video_chunk_size, next_video_chunk_sizes, \
                    end_of_video, video_chunk_remain = \
                    net_env.get_video_chunk(bit_rate)

                reward = VIDEO_BIT_RATE[bit_rate] / M_IN_K \
                    - REBUF_PENALTY * rebuf \
                    - SMOOTH_PENALTY * np.abs(VIDEO_BIT_RATE[bit_rate] -
                                              VIDEO_BIT_RATE[last_bit_rate]) / M_IN_K

                s_batch.append(state)
                a_batch.append(bit_rate)
                r_batch.append(reward)
                entropy_record.append(3)

                # log time_stamp, bit_rate, buffer_size, reward
                log_file.write(str(time_stamp) + '\t' + str(VIDEO_BIT_RATE[bit_rate]) + '\t' +
                               str(buffer_size) + '\t' + str(rebuf) + '\t' +
                               str(video_chunk_size) + '\t' + str(delay) + '\t' +
                               str(reward) + '\n')
                log_file.flush()

            exp_queue.put([s_batch,  # ignore the first chuck
                           a_batch,  # since we don't have the
                           r_batch,  # control over it
                           end_of_video,
                           {'entropy': entropy_record}])

            log_file.write('\n')  # so that in the log we know where video ends
args = parser.parse_args()

# loading/problem
load = HalfBeam(nelx, nely)

# optimizer
verbose = True
fesolver = CooFESolver(verbose=verbose)
optimizer = None
density_constraint = None

if args.optimizer:
    if str(args.optimizer) == 'mas':
        optimizer = env.Environment(fesolver, young, poisson, verbose=verbose)
        # constraints
        density_constraint = DensityConstraint(volume_frac=1.0,
                                               density_min=xmin,
                                               density_max=xmax)
    if str(args.optimizer) == 'mas_ke':
        optimizer = env_ke.Environment(fesolver, young, poisson, verbose=verbose)
        # constraints
        density_constraint = DensityConstraint(volume_frac=1.0,
                                               density_min=xmin,
                                               density_max=xmax)
    if str(args.optimizer) == 'oc':
        if done:
            self.dispatch.stop()
            self.env.reset()
            self.solver.experience_replay(iteration, self.episodes)
            self.solver.save_model()
            if (self.episodes % 2 == 0):
                self.solver.update_model()
            self.episodes += 1


# #######################
# # Main control center #
# #######################

# This object centralizes everything
theDispatcher = dispatcher.Dispatcher()

# Provide a new environment (maze + agent)
theDispatcher.setEnvironment(env.Environment(12, 8, 15))

# Provide also the simulation stepper, which needs access to the
# agent and maze in the dispatcher.
theDispatcher.setStepper(TrainSolver(theDispatcher))

# Start the GUI and run it until quit is selected
# (Remember Ctrl+\ forces python to quit, in case it is necessary)
theDispatcher.run()