def main():
    pa = parameters.Parameters()  # initialize parameters

    type_exp = 'pg_re'  # 'pg_su', 'pg_su_compact', 'v_su', 'pg_v_re', 'pg_re', 'q_re', 'test'

    pg_resume = None
    v_resume = None
    q_resume = None
    log = None

    render = False

    try:
        opts, args = getopt.getopt(
            # NOTE: only -h, -i, -o are registered as short options;
            # every other flag must be passed in long form (--name=value)
            sys.argv[1:], "hi:o:",
            ["exp_type=", "num_res=", "num_nw=", "simu_len=", "num_ex=",
             "num_seq_per_batch=", "eps_max_len=", "num_epochs=",
             "time_horizon=", "res_slot=", "max_job_len=", "max_job_size=",
             "new_job_rate=", "dist=", "lr_rate=", "ba_size=", "pg_re=",
             "v_re=", "q_re=", "out_freq=", "ofile=", "log=", "render=",
             "unseen="])
    except getopt.GetoptError:
        script_usage()
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            script_usage()
            sys.exit()
        elif opt in ("-e", "--exp_type"):
            type_exp = arg
        elif opt in ("-n", "--num_res"):
            pa.num_res = int(arg)
        elif opt in ("-w", "--num_nw"):
            pa.num_nw = int(arg)
        elif opt in ("-s", "--simu_len"):
            pa.simu_len = int(arg)
        elif opt in ("-x", "--num_ex"):  # was "-n", which collided with --num_res
            pa.num_ex = int(arg)
        elif opt in ("-sp", "--num_seq_per_batch"):
            pa.num_seq_per_batch = int(arg)
        elif opt in ("-el", "--eps_max_len"):
            pa.episode_max_length = int(arg)
        elif opt in ("-ne", "--num_epochs"):
            pa.num_epochs = int(arg)
        elif opt in ("-t", "--time_horizon"):
            pa.time_horizon = int(arg)
        elif opt in ("-rs", "--res_slot"):
            pa.res_slot = int(arg)
        elif opt in ("-ml", "--max_job_len"):
            pa.max_job_len = int(arg)
        elif opt in ("-ms", "--max_job_size"):
            pa.max_job_size = int(arg)
        elif opt in ("-nr", "--new_job_rate"):
            pa.new_job_rate = float(arg)
        elif opt in ("-d", "--dist"):
            pa.discount = float(arg)
        elif opt in ("-l", "--lr_rate"):
            pa.lr_rate = float(arg)
        elif opt in ("-b", "--ba_size"):
            pa.batch_size = int(arg)
        elif opt in ("-p", "--pg_re"):
            pg_resume = arg
        elif opt in ("-v", "--v_re"):
            v_resume = arg
        elif opt in ("-q", "--q_re"):
            q_resume = arg
        elif opt in ("-f", "--out_freq"):
            pa.output_freq = int(arg)
        elif opt in ("-o", "--ofile"):
            pa.output_filename = arg
        elif opt in ("-lg", "--log"):
            log = arg
        elif opt in ("-r", "--render"):
            render = (arg == 'True')
        elif opt in ("-u", "--unseen"):
            pa.generate_unseen = (arg == 'True')
        else:
            script_usage()
            sys.exit()

    pa.compute_dependent_parameters()

    if type_exp == 'pg_su':
        pg_su.launch(pa, pg_resume, render, repre='image', end='all_done')
    elif type_exp == 'v_su':
        v_su.launch(pa, v_resume, render)
    elif type_exp == 'pg_re':
        pg_re.launch(pa, pg_resume, render, repre='image', end='all_done')
    elif type_exp == 'pg_v_re':
        pg_v_re.launch(pa, pg_resume, v_resume, render)
    elif type_exp == 'test':
        # quick_test.launch(pa, pg_resume, render)
        slow_down_cdf.launch(pa, pg_resume, render, True)
    # elif type_exp == 'q_re':
    #     q_re.launch(pa, q_resume, render)
    else:
        print("Error: unknown experiment type " + str(type_exp))
        sys.exit(1)
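# Example invocation (a hypothetical command line; the script name
# launcher.py is an assumption, but the flags match the long options
# parsed above):
#
#   python launcher.py --exp_type=pg_re --num_res=2 --simu_len=50 \
#       --num_ex=10 --ofile=data/pg_re --out_freq=50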
def launch(pa, pg_resume=None, render=False, repre='image', end='no_new_job'):

    # ----------------------------
    print("Preparing for workers...")
    # ----------------------------

    pg_learners = []
    envs = []

    nw_len_seqs, nw_size_seqs = job_distribution.generate_sequence_work(
        pa, seed=42)

    # create an environment for each of the num_ex job sets/sequences
    for ex in xrange(pa.num_ex):
        print "-prepare for env-", ex

        env = environment.Env(pa, nw_len_seqs=nw_len_seqs,
                              nw_size_seqs=nw_size_seqs,
                              render=False, repre=repre, end=end)
        env.seq_no = ex
        envs.append(env)

    # create one policy gradient agent per batch slot;
    # the last worker holds the parameters being updated
    for ex in xrange(pa.batch_size + 1):
        print "-prepare for worker-", ex

        pg_learner = pg_network.PGLearner(pa)

        if pg_resume is not None:
            net_handle = open(pg_resume, 'rb')
            net_params = cPickle.load(net_handle)
            pg_learner.set_net_params(net_params)

        pg_learners.append(pg_learner)

    accums = init_accums(pg_learners[pa.batch_size])

    # --------------------------------------
    print("Preparing for reference data...")
    # --------------------------------------

    ref_discount_rews, ref_slow_down = slow_down_cdf.launch(
        pa, pg_resume=None, render=False, plot=False, repre=repre, end=end)

    mean_rew_lr_curve = []
    max_rew_lr_curve = []
    slow_down_lr_curve = []

    # --------------------------------------
    print("Start training...")
    # --------------------------------------

    timer_start = time.time()

    for iteration in xrange(1, pa.num_epochs):

        # one process per trajectory batch; a Manager shares results
        # across processes
        ps = []  # processes
        manager = Manager()  # managing return results
        manager_result = manager.list([])

        ex_indices = range(pa.num_ex)
        np.random.shuffle(ex_indices)

        all_eprews = []
        grads_all = []
        loss_all = []
        eprews = []
        eplens = []
        all_slowdown = []
        all_entropy = []

        ex_counter = 0

        # for each jobset
        for ex in xrange(pa.num_ex):

            ex_idx = ex_indices[ex]

            # evaluate several trajectory instances for the set of PG agents
            p = Process(target=get_traj_worker,
                        args=(pg_learners[ex_counter], envs[ex_idx], pa,
                              manager_result, ))
            ps.append(p)

            ex_counter += 1

            if ex_counter >= pa.batch_size or ex == pa.num_ex - 1:

                print ex, "out of", pa.num_ex

                ex_counter = 0

                for p in ps:
                    p.start()

                for p in ps:
                    p.join()

                result = []  # convert list from shared memory
                for r in manager_result:
                    result.append(r)

                ps = []
                manager_result = manager.list([])

                all_ob = concatenate_all_ob_across_examples(
                    [r["all_ob"] for r in result], pa)
                all_action = np.concatenate([r["all_action"] for r in result])
                all_adv = np.concatenate([r["all_adv"] for r in result])

                # Do policy gradient update step, using the last agent:
                # accumulate the gradients here, then propagate the updated
                # parameters to all workers at the end of the iteration
                grads = pg_learners[pa.batch_size].get_grad(
                    all_ob, all_action, all_adv)

                grads_all.append(grads)

                all_eprews.extend([r["all_eprews"] for r in result])

                eprews.extend(np.concatenate([r["all_eprews"] for r in
                                              result]))  # episode total rewards
                eplens.extend(np.concatenate([r["all_eplens"] for r in
                                              result]))  # episode lengths

                all_slowdown.extend(
                    np.concatenate([r["all_slowdown"] for r in result]))
                all_entropy.extend(
                    np.concatenate([r["all_entropy"] for r in result]))

        # assemble gradients
        grads = grads_all[0]
        for i in xrange(1, len(grads_all)):
            for j in xrange(len(grads)):
                grads[j] += grads_all[i][j]

        # propagate network parameters to others
        params = pg_learners[pa.batch_size].get_params()

        rmsprop_updates_outside(grads, params, accums, pa.lr_rate,
                                pa.rms_rho, pa.rms_eps)

        for i in xrange(pa.batch_size + 1):
            pg_learners[i].set_net_params(params)

        timer_end = time.time()

        print "-----------------"
        print "Iteration: \t %i" % iteration
        print "NumTrajs: \t %i" % len(eprews)
        print "NumTimesteps: \t %i" % np.sum(eplens)
        # print "Loss: \t %s" % np.mean(loss_all)
        print "MaxRew: \t %s" % np.average([np.max(rew) for rew in all_eprews])
        print "MeanRew: \t %s +- %s" % (np.mean(eprews), np.std(eprews))
        print "MeanSlowdown: \t %s" % np.mean(all_slowdown)
        print "MeanLen: \t %s +- %s" % (np.mean(eplens), np.std(eplens))
        print "MeanEntropy \t %s" % (np.mean(all_entropy))
        print "Elapsed time\t %s" % (timer_end - timer_start), "seconds"
        print "-----------------"

        timer_start = time.time()

        max_rew_lr_curve.append(np.average([np.max(rew) for rew in all_eprews]))
        mean_rew_lr_curve.append(np.mean(eprews))
        slow_down_lr_curve.append(np.mean(all_slowdown))

        if iteration % pa.output_freq == 0:
            param_file = open(
                pa.output_filename + '_' + str(iteration) + '.pkl', 'wb')
            cPickle.dump(pg_learners[pa.batch_size].get_params(), param_file, -1)
            param_file.close()

            pa.unseen = True
            slow_down_cdf.launch(pa,
                                 pa.output_filename + '_' + str(iteration) + '.pkl',
                                 render=False, plot=True, repre=repre, end=end)
            pa.unseen = False  # test on unseen examples

            plot_lr_curve(pa.output_filename,
                          max_rew_lr_curve, mean_rew_lr_curve,
                          slow_down_lr_curve,
                          ref_discount_rews, ref_slow_down)
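# A sketch of the get_traj_worker() helper spawned by the Processes above
# (reconstructed from the per-example logic of the single-process launch
# further below; the exact original may differ). It rolls out
# pa.num_seq_per_batch trajectories, computes a time-dependent baseline and
# advantages, and appends the batch to the shared result list:
def get_traj_worker(pg_learner, env, pa, result):
    trajs = []
    for i in xrange(pa.num_seq_per_batch):
        trajs.append(get_traj(pg_learner, env, pa.episode_max_length))

    all_ob = concatenate_all_ob(trajs, pa)

    # discounted returns, padded to a common length for the baseline
    rets = [discount(traj["reward"], pa.discount) for traj in trajs]
    maxlen = max(len(ret) for ret in rets)
    padded_rets = [np.concatenate([ret, np.zeros(maxlen - len(ret))])
                   for ret in rets]

    baseline = np.mean(padded_rets, axis=0)  # time-dependent baseline
    advs = [ret - baseline[:len(ret)] for ret in rets]

    enter_time, finish_time, job_len = process_all_info(trajs)
    finished_idx = (finish_time >= 0)

    result.append({
        "all_ob": all_ob,
        "all_action": np.concatenate([traj["action"] for traj in trajs]),
        "all_adv": np.concatenate(advs),
        "all_eprews": np.array([ret[0] for ret in rets]),
        "all_eplens": np.array([len(traj["reward"]) for traj in trajs]),
        "all_slowdown": (finish_time[finished_idx] -
                         enter_time[finished_idx]) / job_len[finished_idx],
        "all_entropy": np.concatenate([traj["entropy"] for traj in trajs]),
    })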
def launch(pa, pg_resume=None, render=False, repre='image', end='no_new_job'):

    env = environment.Env(pa, render=render, repre=repre, end=end)

    pg_learner = pg_network.PGLearner(pa)

    if pg_resume is not None:
        net_handle = open(pg_resume, 'rb')
        net_params = cPickle.load(net_handle)
        pg_learner.set_net_params(net_params)

    # ----------------------------
    print("Preparing for data...")
    # ----------------------------

    ref_discount_rews, ref_slow_down = slow_down_cdf.launch(
        pa, pg_resume=None, render=False, plot=False, repre=repre, end=end)

    mean_rew_lr_curve = []
    max_rew_lr_curve = []
    slow_down_lr_curve = []

    timer_start = time.time()

    for iteration in range(pa.num_epochs):

        all_ob = []
        all_action = []
        all_adv = []
        all_eprews = []
        all_eplens = []
        all_slowdown = []
        all_entropy = []

        # go through all examples
        for ex in range(pa.num_ex):

            # Collect trajectories until we get timesteps_per_batch total timesteps
            trajs = []
            for i in range(pa.num_seq_per_batch):
                traj = get_traj(pg_learner, env, pa.episode_max_length)
                trajs.append(traj)

            # roll to next example
            env.seq_no = (env.seq_no + 1) % env.pa.num_ex

            all_ob.append(concatenate_all_ob(trajs, pa))

            # Compute discounted sums of rewards
            rets = [discount(traj["reward"], pa.discount) for traj in trajs]
            maxlen = max(len(ret) for ret in rets)
            padded_rets = [np.concatenate([ret, np.zeros(maxlen - len(ret))])
                           for ret in rets]

            # Compute time-dependent baseline
            baseline = np.mean(padded_rets, axis=0)

            # Compute advantage function
            advs = [ret - baseline[:len(ret)] for ret in rets]

            all_action.append(np.concatenate([traj["action"] for traj in trajs]))
            all_adv.append(np.concatenate(advs))

            all_eprews.append(
                np.array([discount(traj["reward"], pa.discount)[0]
                          for traj in trajs]))  # episode total rewards
            all_eplens.append(np.array([len(traj["reward"])
                                        for traj in trajs]))  # episode lengths

            # All Job Stat
            enter_time, finish_time, job_len = process_all_info(trajs)
            finished_idx = (finish_time >= 0)
            all_slowdown.append(
                (finish_time[finished_idx] - enter_time[finished_idx]) /
                job_len[finished_idx])

            # Action prob entropy
            # (was np.concatenate([traj["entropy"]]), which only picked up the
            # last trajectory; collect entropy from every trajectory instead)
            all_entropy.append(np.concatenate([traj["entropy"]
                                               for traj in trajs]))

        all_ob = concatenate_all_ob_across_examples(all_ob, pa)
        all_action = np.concatenate(all_action)
        all_adv = np.concatenate(all_adv)

        # Do policy gradient update step
        loss = pg_learner.train(all_ob, all_action, all_adv)
        eprews = np.concatenate(all_eprews)  # episode total rewards
        eplens = np.concatenate(all_eplens)  # episode lengths

        all_slowdown = np.concatenate(all_slowdown)
        all_entropy = np.concatenate(all_entropy)

        timer_end = time.time()

        print "-----------------"
        print "Iteration: \t %i" % iteration
        print "NumTrajs: \t %i" % len(eprews)
        print "NumTimesteps: \t %i" % np.sum(eplens)
        print "Loss: \t %s" % loss
        print "MaxRew: \t %s" % np.average([np.max(rew) for rew in all_eprews])
        print "MeanRew: \t %s +- %s" % (eprews.mean(), eprews.std())
        print "MeanSlowdown: \t %s" % np.mean(all_slowdown)
        print "MeanLen: \t %s +- %s" % (eplens.mean(), eplens.std())
        print "MeanEntropy \t %s" % (np.mean(all_entropy))
        print "Elapsed time\t %s" % (timer_end - timer_start), "seconds"
        print "-----------------"

        timer_start = time.time()

        max_rew_lr_curve.append(np.average([np.max(rew) for rew in all_eprews]))
        mean_rew_lr_curve.append(eprews.mean())
        slow_down_lr_curve.append(np.mean(all_slowdown))

        if iteration % pa.output_freq == 0:
            param_file = open(
                pa.output_filename + '_' + str(iteration) + '.pkl', 'wb')
            cPickle.dump(pg_learner.get_params(), param_file, -1)
            param_file.close()

            slow_down_cdf.launch(pa,
                                 pa.output_filename + '_' + str(iteration) + '.pkl',
                                 render=False, plot=True, repre=repre, end=end)

            plot_lr_curve(pa.output_filename,
                          max_rew_lr_curve, mean_rew_lr_curve,
                          slow_down_lr_curve,
                          ref_discount_rews, ref_slow_down)
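# For reference, a minimal sketch of the discount() helper the launches rely
# on (assumed behavior: out[t] = x[t] + gamma * out[t+1], i.e. the discounted
# cumulative sum of rewards from each timestep onward, so out[0] is the
# episode's total discounted reward):
def discount(x, gamma):
    out = np.zeros(len(x))
    out[-1] = x[-1]
    for i in reversed(range(len(x) - 1)):
        out[i] = x[i] + gamma * out[i + 1]
    return out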
def launch(pa, pg_resume=None, render=False, repre='image', end='no_new_job'):

    # ----------------------------
    print("Preparing for workers...")
    # ----------------------------

    # dimension of state space
    # NOTE: we have to flatten the images before sending them into the network
    state_dim = pa.network_input_height * pa.network_input_width

    # number of actions
    num_actions = pa.network_output_dim

    # initialize the q network
    sess = tf.Session()
    optimizer = tf.train.RMSPropOptimizer(learning_rate=0.0001, decay=0.9)
    q_learner = DeepQLearner(session=sess,
                             optimizer=optimizer,
                             q_network=build_q_learner,
                             state_dim=state_dim,
                             num_actions=num_actions,
                             discount_factor=pa.discount)

    envs = []

    # generate_sequence_work returns both length and size sequences in the
    # other launchers; unpack and pass both here as well
    nw_len_seqs, nw_size_seqs = job_distribution.generate_sequence_work(
        pa, seed=42)

    # create an environment for each of the num_ex job sets/sequences
    for ex in xrange(pa.num_ex):
        print "-prepare for env-", ex

        env = environment.Env(pa, nw_len_seqs=nw_len_seqs,
                              nw_size_seqs=nw_size_seqs,
                              render=False, repre=repre, end=end)
        env.seq_no = ex
        envs.append(env)

    # (the per-batch policy gradient workers, shared-memory Manager, and
    # rmsprop gradient assembly from pg_re are disabled in this Q-learning
    # variant; trajectories are collected sequentially instead)

    # --------------------------------------
    print("Preparing for reference data...")
    # --------------------------------------

    ref_discount_rews, ref_slow_down = slow_down_cdf.launch(
        pa, pg_resume=None, render=False, plot=False, repre=repre, end=end)

    mean_rew_lr_curve = []
    max_rew_lr_curve = []
    slow_down_lr_curve = []

    # --------------------------------------
    print("Start training...")
    # --------------------------------------

    timer_start = time.time()

    for iteration in xrange(1, pa.num_epochs):

        ex_indices = range(pa.num_ex)
        np.random.shuffle(ex_indices)

        all_eprews = []
        loss_all = []
        eprews = []
        eplens = []
        all_slowdown = []

        # for each jobset
        for ex in xrange(pa.num_ex):

            ex_idx = ex_indices[ex]
            current_env = envs[ex_idx]

            man_result = []
            get_traj_worker(q_learner, current_env, pa, man_result)

            all_eprews.extend([r["all_eprews"] for r in man_result])
            eprews.extend(np.concatenate([r["all_eprews"] for r in
                                          man_result]))  # episode total rewards
            eplens.extend(np.concatenate([r["all_eplens"] for r in
                                          man_result]))  # episode lengths
            all_slowdown.extend(np.concatenate([r["all_slowdown"]
                                                for r in man_result]))

        timer_end = time.time()

        print "-----------------"
        print "Iteration: \t %i" % iteration
        print "NumTrajs: \t %i" % len(eprews)
        print "NumTimesteps: \t %i" % np.sum(eplens)
        # print "Loss: \t %s" % np.mean(loss_all)
        print "MaxRew: \t %s" % np.average([np.max(rew) for rew in all_eprews])
        print "MeanRew: \t %s +- %s" % (np.mean(eprews), np.std(eprews))
        print "MeanSlowdown: \t %s" % np.mean(all_slowdown)
        print "MeanLen: \t %s +- %s" % (np.mean(eplens), np.std(eplens))
        print "Elapsed time\t %s" % (timer_end - timer_start), "seconds"
        print "-----------------"

        timer_start = time.time()

        max_rew_lr_curve.append(np.average([np.max(rew) for rew in all_eprews]))
        mean_rew_lr_curve.append(np.mean(eprews))
        slow_down_lr_curve.append(np.mean(all_slowdown))

        if iteration % pa.output_freq == 0:
            # pickle-based parameter checkpointing from pg_re is disabled
            # here; the live q_learner is handed to the evaluation instead
            pa.unseen = True
            slow_down_cdf.launch(pa,
                                 pa.output_filename + '_' + str(iteration) + '.pkl',
                                 render=False, plot=True, repre=repre, end=end,
                                 q_resume=q_learner)
            pa.unseen = False  # test on unseen examples

            plot_lr_curve(pa.output_filename,
                          max_rew_lr_curve, mean_rew_lr_curve,
                          slow_down_lr_curve,
                          ref_discount_rews, ref_slow_down)
def launch(pa, pg_resume=None, render=False, repre='image', end='no_new_job'):

    # ----------------------------
    print("Preparing for workers...")
    # ----------------------------

    envs = []

    nw_len_seqs, nw_size_seqs = job_distribution.generate_sequence_work(
        pa, seed=42)

    for ex in range(pa.num_ex):
        print("-prepare for env-", ex)

        env = environment.Env(pa, nw_len_seqs=nw_len_seqs,
                              nw_size_seqs=nw_size_seqs,
                              render=False, repre=repre, end=end)
        env.seq_no = ex
        envs.append(env)

    print("-prepare for worker-")
    rl = Policy_Network.PolicyGradient(
        n_actions=pa.network_output_dim,
        n_features=pa.network_input_height * pa.network_input_width,
        learning_rate=0.02)
    print("policy network params count: ", rl.get_num_params())

    if pg_resume is not None:
        rl.load_data(pg_resume)

    # --------------------------------------
    print("Preparing for reference data...")
    # --------------------------------------

    ref_discount_rews, ref_slow_down = slow_down_cdf.launch(
        pa, pg_resume=None, render=False, plot=False, repre=repre, end=end)

    mean_rew_lr_curve = []
    max_rew_lr_curve = []
    slow_down_lr_curve = []

    # --------------------------------------
    print("Start training...")
    # --------------------------------------

    timer_start = time.time()

    for iteration in range(1, pa.num_epochs):

        ex_indices = list(range(pa.num_ex))
        np.random.shuffle(ex_indices)

        eprewlist = []
        eplenlist = []
        slowdownlist = []
        losslist = []

        ex_counter = 0
        for ex in range(pa.num_ex):
            ex_idx = ex_indices[ex]

            eprew, eplen, slowdown, all_ob, all_action, all_adv = \
                get_traj_worker(rl, envs[ex_idx], pa)
            eprewlist.append(eprew)
            eplenlist.append(eplen)
            slowdownlist.append(slowdown)

            # update the policy after every example (no cross-worker
            # gradient assembly in this single-agent variant)
            loss = rl.learn(all_ob, all_action, all_adv)
            losslist.append(loss)

            ex_counter += 1
            if ex_counter >= pa.batch_size or ex == pa.num_ex - 1:
                print("\n\n")
                ex_counter = 0

        timer_end = time.time()

        print("-----------------")
        print("Iteration: \t %i" % iteration)
        print("NumTrajs: \t %i" % len(eprewlist))
        print("NumTimesteps: \t %i" % np.sum(eplenlist))
        print("Loss: \t %s" % np.mean(losslist))
        print("MaxRew: \t %s" % np.average([np.max(rew) for rew in eprewlist]))
        print("MeanRew: \t %s +- %s" % (np.mean(eprewlist), np.std(eprewlist)))
        print("MeanSlowdown: \t %s" % np.mean([np.mean(sd) for sd in slowdownlist]))
        print("MeanLen: \t %s +- %s" % (np.mean(eplenlist), np.std(eplenlist)))
        print("Elapsed time\t %s" % (timer_end - timer_start), "seconds")
        print("-----------------")

        timer_start = time.time()

        max_rew_lr_curve.append(np.average([np.max(rew) for rew in eprewlist]))
        mean_rew_lr_curve.append(np.mean(eprewlist))
        slow_down_lr_curve.append(np.mean([np.mean(sd) for sd in slowdownlist]))

        if iteration % pa.output_freq == 0:
            rl.save_data(pa.output_filename + '_' + str(iteration))

            pa.unseen = True
            # slow_down_cdf.launch(pa, pa.output_filename + '_' + str(iteration) + '.ckpt',
            #                      render=False, plot=True, repre=repre, end=end)
            pa.unseen = False  # test on unseen examples

            plot_lr_curve(pa.output_filename,
                          max_rew_lr_curve, mean_rew_lr_curve,
                          slow_down_lr_curve,
                          ref_discount_rews, ref_slow_down)
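# A minimal sketch (an assumption, not the repo's actual Policy_Network code)
# of the objective rl.learn(all_ob, all_action, all_adv) is presumed to
# minimize: the REINFORCE loss, i.e. the negative log-likelihood of the
# chosen actions weighted by their advantage estimates. In TF1-style
# pseudocode:
#
#   neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
#       logits=act_logits, labels=tf_acts)        # -log pi(a_t | s_t)
#   loss = tf.reduce_mean(neg_log_prob * tf_adv)  # weight by advantage
#   train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)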
def launch(pa, pg_resume=None, render=True, repre='image', end='no_new_job'):

    f = open('log/re_log_' + datetime.now().strftime('%Y-%m-%d_%H:%M:%S'), 'a')

    env = environment.Env(pa, render=render, repre=repre, end=end)

    pg_learner = pg_network.PGLearner(pa)

    startIdx = 0
    if pg_resume is not None:  # and 're' in pg_resume:
        net_handle = open(pg_resume, 'rb')
        net_params = cPickle.load(net_handle)
        pg_learner.set_net_params(net_params)
        # recover the iteration index embedded in the checkpoint filename
        tmp = re.match(r'.+?(\d+).+', pg_resume)
        startIdx = int(tmp.group(1))

    # ----------------------------
    print("\nPreparing for data...")
    # ----------------------------

    ref_discount_rews, ref_slow_down = slow_down_cdf.launch(
        pa, pg_resume=None, render=True, plot=False, repre=repre, end=end)

    mean_rew_lr_curve = []
    max_rew_lr_curve = []
    slow_down_lr_curve = []

    timer_start = time.time()

    print("\nStart reinforcement learning...")
    for iteration in xrange(startIdx, pa.num_epochs):

        all_ob = []
        all_action = []
        all_adv = []
        all_eprews = []
        all_eplens = []
        all_slowdown = []
        all_entropy = []

        # go through all examples
        for ex in xrange(pa.num_ex):

            # Collect trajectories until we get timesteps_per_batch total timesteps
            trajs = []
            for i in xrange(pa.num_seq_per_batch):
                traj = get_traj(pg_learner, env, pa.episode_max_length)
                trajs.append(traj)

            # roll to next example
            env.seq_no = (env.seq_no + 1) % env.pa.num_ex

            all_ob.append(concatenate_all_ob(trajs, pa))

            # Compute discounted sums of rewards
            rets = [discount(traj["reward"], pa.discount) for traj in trajs]
            maxlen = max(len(ret) for ret in rets)
            padded_rets = [np.concatenate([ret, np.zeros(maxlen - len(ret))])
                           for ret in rets]

            # Compute time-dependent baseline
            baseline = np.mean(padded_rets, axis=0)

            # Compute advantage function
            advs = [ret - baseline[:len(ret)] for ret in rets]

            all_action.append(np.concatenate([traj["action"] for traj in trajs]))
            all_adv.append(np.concatenate(advs))

            all_eprews.append(
                np.array([discount(traj["reward"], pa.discount)[0]
                          for traj in trajs]))  # episode total rewards
            all_eplens.append(np.array([len(traj["reward"])
                                        for traj in trajs]))  # episode lengths

            # All Job Stat
            enter_time, finish_time, job_len = process_all_info(trajs)
            finished_idx = (finish_time >= 0)
            all_slowdown.append(
                (finish_time[finished_idx] - enter_time[finished_idx]) /
                job_len[finished_idx])

            # Action prob entropy
            # (was np.concatenate([traj["entropy"]]), which only picked up the
            # last trajectory; collect entropy from every trajectory instead)
            all_entropy.append(np.concatenate([traj["entropy"]
                                               for traj in trajs]))

        all_ob = concatenate_all_ob_across_examples(all_ob, pa)
        all_action = np.concatenate(all_action)
        all_adv = np.concatenate(all_adv)

        # Do policy gradient update step
        loss = pg_learner.train(all_ob, all_action, all_adv)
        eprews = np.concatenate(all_eprews)  # episode total rewards
        eplens = np.concatenate(all_eplens)  # episode lengths

        all_slowdown = np.concatenate(all_slowdown)
        all_entropy = np.concatenate(all_entropy)

        timer_end = time.time()

        print "-----------------"
        print "Iteration: \t %i" % iteration
        print "NumTrajs: \t %i" % len(eprews)
        print "NumTimesteps: \t %i" % np.sum(eplens)
        print "Loss: \t %s" % loss
        print "MaxRew: \t %s" % np.average([np.max(rew) for rew in all_eprews])
        print "MeanRew: \t %s +- %s" % (eprews.mean(), eprews.std())
        print "MeanSlowdown: \t %s" % np.mean(all_slowdown)
        print "MeanLen: \t %s +- %s" % (eplens.mean(), eplens.std())
        print "MeanEntropy \t %s" % (np.mean(all_entropy))
        print "Elapsed time\t %s" % (timer_end - timer_start), "seconds"
        print "-----------------"

        f.write("-----------------\n")
        f.write("Iteration: \t %i\n" % (iteration))
        f.write("NumTrajs: \t %i\n" % (len(eprews)))
        f.write("NumTimesteps: \t %i\n" % (np.sum(eplens)))
        f.write("Loss: \t %s\n" % (loss))  # was "...".format(loss), which logged a literal %s
        f.write("MaxRew: \t %s\n" % (np.average([np.max(rew) for rew in all_eprews])))
        f.write("MeanRew: \t %s +- %s\n" % (np.mean(eprews), np.std(eprews)))
        f.write("MeanSlowdown: \t %s\n" % (np.mean(all_slowdown)))
        f.write("MeanLen: \t %s +- %s\n" % (np.mean(eplens), np.std(eplens)))
        f.write("MeanEntropy \t %s\n" % (np.mean(all_entropy)))
        f.write("Elapsed time\t %s seconds\n" % (timer_end - timer_start))
        f.write("-----------------\n")
        f.flush()  # was f.close(), which broke the writes on the next iteration

        timer_start = time.time()

        max_rew_lr_curve.append(np.average([np.max(rew) for rew in all_eprews]))
        mean_rew_lr_curve.append(eprews.mean())
        slow_down_lr_curve.append(np.mean(all_slowdown))

        if iteration % pa.output_freq == 0:
            param_file = open(
                pa.output_filename + '_' + str(iteration) + '.pkl', 'wb')
            cPickle.dump(pg_learner.get_params(), param_file, -1)
            param_file.close()

            # added by wjchen, to record accuracy and rewards
            sample_file = h5py.File(
                'log/re_record_iter' + str(len(slow_down_lr_curve)) +
                datetime.now().strftime('%Y-%m-%d_%H:%M') + '.h5', 'w')
            sample_file.create_dataset('max_rew_lr_curve', data=max_rew_lr_curve)
            sample_file.create_dataset('mean_rew_lr_curve', data=mean_rew_lr_curve)
            sample_file.create_dataset('slow_down_lr_curve', data=slow_down_lr_curve)

            ref_dr = sample_file.create_group('ref_discount_rews')
            for k, v in ref_discount_rews.items():
                ref_dr[k] = np.average(v)

            ref_sd = sample_file.create_group('ref_slow_down')
            for k, v in ref_slow_down.items():
                ref_sd[k] = np.average(np.concatenate(v))
            sample_file.close()

            slow_down_cdf.launch(pa,
                                 pa.output_filename + '_' + str(iteration) + '.pkl',
                                 render=True, plot=True, repre=repre, end=end)

            plot_lr_curve(pa.output_filename,
                          max_rew_lr_curve, mean_rew_lr_curve,
                          slow_down_lr_curve,
                          ref_discount_rews, ref_slow_down)

    f.close()
import slow_down_cdf
import parameters
import numpy as np

pa = parameters.Parameters()
pa.simu_len = 20
pa.num_ex = 5

ref_rewards, ref_slow_down = slow_down_cdf.launch(pa, render=False)

print '\n---------- Total Discount Rewards ----------'
print 'Random2: ' + str(np.average(ref_rewards['Random2']))
print 'SJF2: ' + str(np.average(ref_rewards['SJF2']))
print 'Packer2: ' + str(np.average(ref_rewards['Packer2']))
print 'Tetris2: ' + str(np.average(ref_rewards['Tetris2']))

# print sd[1]['Random2']
# print np.average(np.concatenate(sd[1]['Random2']))

print '\n---------- Average Job Slowdown ----------'
print 'Random2: ' + str(np.average(np.concatenate(ref_slow_down['Random2'])))
print 'SJF2: ' + str(np.average(np.concatenate(ref_slow_down['SJF2'])))
print 'Packer2: ' + str(np.average(np.concatenate(ref_slow_down['Packer2'])))
print 'Tetris2: ' + str(np.average(np.concatenate(ref_slow_down['Tetris2'])))
def launch(pa, pg_resume=None, render=False, repre='image', end='no_new_job'):

    # ----------------------------
    print("Preparing for workers...")
    # ----------------------------

    pg_learners = []
    envs = []

    nw_len_seqs, nw_size_seqs = job_distribution.generate_sequence_work(
        pa, seed=42)

    for ex in xrange(pa.num_ex):  # number of sequences
        print "-prepare for env-", ex

        env = environment.Env(pa, nw_len_seqs=nw_len_seqs,
                              nw_size_seqs=nw_size_seqs,
                              render=True, repre=repre, end=end)
        env.seq_no = ex
        envs.append(env)

    startIndex = 0
    for ex in xrange(pa.batch_size + 1):  # last worker for updating the parameters
        print "-prepare for worker-", ex

        pg_learner = pg_network.PGLearner(pa)

        if pg_resume is not None:
            net_handle = open(pg_resume, 'rb')
            net_params = cPickle.load(net_handle)
            pg_learner.set_net_params(net_params)
            # recover the iteration index embedded in the checkpoint filename
            # (the original called re.match(pg_resume, '\d+'), i.e. with the
            # pattern and string arguments swapped)
            startIndex = int(re.match(r'.+?(\d+).+', pg_resume).group(1))

        pg_learners.append(pg_learner)

    accums = init_accums(pg_learners[pa.batch_size])

    # --------------------------------------
    print("Preparing for reference data...")
    # --------------------------------------

    # Reference examples: get reference discounted rewards and reference
    # slowdown from the Random, SJF, and Tetris baseline algorithms
    ref_discount_rews, ref_slow_down = slow_down_cdf.launch(
        pa, pg_resume=None, render=True, plot=False, repre=repre, end=end)

    mean_rew_lr_curve = []
    max_rew_lr_curve = []
    slow_down_lr_curve = []

    # --------------------------------------
    print("Start training...")
    # --------------------------------------

    timer_start = time.time()

    for iteration in xrange(startIndex, pa.num_epochs):

        ps = []  # processes
        manager = Manager()  # managing return results
        manager_result = manager.list([])

        ex_indices = range(pa.num_ex)
        np.random.shuffle(ex_indices)

        all_eprews = []
        grads_all = []
        loss_all = []
        eprews = []
        eplens = []
        all_slowdown = []
        all_entropy = []

        ex_counter = 0
        for ex in xrange(pa.num_ex):

            ex_idx = ex_indices[ex]

            p = Process(target=get_traj_worker,
                        args=(pg_learners[ex_counter], envs[ex_idx], pa,
                              manager_result, ))
            ps.append(p)

            ex_counter += 1

            # accumulate Processes in ps until a full batch is ready
            if ex_counter >= pa.batch_size or ex == pa.num_ex - 1:

                print ex + 1, "out of", pa.num_ex

                ex_counter = 0

                for p in ps:
                    p.start()

                for p in ps:
                    p.join()

                result = []  # convert list from shared memory
                for r in manager_result:
                    result.append(r)

                ps = []
                manager_result = manager.list([])

                all_ob = concatenate_all_ob_across_examples(
                    [r["all_ob"] for r in result], pa)
                all_action = np.concatenate([r["all_action"] for r in result])
                all_adv = np.concatenate([r["all_adv"] for r in result])

                # Do policy gradient update step: accumulate the gradients on
                # the last 'worker', then propagate the update at the end
                grads = pg_learners[pa.batch_size].get_grad(
                    all_ob, all_action, all_adv)  # (states, actions, values)

                grads_all.append(grads)

                all_eprews.extend([r["all_eprews"] for r in result])

                eprews.extend(np.concatenate([r["all_eprews"] for r in
                                              result]))  # episode total rewards
                eplens.extend(np.concatenate([r["all_eplens"] for r in
                                              result]))  # episode lengths

                all_slowdown.extend(
                    np.concatenate([r["all_slowdown"] for r in result]))
                all_entropy.extend(
                    np.concatenate([r["all_entropy"] for r in result]))

        # assemble gradients
        grads = grads_all[0]
        for i in xrange(1, len(grads_all)):
            for j in xrange(len(grads)):
                grads[j] += grads_all[i][j]

        # propagate network parameters to others
        params = pg_learners[pa.batch_size].get_params()

        rmsprop_updates_outside(grads, params, accums, pa.lr_rate,
                                pa.rms_rho, pa.rms_eps)

        for i in xrange(pa.batch_size + 1):
            pg_learners[i].set_net_params(params)

        timer_end = time.time()

        print "-----------------"
        print "Iteration: \t %i" % iteration
        print "NumTrajs: \t %i" % len(eprews)
        print "NumTimesteps: \t %i" % np.sum(eplens)
        # print "Loss: \t %s" % np.mean(loss_all)
        print "MaxRew: \t %s" % np.average([np.max(rew) for rew in all_eprews])
        print "MeanRew: \t %s +- %s" % (np.mean(eprews), np.std(eprews))
        print "MeanSlowdown: \t %s" % np.mean(all_slowdown)
        print "MeanLen: \t %s +- %s" % (np.mean(eplens), np.std(eplens))
        print "MeanEntropy \t %s" % (np.mean(all_entropy))
        print "Elapsed time\t %s" % (timer_end - timer_start), "seconds"
        print "-----------------"

        f = open('log/re_log_' + datetime.now().strftime('%Y-%m-%d_%H:%M:%S'), 'w+')
        f.write("-----------------\n")
        f.write("Iteration: \t %i\n" % (iteration))
        f.write("NumTrajs: \t %i\n" % (len(eprews)))
        f.write("NumTimesteps: \t %i\n" % (np.sum(eplens)))
        # f.write("Loss: \t %s\n" % np.mean(loss_all))
        f.write("MaxRew: \t %s\n" % (np.average([np.max(rew) for rew in all_eprews])))
        f.write("MeanRew: \t %s +- %s\n" % (np.mean(eprews), np.std(eprews)))
        f.write("MeanSlowdown: \t %s\n" % (np.mean(all_slowdown)))
        f.write("MeanLen: \t %s +- %s\n" % (np.mean(eplens), np.std(eplens)))
        f.write("MeanEntropy \t %s\n" % (np.mean(all_entropy)))
        f.write("Elapsed time\t %s seconds\n" % (timer_end - timer_start))
        f.write("-----------------\n")
        f.close()

        timer_start = time.time()

        max_rew_lr_curve.append(np.average([np.max(rew) for rew in all_eprews]))
        mean_rew_lr_curve.append(np.mean(eprews))
        slow_down_lr_curve.append(np.mean(all_slowdown))

        if iteration % pa.output_freq == 0:
            param_file = open(
                pa.output_filename + '_' + str(iteration) + '.pkl', 'wb')
            cPickle.dump(pg_learners[pa.batch_size].get_params(), param_file, -1)
            param_file.close()

            # added by wjchen, to record accuracy and rewards
            sample_file = h5py.File(
                'log/re_record' + str(len(slow_down_lr_curve)) +
                datetime.now().strftime('%Y-%m-%d_%H:%M') + '.h5', 'w')
            sample_file.create_dataset('max_rew_lr_curve', data=max_rew_lr_curve)
            sample_file.create_dataset('mean_rew_lr_curve', data=mean_rew_lr_curve)
            sample_file.create_dataset('slow_down_lr_curve', data=slow_down_lr_curve)

            ref_dr = sample_file.create_group('ref_discount_rews')
            for k, v in ref_discount_rews.items():
                ref_dr[k] = np.average(v)

            ref_sd = sample_file.create_group('ref_slow_down')
            for k, v in ref_slow_down.items():
                ref_sd[k] = np.average(np.concatenate(v))
            sample_file.close()

            # print ref_slow_down
            # print ref_discount_rews

            print '\n----Reference Slowdown----'
            for k, v in ref_slow_down.items():
                print "{}: {}".format(k, np.average(np.concatenate(v)))

            print '\n----Reference Discount Reward----'
            for k, v in ref_discount_rews.items():
                print "{}: {}".format(k, np.average(v))

            pa.unseen = True
            slow_down_cdf.launch(pa,
                                 pa.output_filename + '_' + str(iteration) + '.pkl',
                                 render=True, plot=True, repre=repre, end=end)
            pa.unseen = False  # test on unseen examples

            plot_lr_curve(pa.output_filename,
                          max_rew_lr_curve, mean_rew_lr_curve,
                          slow_down_lr_curve,
                          ref_discount_rews, ref_slow_down)
            # plot averages of ref_discount_rews and ref_slow_down
def launch(pa, pg_resume=None, render=False, repre='image', end='no_new_job'):

    # ----------------------------
    print("Preparing for workers...")
    # ----------------------------

    envs = []

    nw_len_seqs, nw_size_seqs = job_distribution.generate_sequence_work(
        pa, seed=42)

    for ex in range(pa.num_ex):
        print("-prepare for env-", ex)

        env = environment.Env(pa, nw_len_seqs=nw_len_seqs,
                              nw_size_seqs=nw_size_seqs,
                              render=False, repre=repre, end=end)
        env.seq_no = ex
        envs.append(env)

    print("-prepare for worker-")
    sess = tf.Session()
    actor = actor_critic_brain.Actor(
        sess,
        n_features=pa.network_input_height * pa.network_input_width,
        n_actions=pa.network_output_dim,
        lr=0.001)
    critic = actor_critic_brain.Critic(
        sess,
        n_features=pa.network_input_height * pa.network_input_width,
        lr=0.01)
    sess.run(tf.global_variables_initializer())

    if pg_resume is not None:
        pass  # rl.load_data(pg_resume)

    # --------------------------------------
    print("Preparing for reference data...")
    # --------------------------------------

    ref_discount_rews, ref_slow_down = slow_down_cdf.launch(
        pa, pg_resume=None, render=False, plot=False, repre=repre, end=end)

    mean_rew_lr_curve = []
    max_rew_lr_curve = []
    slow_down_lr_curve = []

    # --------------------------------------
    print("Start training...")
    # --------------------------------------

    timer_start = time.time()

    for iteration in range(1, pa.num_epochs):

        ex_indices = list(range(pa.num_ex))
        np.random.shuffle(ex_indices)

        eprewlist = []
        eplenlist = []
        slowdownlist = []

        ex_counter = 0
        for ex in range(pa.num_ex):
            ex_idx = ex_indices[ex]

            eprew, eplen, slowdown = get_traj_worker(actor, critic,
                                                     envs[ex_idx], pa)
            eprewlist.append(eprew)
            eplenlist.append(eplen)
            slowdownlist.append(slowdown)

            ex_counter += 1
            if ex_counter >= pa.batch_size or ex == pa.num_ex - 1:
                print("\n\n")
                ex_counter = 0

        timer_end = time.time()

        print("-----------------")
        print("Iteration: \t %i" % iteration)
        # print("NumTrajs: \t %i" % len(eprewlist))
        print("NumTimesteps: \t %i" % np.sum(eplenlist))
        # print("MaxRew: \t %s" % np.average([np.max(rew) for rew in eprewlist]))
        # print("MeanRew: \t %s +- %s" % (np.mean(eprewlist), np.std(eprewlist)))
        print("MeanSlowdown: \t %s" % np.mean([np.mean(sd) for sd in slowdownlist]))
        print("MeanLen: \t %s +- %s" % (np.mean(eplenlist), np.std(eplenlist)))
        print("Elapsed time\t %s" % (timer_end - timer_start), "seconds")
        print("-----------------")

        timer_start = time.time()

        max_rew_lr_curve.append(np.average([np.max(rew) for rew in eprewlist]))
        mean_rew_lr_curve.append(np.mean(eprewlist))
        slow_down_lr_curve.append(np.mean([np.mean(sd) for sd in slowdownlist]))

        if iteration % pa.output_freq == 0:
            # rl.save_data(pa.output_filename + '_' + str(iteration))

            pa.unseen = True
            # slow_down_cdf.launch(pa, pa.output_filename + '_' + str(iteration) + '.ckpt',
            #                      render=False, plot=True, repre=repre, end=end)
            pa.unseen = False  # test on unseen examples

            plot_lr_curve(pa.output_filename,
                          max_rew_lr_curve, mean_rew_lr_curve,
                          slow_down_lr_curve,
                          ref_discount_rews, ref_slow_down)
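# A sketch (an assumption, not the repo's actual actor_critic_brain code) of
# the one-step update presumed inside get_traj_worker for this variant: the
# critic's TD error drives the actor's policy gradient step at each timestep.
#
#   td_error = critic.learn(s, r, s_)  # TD error = r + gamma * V(s_) - V(s)
#   actor.learn(s, a, td_error)        # step along grad log pi(a|s) * td_error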