def get_traj(test_type, pa, env, episode_max_length, pg_resume=None, render=False):
    """
    Run the agent-environment loop for one whole episode (trajectory).
    Return the per-step rewards and the final episode info.
    """

    if test_type == 'PG':  # load trained parameters
        pg_learner = pg_network.PGLearner(pa)

        net_handle = open(pg_resume, 'rb')
        net_params = cPickle.load(net_handle)
        pg_learner.set_net_params(net_params)

    env.reset()
    rews = []

    ob = env.observe()

    for _ in xrange(episode_max_length):

        if test_type == 'PG':
            a = pg_learner.choose_action(ob)

        elif test_type == 'Tetris':
            a = other_agents.get_packer_action(env.machines, env.job_slot)

        elif test_type == 'SJF':
            a = other_agents.get_sjf_action(env.machines, env.job_slot)

        elif test_type == 'Random':
            a = other_agents.get_random_action(env.machines, env.job_slot)

        elif test_type == 'SJF2':
            a = other_agents.get_sjf_action_for_multiple_machines(env.machines, env.job_slot)

        elif test_type == 'Packer2':
            a = other_agents.get_packer_action_for_multiple_machines(env.machines, env.job_slot)

        elif test_type == 'Tetris2':
            a = other_agents.get_packer_sjf_action_for_multiple_machines(env.machines, env.job_slot, 0.3)

        elif test_type == 'Random2':
            a = other_agents.get_random_action_for_multiple_machines(env.machines, env.job_slot)

        ob, rew, done, info = env.step(a, repeat=True)

        rews.append(rew)

        if done:
            break
        if render:
            env.render()

    return np.array(rews), info
def get_traj(test_type, pa, env, episode_max_length, pg_resume=None, render=False, q_agent=None):
    """
    Run the agent-environment loop for one whole episode (trajectory).
    Return the per-step rewards and the final episode info.
    """

    if test_type == 'PG':  # load trained parameters
        pg_learner = pg_network.PGLearner(pa)

        net_handle = open(pg_resume, 'rb')
        net_params = cPickle.load(net_handle)
        pg_learner.set_net_params(net_params)

    elif test_type == 'Q':  # Q network: the trained agent must be passed in
        assert q_agent is not None

    env.reset()
    rews = []

    ob = env.observe()

    for _ in xrange(episode_max_length):

        if test_type == 'PG':
            a = pg_learner.choose_action(ob)

        elif test_type == 'Random':
            a = other_agents.get_random_action(env.machine)

        elif test_type == 'LLQ':
            a = other_agents.get_llq_action(env.machine)

        elif test_type == 'Q':
            state = np.array(list(np.array(ob).flat))
            a = q_agent.greedy_policy(state[np.newaxis, :])

        ob, rew, done, info = env.step(a, repeat=True)

        rews.append(rew)

        if done:
            break
        if render:
            env.render()

    return np.array(rews), info
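# ---------------------------------------------------------------------------
# Usage sketch (hypothetical): a small driver for the trajectory runners
# above, comparing a trained PG policy against two heuristics. The checkpoint
# path and the chosen test types are placeholders; `parameters.Parameters()`
# and `environment.Env` follow their usage elsewhere in this section.
# ---------------------------------------------------------------------------
def demo_compare_agents(pg_resume='data/pg_re_1000.pkl'):
    pa = parameters.Parameters()
    env = environment.Env(pa, render=False, repre='image', end='no_new_job')

    for test_type in ['PG', 'SJF', 'Random']:
        rews, info = get_traj(test_type, pa, env, pa.episode_max_length,
                              pg_resume=pg_resume)
        print("%s: total reward %s over %d steps" % (test_type, rews.sum(), len(rews)))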
def launch(pa, pg_resume=None, render=False, repre='image', end='no_new_job'):

    # ----------------------------
    print("Preparing for workers...")
    # ----------------------------

    pg_learners = []
    envs = []

    nw_len_seqs, nw_size_seqs = job_distribution.generate_sequence_work(pa, seed=42)

    # create one environment per job set/sequence (num_ex of them)
    for ex in xrange(pa.num_ex):
        print "-prepare for env-", ex

        env = environment.Env(pa, nw_len_seqs=nw_len_seqs, nw_size_seqs=nw_size_seqs,
                              render=False, repre=repre, end=end)
        env.seq_no = ex
        envs.append(env)

    # create one policy gradient agent per batch slot;
    # the last one holds the master parameters
    for ex in xrange(pa.batch_size + 1):  # last worker for updating the parameters
        print "-prepare for worker-", ex

        pg_learner = pg_network.PGLearner(pa)

        if pg_resume is not None:
            net_handle = open(pg_resume, 'rb')
            net_params = cPickle.load(net_handle)
            pg_learner.set_net_params(net_params)

        pg_learners.append(pg_learner)

    accums = init_accums(pg_learners[pa.batch_size])

    # --------------------------------------
    print("Preparing for reference data...")
    # --------------------------------------

    ref_discount_rews, ref_slow_down = slow_down_cdf.launch(pa, pg_resume=None, render=False,
                                                            plot=False, repre=repre, end=end)

    mean_rew_lr_curve = []
    max_rew_lr_curve = []
    slow_down_lr_curve = []

    # --------------------------------------
    print("Start training...")
    # --------------------------------------

    timer_start = time.time()

    for iteration in xrange(1, pa.num_epochs):

        # one process per batch slot; a Manager list shares results across processes
        ps = []                        # processes
        manager = Manager()            # managing return results
        manager_result = manager.list([])

        ex_indices = range(pa.num_ex)
        np.random.shuffle(ex_indices)

        all_eprews = []
        grads_all = []
        loss_all = []
        eprews = []
        eplens = []
        all_slowdown = []
        all_entropy = []

        ex_counter = 0
        for ex in xrange(pa.num_ex):  # for each job set

            ex_idx = ex_indices[ex]

            # evaluate several trajectories with this slot's PG agent
            p = Process(target=get_traj_worker,
                        args=(pg_learners[ex_counter], envs[ex_idx], pa, manager_result,))
            ps.append(p)

            ex_counter += 1

            if ex_counter >= pa.batch_size or ex == pa.num_ex - 1:
                print ex, "out of", pa.num_ex

                ex_counter = 0

                for p in ps:
                    p.start()
                for p in ps:
                    p.join()

                result = []  # convert list from shared memory
                for r in manager_result:
                    result.append(r)

                ps = []
                manager_result = manager.list([])

                all_ob = concatenate_all_ob_across_examples([r["all_ob"] for r in result], pa)
                all_action = np.concatenate([r["all_action"] for r in result])
                all_adv = np.concatenate([r["all_adv"] for r in result])

                # compute the gradient with the last 'worker'; the summed update
                # is propagated to all workers at the end of the iteration
                grads = pg_learners[pa.batch_size].get_grad(all_ob, all_action, all_adv)

                grads_all.append(grads)

                all_eprews.extend([r["all_eprews"] for r in result])

                eprews.extend(np.concatenate([r["all_eprews"] for r in result]))  # episode total rewards
                eplens.extend(np.concatenate([r["all_eplens"] for r in result]))  # episode lengths

                all_slowdown.extend(np.concatenate([r["all_slowdown"] for r in result]))
                all_entropy.extend(np.concatenate([r["all_entropy"] for r in result]))

        # assemble gradients
        grads = grads_all[0]
        for i in xrange(1, len(grads_all)):
            for j in xrange(len(grads)):
                grads[j] += grads_all[i][j]

        # propagate network parameters to others
        params = pg_learners[pa.batch_size].get_params()

        rmsprop_updates_outside(grads, params, accums, pa.lr_rate, pa.rms_rho, pa.rms_eps)

        for i in xrange(pa.batch_size + 1):
            pg_learners[i].set_net_params(params)

        timer_end = time.time()

        print "-----------------"
        print "Iteration: \t %i" % iteration
        print "NumTrajs: \t %i" % len(eprews)
        print "NumTimesteps: \t %i" % np.sum(eplens)
        # print "Loss: \t %s" % np.mean(loss_all)
        print "MaxRew: \t %s" % np.average([np.max(rew) for rew in all_eprews])
        print "MeanRew: \t %s +- %s" % (np.mean(eprews), np.std(eprews))
        print "MeanSlowdown: \t %s" % np.mean(all_slowdown)
        print "MeanLen: \t %s +- %s" % (np.mean(eplens), np.std(eplens))
        print "MeanEntropy \t %s" % (np.mean(all_entropy))
        print "Elapsed time\t %s" % (timer_end - timer_start), "seconds"
        print "-----------------"

        timer_start = time.time()

        max_rew_lr_curve.append(np.average([np.max(rew) for rew in all_eprews]))
        mean_rew_lr_curve.append(np.mean(eprews))
        slow_down_lr_curve.append(np.mean(all_slowdown))

        if iteration % pa.output_freq == 0:
            param_file = open(pa.output_filename + '_' + str(iteration) + '.pkl', 'wb')
            cPickle.dump(pg_learners[pa.batch_size].get_params(), param_file, -1)
            param_file.close()

            pa.unseen = True  # test on unseen examples
            slow_down_cdf.launch(pa, pa.output_filename + '_' + str(iteration) + '.pkl',
                                 render=False, plot=True, repre=repre, end=end)
            pa.unseen = False

            plot_lr_curve(pa.output_filename,
                          max_rew_lr_curve, mean_rew_lr_curve, slow_down_lr_curve,
                          ref_discount_rews, ref_slow_down)
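# ---------------------------------------------------------------------------
# Sketch: `init_accums` and `rmsprop_updates_outside` are called above but not
# defined in this section. The following is a minimal sketch consistent with
# that call signature, not necessarily the original implementation: one
# RMSProp accumulator per parameter array, updated in place outside any single
# learner. The ascent direction assumes the PG gradients point toward higher
# expected reward.
# ---------------------------------------------------------------------------
def init_accums(pg_learner):
    # one squared-gradient accumulator per parameter array, initialized to zero
    accums = []
    for param in pg_learner.get_params():
        accums.append(np.zeros(param.shape, dtype=param.dtype))
    return accums


def rmsprop_updates_outside(grads, params, accums, stepsize, rho=0.9, epsilon=1e-9):
    # in-place RMSProp step; mutates both `accums` and `params`
    assert len(grads) == len(params) == len(accums)
    for i in range(len(grads)):
        accums[i] = rho * accums[i] + (1 - rho) * grads[i] ** 2
        params[i] += stepsize * grads[i] / np.sqrt(accums[i] + epsilon)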
def launch(pa, pg_resume=None, render=False, repre='image', end='no_new_job'):

    env = environment.Env(pa, render=False, repre=repre, end=end)

    pg_learner = pg_network.PGLearner(pa)

    if pg_resume is not None:
        net_handle = open(pg_resume, 'rb')  # binary mode: the checkpoints are binary pickles
        net_params = cPickle.load(net_handle)
        pg_learner.set_net_params(net_params)

    if pa.evaluate_policy_name == "SJF":
        evaluate_policy = other_agents.get_sjf_action
    elif pa.evaluate_policy_name == "PACKER":
        evaluate_policy = other_agents.get_packer_action
    else:
        print("Panic: no policy known to evaluate.")
        exit(1)

    # ----------------------------
    print("Preparing for data...")
    # ----------------------------

    nw_len_seqs, nw_size_seqs = job_distribution.generate_sequence_work(pa, seed=42)

    # print 'nw_time_seqs=', nw_len_seqs
    # print 'nw_size_seqs=', nw_size_seqs

    mem_alloc = 4

    X = np.zeros([pa.simu_len * pa.num_ex * mem_alloc, 1,
                  pa.network_input_height, pa.network_input_width],
                 dtype=theano.config.floatX)
    y = np.zeros(pa.simu_len * pa.num_ex * mem_alloc, dtype='int32')

    print 'network_input_height=', pa.network_input_height
    print 'network_input_width=', pa.network_input_width

    counter = 0

    for train_ex in range(pa.num_ex):

        env.reset()

        for _ in xrange(pa.episode_max_length):

            # ---- get current state ----
            ob = env.observe()

            a = evaluate_policy(env.machine, env.job_slot)

            if counter < pa.simu_len * pa.num_ex * mem_alloc:
                add_sample(X, y, counter, ob, a)
                counter += 1

            ob, rew, done, info = env.step(a, repeat=True)

            if done:  # hit void action, exit
                break

        # roll to next example
        env.seq_no = (env.seq_no + 1) % env.pa.num_ex

    num_train = int(0.8 * counter)
    num_test = int(0.2 * counter)

    X_train, X_test = X[:num_train], X[num_train: num_train + num_test]
    y_train, y_test = y[:num_train], y[num_train: num_train + num_test]

    # Normalization, make sure nothing becomes NaN
    # X_mean = np.average(X[:num_train + num_test], axis=0)
    # X_std = np.std(X[:num_train + num_test], axis=0)
    # X_train = (X_train - X_mean) / X_std
    # X_test = (X_test - X_mean) / X_std

    # ----------------------------
    print("Start training...")
    # ----------------------------

    for epoch in xrange(pa.num_epochs):

        # In each epoch, do a full pass over the training data:
        train_err = 0
        train_acc = 0
        train_batches = 0
        start_time = time.time()
        for batch in iterate_minibatches(X_train, y_train, pa.batch_size, shuffle=True):
            inputs, targets = batch
            err, prob_act = pg_learner.su_train(inputs, targets)
            pg_act = np.argmax(prob_act, axis=1)
            train_err += err
            train_acc += np.sum(pg_act == targets)
            train_batches += 1

        # And a full pass over the test data:
        test_err = 0
        test_acc = 0
        test_batches = 0
        for batch in iterate_minibatches(X_test, y_test, pa.batch_size, shuffle=False):
            inputs, targets = batch
            err, prob_act = pg_learner.su_test(inputs, targets)
            pg_act = np.argmax(prob_act, axis=1)
            test_err += err
            test_acc += np.sum(pg_act == targets)
            test_batches += 1

        # Then print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(epoch + 1, pa.num_epochs,
                                                   time.time() - start_time))
        print("  training loss:    \t\t{:.6f}".format(train_err / train_batches))
        print("  training accuracy:\t\t{:.2f} %".format(train_acc / float(num_train) * 100))
        print("  test loss:        \t\t{:.6f}".format(test_err / test_batches))
        print("  test accuracy:    \t\t{:.2f} %".format(test_acc / float(num_test) * 100))

        sys.stdout.flush()

        if epoch % pa.output_freq == 0:
            net_file = open(pa.output_filename + '_net_file_' + str(epoch) + '.pkl', 'wb')
            cPickle.dump(pg_learner.return_net_params(), net_file, -1)
            net_file.close()

    print("done")
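# ---------------------------------------------------------------------------
# Sketch: `add_sample` and `iterate_minibatches` are referenced above but not
# defined here. These are minimal versions consistent with their use (store
# one (observation, action) supervision pair per slot; yield mini-batches,
# dropping the final partial batch), not necessarily the originals.
# ---------------------------------------------------------------------------
def add_sample(X, y, idx, ob, a):
    # store one (observation, action) pair at slot idx
    X[idx, 0, :, :] = ob
    y[idx] = a


def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
    # yield (inputs, targets) mini-batches of size `batchsize`
    assert len(inputs) == len(targets)
    if shuffle:
        indices = np.arange(len(inputs))
        np.random.shuffle(indices)
    for start_idx in range(0, len(inputs) - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        yield inputs[excerpt], targets[excerpt]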
def launch(pa, pg_resume=None, render=False, repre='image', end='no_new_job'):

    env = environment.Env(pa, render=render, repre=repre, end=end)

    pg_learner = pg_network.PGLearner(pa)

    if pg_resume is not None:
        net_handle = open(pg_resume, 'rb')
        net_params = cPickle.load(net_handle)
        pg_learner.set_net_params(net_params)

    # ----------------------------
    print("Preparing for data...")
    # ----------------------------

    ref_discount_rews, ref_slow_down = slow_down_cdf.launch(pa, pg_resume=None, render=False,
                                                            plot=False, repre=repre, end=end)

    mean_rew_lr_curve = []
    max_rew_lr_curve = []
    slow_down_lr_curve = []

    timer_start = time.time()

    for iteration in range(pa.num_epochs):

        all_ob = []
        all_action = []
        all_adv = []
        all_eprews = []
        all_eplens = []
        all_slowdown = []
        all_entropy = []

        # go through all examples
        for ex in range(pa.num_ex):

            # Collect trajectories until we get timesteps_per_batch total timesteps
            trajs = []
            for i in range(pa.num_seq_per_batch):
                traj = get_traj(pg_learner, env, pa.episode_max_length)
                trajs.append(traj)

            # roll to next example
            env.seq_no = (env.seq_no + 1) % env.pa.num_ex

            all_ob.append(concatenate_all_ob(trajs, pa))

            # Compute discounted sums of rewards
            rets = [discount(traj["reward"], pa.discount) for traj in trajs]
            maxlen = max(len(ret) for ret in rets)
            padded_rets = [np.concatenate([ret, np.zeros(maxlen - len(ret))]) for ret in rets]

            # Compute time-dependent baseline
            baseline = np.mean(padded_rets, axis=0)

            # Compute advantage function
            advs = [ret - baseline[:len(ret)] for ret in rets]

            all_action.append(np.concatenate([traj["action"] for traj in trajs]))
            all_adv.append(np.concatenate(advs))

            all_eprews.append(np.array([discount(traj["reward"], pa.discount)[0]
                                        for traj in trajs]))  # episode total rewards
            all_eplens.append(np.array([len(traj["reward"]) for traj in trajs]))  # episode lengths

            # All Job Stat
            enter_time, finish_time, job_len = process_all_info(trajs)
            finished_idx = (finish_time >= 0)
            all_slowdown.append((finish_time[finished_idx] - enter_time[finished_idx]) /
                                job_len[finished_idx])

            # Action prob entropy: gather all trajectories
            # (the original only picked up the last one)
            all_entropy.append(np.concatenate([traj["entropy"] for traj in trajs]))

        all_ob = concatenate_all_ob_across_examples(all_ob, pa)
        all_action = np.concatenate(all_action)
        all_adv = np.concatenate(all_adv)

        # Do policy gradient update step
        loss = pg_learner.train(all_ob, all_action, all_adv)
        eprews = np.concatenate(all_eprews)  # episode total rewards
        eplens = np.concatenate(all_eplens)  # episode lengths

        all_slowdown = np.concatenate(all_slowdown)
        all_entropy = np.concatenate(all_entropy)

        timer_end = time.time()

        print "-----------------"
        print "Iteration: \t %i" % iteration
        print "NumTrajs: \t %i" % len(eprews)
        print "NumTimesteps: \t %i" % np.sum(eplens)
        print "Loss: \t %s" % loss
        print "MaxRew: \t %s" % np.average([np.max(rew) for rew in all_eprews])
        print "MeanRew: \t %s +- %s" % (eprews.mean(), eprews.std())
        print "MeanSlowdown: \t %s" % np.mean(all_slowdown)
        print "MeanLen: \t %s +- %s" % (eplens.mean(), eplens.std())
        print "MeanEntropy \t %s" % (np.mean(all_entropy))
        print "Elapsed time\t %s" % (timer_end - timer_start), "seconds"
        print "-----------------"

        timer_start = time.time()

        max_rew_lr_curve.append(np.average([np.max(rew) for rew in all_eprews]))
        mean_rew_lr_curve.append(eprews.mean())
        slow_down_lr_curve.append(np.mean(all_slowdown))

        if iteration % pa.output_freq == 0:
            param_file = open(pa.output_filename + '_' + str(iteration) + '.pkl', 'wb')
            cPickle.dump(pg_learner.get_params(), param_file, -1)
            param_file.close()

            slow_down_cdf.launch(pa, pa.output_filename + '_' + str(iteration) + '.pkl',
                                 render=False, plot=True, repre=repre, end=end)

            plot_lr_curve(pa.output_filename,
                          max_rew_lr_curve, mean_rew_lr_curve, slow_down_lr_curve,
                          ref_discount_rews, ref_slow_down)
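# ---------------------------------------------------------------------------
# Sketch: `discount` and `process_all_info` are used above but defined
# elsewhere. These are minimal versions consistent with that usage: reverse-
# accumulated discounted returns, and per-job enter/finish/length arrays. The
# `traj["info"].record` layout is an assumption inferred from the callers.
# ---------------------------------------------------------------------------
def discount(x, gamma):
    # y[i] = x[i] + gamma * x[i+1] + gamma^2 * x[i+2] + ... (reverse accumulation)
    out = np.zeros(len(x))
    out[-1] = x[-1]
    for i in reversed(range(len(x) - 1)):
        out[i] = x[i] + gamma * out[i + 1]
    return out


def process_all_info(trajs):
    # flatten the per-job statistics of every trajectory into three arrays
    enter_time, finish_time, job_len = [], [], []
    for traj in trajs:
        record = traj["info"].record
        enter_time.append(np.array([record[i].enter_time for i in range(len(record))]))
        finish_time.append(np.array([record[i].finish_time for i in range(len(record))]))
        job_len.append(np.array([record[i].len for i in range(len(record))]))
    return (np.concatenate(enter_time),
            np.concatenate(finish_time),
            np.concatenate(job_len))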
def launch(pa, pg_resume=None, render=False, repre='image', end='no_new_job'):

    task_dist = Task_Dist()
    workloads = task_dist.gen_seq_workload()

    # ----------------------------
    print("Preparing for workers...")
    # ----------------------------
    # logs = open('/home/shanka/logs_packing_deeprm', 'a')

    pg_learners = []
    envs = []

    job_distribution = Dist()
    nw_len_seqs, nw_size_seqs = job_distribution.generate_sequence_work(pa, seed=42)

    for ex in range(pa.num_ex):
        print("-prepare for env-", ex)

        env = environment.Env(pa, nw_len_seqs=nw_len_seqs, nw_size_seqs=nw_size_seqs,
                              render=False, repre=repre, end=end)
        env.seq_no = ex
        envs.append(env)

    for ex in range(pa.batch_size + 1):  # last worker for updating the parameters
        print("-prepare for worker-", ex)

        pg_learner = pg_network.PGLearner(pa)

        if pg_resume is not None:
            net_handle = open(pg_resume, 'rb')
            net_params = pickle.load(net_handle)
            pg_learner.set_net_params(net_params)

        pg_learners.append(pg_learner)

    accums = init_accums(pg_learners[pa.batch_size])

    # --------------------------------------
    # print("Preparing for reference data...")
    # --------------------------------------
    # print('Start testing...')
    # for ite in range(10, 1000, 10):
    #     pg_resume = pa.output_filename + '_' + str(ite) + '.pkl'
    #     logline = test(ite, pa, pg_resume, workloads, repre)
    #     logs.write(logline)
    #     logs.flush()
    #     os.fsync(logs.fileno())
    # return
    # ref_discount_rews, ref_slow_down = slow_down_cdf.launch(pa, pg_resume=None, render=False,
    #                                                         plot=False, repre=repre, end=end)
    # mean_rew_lr_curve = []
    # max_rew_lr_curve = []
    # slow_down_lr_curve = []

    # --------------------------------------
    print("Start training...")
    # --------------------------------------

    timer_start = time.time()

    for iteration in range(1, pa.num_epochs):

        ps = []              # processes
        manager = Manager()  # managing return results
        manager_result = manager.list([])

        ex_indices = list(range(pa.num_ex))  # list so it can be shuffled if re-enabled
        # np.random.shuffle(ex_indices)

        all_eprews = []
        grads_all = []
        loss_all = []
        eprews = []
        eplens = []
        all_slowdown = []
        all_entropy = []

        ex_counter = 0
        for ex in range(pa.num_ex):

            ex_idx = ex_indices[ex]

            p = Process(target=get_traj_worker,
                        args=(pg_learners[ex_counter], envs[ex_idx], pa, manager_result,))
            ps.append(p)

            ex_counter += 1

            if ex_counter >= pa.batch_size or ex == pa.num_ex - 1:
                print(ex, "out of", pa.num_ex)

                ex_counter = 0

                for p in ps:
                    p.start()
                for p in ps:
                    p.join()

                result = []  # convert list from shared memory
                for r in manager_result:
                    result.append(r)

                ps = []
                manager_result = manager.list([])

                all_ob = concatenate_all_ob_across_examples([r["all_ob"] for r in result], pa)
                all_action = np.concatenate([r["all_action"] for r in result])
                all_adv = np.concatenate([r["all_adv"] for r in result])

                # compute the gradient with the last 'worker'; the summed update
                # is propagated to all workers at the end of the iteration
                grads = pg_learners[pa.batch_size].get_grad(all_ob, all_action, all_adv)

                grads_all.append(grads)

                all_eprews.extend([r["all_eprews"] for r in result])

                eprews.extend(np.concatenate([r["all_eprews"] for r in result]))  # episode total rewards
                eplens.extend(np.concatenate([r["all_eplens"] for r in result]))  # episode lengths

                # all_slowdown.extend(np.concatenate([r["all_slowdown"] for r in result]))
                # all_entropy.extend(np.concatenate([r["all_entropy"] for r in result]))

        # assemble gradients
        grads = grads_all[0]
        for i in range(1, len(grads_all)):
            for j in range(len(grads)):
                grads[j] += grads_all[i][j]

        # propagate network parameters to others
        params = pg_learners[pa.batch_size].get_params()

        rmsprop_updates_outside(grads, params, accums, pa.lr_rate, pa.rms_rho, pa.rms_eps)

        for i in range(pa.batch_size + 1):
            pg_learners[i].set_net_params(params)

        timer_end = time.time()

        print("-----------------")
        print("Iteration: \t %i" % iteration)
        print("NumTrajs: \t %i" % len(eprews))
        print("NumTimesteps: \t %i" % np.sum(eplens))
        # print("Loss: \t %s" % np.mean(loss_all))
        print("MaxRew: \t %s" % np.average([np.max(rew) for rew in all_eprews]))
        print("MeanRew: \t %s +- %s" % (np.mean(eprews), np.std(eprews)))
        # print("MeanSlowdown: \t %s" % np.mean(all_slowdown))
        print("MeanLen: \t %s +- %s" % (np.mean(eplens), np.std(eplens)))
        # print("MeanEntropy \t %s" % (np.mean(all_entropy)))
        print("Elapsed time\t %s" % (timer_end - timer_start), "seconds")
        print("-----------------")

        # max_rew_lr_curve.append(np.average([np.max(rew) for rew in all_eprews]))
        # mean_rew_lr_curve.append(np.mean(eprews))
        # slow_down_lr_curve.append(np.mean(all_slowdown))

        if iteration % pa.output_freq == 0:
            pg_resume = pa.output_filename + '_' + str(iteration) + '.pkl'

            param_file = open(pg_resume, 'wb')
            # save the master copy held by the last worker (the original dumped
            # the loop-leaked `pg_learner`, which only matches after the
            # parameter broadcast above)
            pickle.dump(pg_learners[pa.batch_size].get_params(), param_file, -1)
            param_file.close()

            # call matches test(it, pa, pg_resume, workloads, ...) defined below
            test(iteration, pa, pg_resume, workloads)
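# ---------------------------------------------------------------------------
# Sketch: `get_traj_worker`, the Process target used by the parallel launchers
# above, is not shown in this section. The following is a condensed sketch
# consistent with the result fields consumed by those launchers. It assumes
# the RL-style get_traj (returning a dict with 'ob', 'reward', 'action',
# 'entropy', 'info'), not the test-style runners at the top of this section.
# ---------------------------------------------------------------------------
def get_traj_worker(pg_learner, env, pa, result):
    # roll out num_seq_per_batch trajectories, compute advantages against a
    # time-dependent baseline, and append one result dict to the shared list
    trajs = [get_traj(pg_learner, env, pa.episode_max_length)
             for _ in range(pa.num_seq_per_batch)]

    rets = [discount(traj["reward"], pa.discount) for traj in trajs]
    maxlen = max(len(ret) for ret in rets)
    padded = [np.concatenate([ret, np.zeros(maxlen - len(ret))]) for ret in rets]
    baseline = np.mean(padded, axis=0)  # time-dependent baseline
    advs = [ret - baseline[:len(ret)] for ret in rets]

    # per-job slowdown over the finished jobs of all trajectories
    enter_time, finish_time, job_len = process_all_info(trajs)
    finished_idx = (finish_time >= 0)

    result.append({
        "all_ob": concatenate_all_ob(trajs, pa),
        "all_action": np.concatenate([traj["action"] for traj in trajs]),
        "all_adv": np.concatenate(advs),
        "all_eprews": np.array([ret[0] for ret in rets]),  # discounted episode returns
        "all_eplens": np.array([len(traj["reward"]) for traj in trajs]),
        "all_slowdown": (finish_time[finished_idx] - enter_time[finished_idx])
                        / job_len[finished_idx],
        "all_entropy": np.concatenate([traj["entropy"] for traj in trajs]),
    })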
def test(it, pa, pg_resume, workloads, episode_max_length=200):

    repre = 'image'
    end = 'all_done'

    agent = Heuristic_Agents()

    pg_learner = pg_network.PGLearner(pa)

    if pg_resume is not None:
        net_handle = open(pg_resume, 'rb')
        net_params = pickle.load(net_handle)
        pg_learner.set_net_params(net_params)

    new_env = Env1(0, 1)

    job_distribution = Dist()
    nw_len_seqs, nw_size_seqs = job_distribution.generate_sequence_work(pa, seed=42)

    logline = str(it) + '\n'

    for ex in range(pa.num_test_ex):

        env = environment.Env(pa, nw_len_seqs=nw_len_seqs, nw_size_seqs=nw_size_seqs,
                              render=False, repre=repre, end=end)
        env.seq_no = ex + pa.num_ex

        new_env.reset()
        new_env.workload_seq = workloads[ex + pa.num_ex]
        new_env.generate_workload()
        print('Testing : ', new_env.workload_seq)

        env.reset()

        obs = []
        new_obs = []
        acts = []
        new_acts = []
        rews = []
        utils = ''
        suffer = []
        new_rews = []
        entropy = []
        finished_episode_len = 0
        crs = [0] * pa.num_machines
        crs_max = [0] * pa.num_machines
        info = []

        new_ob = new_env.observe()
        ob = env.observe()
        counter = 0

        for _ in range(episode_max_length):  # was a hard-coded 200, the same value as the default

            act_prob = pg_learner.get_one_act_prob(ob)
            a = np.argmax(act_prob)

            act = agent.get_action(new_env, a)

            new_obs.append(new_ob)
            new_acts.append(act)

            new_ob, new_rews, done1, info1 = new_env.step(act, _, new_rews, a)

            # a = (csprob_n > np.random.rand()).argmax()
            # np.set_printoptions(linewidth=40*5, precision=2, threshold=np.nan)
            # print('State>>', ob)
            # print('Action>>', a)

            obs.append(ob)  # store the ob at the current decision making step
            acts.append(a)

            ob, rew, done, info = env.step(a, repeat=True)
            counter += 1

            if info == 'Allocation_Success':
                finished_episode_len = _ + 1

            # print('Reward>>', rew)
            rews.append(rew)
            entropy.append(get_entropy(act_prob))

            if done1:
                break

            # per-step utilization, overcommit "suffering", and pairwise
            # interference cross-terms for every machine
            util = ''
            for k, machine in enumerate(new_env.machines):
                if len(machine.running_tasks) > 0:
                    if machine.cpus_left >= 0:
                        util += str(machine.total_cpus - machine.cpus_left) + ','
                    else:
                        util += str(machine.total_cpus) + ','
                        suffer.append(abs(machine.cpus_left))
                else:
                    util += str(0) + ','

                crs_this_time = [0] * pa.num_machines
                for i in range(len(machine.running_tasks)):
                    for j in range(i + 1, len(machine.running_tasks)):
                        task_i, task_j = machine.running_tasks[i], machine.running_tasks[j]
                        if task_i != task_j:
                            crs[k] += pa.interference_penalty * \
                                (task_i.cpu_util[-1] * task_j.cpu_util[-1]) * (-1)
                            crs_this_time[k] += pa.interference_penalty * \
                                (task_i.cpu_util[-1] * task_j.cpu_util[-1]) * (-1)
                crs_max[k] = max(crs_max[k], crs_this_time[k])

            utils += util + '|'

        logline += str(counter - 1) + '|' + str(utils) + str(finished_episode_len) + '\n' + \
            str(sum(new_rews)) + '\n' + str(sum(suffer)) + '\n'

        for i in range(len(new_env.machines)):
            logline += str(crs[i]) + ','
        logline = logline[:-1] + '\n'

        for i in range(len(new_env.machines)):
            logline += str(crs_max[i]) + ','
        logline = logline[:-1]
        logline += '\n'

        print('Iteration number ', it)
        print('Example No:,', ex)
        print('Test Actions : ', new_acts)
        print('Reward : ', new_rews)
        print('Total reward : ', sum(new_rews))

    return logline
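# ---------------------------------------------------------------------------
# Sketch: `get_entropy` is assumed to compute the Shannon entropy of the
# action distribution `act_prob`. A minimal version consistent with that use
# (not necessarily the original) could be:
# ---------------------------------------------------------------------------
def get_entropy(vec):
    # Shannon entropy of a probability vector; the small epsilon guards
    # against log(0), and the NaN check covers degenerate inputs
    entropy = -np.sum(vec * np.log(vec + 1e-12))
    if np.isnan(entropy):
        entropy = 0.0
    return entropy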
def launch(pa, pg_resume=None, render=True, repre='image', end='no_new_job'):

    f = open('log/re_log_' + datetime.now().strftime('%Y-%m-%d_%H:%M:%S'), 'a')

    env = environment.Env(pa, render=render, repre=repre, end=end)

    pg_learner = pg_network.PGLearner(pa)

    startIdx = 0
    if pg_resume is not None:  # and 're' in pg_resume:
        net_handle = open(pg_resume, 'rb')
        net_params = cPickle.load(net_handle)
        pg_learner.set_net_params(net_params)

        # recover the iteration index from the checkpoint filename
        tmp = re.match(r'.+?(\d+).+', pg_resume)
        startIdx = int(tmp.group(1))

    # ----------------------------
    print("\nPreparing for data...")
    # ----------------------------

    ref_discount_rews, ref_slow_down = slow_down_cdf.launch(pa, pg_resume=None, render=True,
                                                            plot=False, repre=repre, end=end)

    mean_rew_lr_curve = []
    max_rew_lr_curve = []
    slow_down_lr_curve = []

    timer_start = time.time()

    print("\nStart reinforcement learning...")

    for iteration in xrange(startIdx, pa.num_epochs):

        all_ob = []
        all_action = []
        all_adv = []
        all_eprews = []
        all_eplens = []
        all_slowdown = []
        all_entropy = []

        # go through all examples
        for ex in xrange(pa.num_ex):

            # Collect trajectories until we get timesteps_per_batch total timesteps
            trajs = []
            for i in xrange(pa.num_seq_per_batch):
                traj = get_traj(pg_learner, env, pa.episode_max_length)
                trajs.append(traj)

            # roll to next example
            env.seq_no = (env.seq_no + 1) % env.pa.num_ex

            all_ob.append(concatenate_all_ob(trajs, pa))

            # Compute discounted sums of rewards
            rets = [discount(traj["reward"], pa.discount) for traj in trajs]
            maxlen = max(len(ret) for ret in rets)
            padded_rets = [np.concatenate([ret, np.zeros(maxlen - len(ret))]) for ret in rets]

            # Compute time-dependent baseline
            baseline = np.mean(padded_rets, axis=0)

            # Compute advantage function
            advs = [ret - baseline[:len(ret)] for ret in rets]

            all_action.append(np.concatenate([traj["action"] for traj in trajs]))
            all_adv.append(np.concatenate(advs))

            all_eprews.append(np.array([discount(traj["reward"], pa.discount)[0]
                                        for traj in trajs]))  # episode total rewards
            all_eplens.append(np.array([len(traj["reward"]) for traj in trajs]))  # episode lengths

            # All Job Stat
            enter_time, finish_time, job_len = process_all_info(trajs)
            finished_idx = (finish_time >= 0)
            all_slowdown.append((finish_time[finished_idx] - enter_time[finished_idx]) /
                                job_len[finished_idx])

            # Action prob entropy: gather all trajectories
            # (the original only picked up the last one)
            all_entropy.append(np.concatenate([traj["entropy"] for traj in trajs]))

        all_ob = concatenate_all_ob_across_examples(all_ob, pa)
        all_action = np.concatenate(all_action)
        all_adv = np.concatenate(all_adv)

        # Do policy gradient update step
        loss = pg_learner.train(all_ob, all_action, all_adv)
        eprews = np.concatenate(all_eprews)  # episode total rewards
        eplens = np.concatenate(all_eplens)  # episode lengths

        all_slowdown = np.concatenate(all_slowdown)
        all_entropy = np.concatenate(all_entropy)

        timer_end = time.time()

        print "-----------------"
        print "Iteration: \t %i" % iteration
        print "NumTrajs: \t %i" % len(eprews)
        print "NumTimesteps: \t %i" % np.sum(eplens)
        print "Loss: \t %s" % loss
        print "MaxRew: \t %s" % np.average([np.max(rew) for rew in all_eprews])
        print "MeanRew: \t %s +- %s" % (eprews.mean(), eprews.std())
        print "MeanSlowdown: \t %s" % np.mean(all_slowdown)
        print "MeanLen: \t %s +- %s" % (eplens.mean(), eplens.std())
        print "MeanEntropy \t %s" % (np.mean(all_entropy))
        print "Elapsed time\t %s" % (timer_end - timer_start), "seconds"
        print "-----------------"

        f.write("-----------------\n")
        f.write("Iteration: \t %i\n" % iteration)
        f.write("NumTrajs: \t %i\n" % len(eprews))
        f.write("NumTimesteps: \t %i\n" % np.sum(eplens))
        f.write("Loss: \t %s\n" % loss)  # the original mixed .format() with a %s placeholder
        f.write("MaxRew: \t %s\n" % np.average([np.max(rew) for rew in all_eprews]))
        f.write("MeanRew: \t %s +- %s\n" % (np.mean(eprews), np.std(eprews)))
        f.write("MeanSlowdown: \t %s\n" % np.mean(all_slowdown))
        f.write("MeanLen: \t %s +- %s\n" % (np.mean(eplens), np.std(eplens)))
        f.write("MeanEntropy \t %s\n" % np.mean(all_entropy))
        f.write("Elapsed time\t %s seconds\n" % (timer_end - timer_start))
        f.write("-----------------\n")
        f.flush()  # keep the log open across iterations (closing here would break the next write)

        timer_start = time.time()

        max_rew_lr_curve.append(np.average([np.max(rew) for rew in all_eprews]))
        mean_rew_lr_curve.append(eprews.mean())
        slow_down_lr_curve.append(np.mean(all_slowdown))

        if iteration % pa.output_freq == 0:
            param_file = open(pa.output_filename + '_' + str(iteration) + '.pkl', 'wb')
            cPickle.dump(pg_learner.get_params(), param_file, -1)
            param_file.close()

            # added by wjchen, to record accuracy and rewards
            sample_file = h5py.File('log/re_record_iter' + str(len(slow_down_lr_curve))
                                    + datetime.now().strftime('%Y-%m-%d_%H:%M') + '.h5', 'w')
            sample_file.create_dataset('max_rew_lr_curve', data=max_rew_lr_curve)
            sample_file.create_dataset('mean_rew_lr_curve', data=mean_rew_lr_curve)
            sample_file.create_dataset('slow_down_lr_curve', data=slow_down_lr_curve)

            ref_dr = sample_file.create_group('ref_discount_rews')
            for k, v in ref_discount_rews.items():
                ref_dr[k] = np.average(v)

            ref_sd = sample_file.create_group('ref_slow_down')
            for k, v in ref_slow_down.items():
                ref_sd[k] = np.average(np.concatenate(v))
            sample_file.close()

            slow_down_cdf.launch(pa, pa.output_filename + '_' + str(iteration) + '.pkl',
                                 render=True, plot=True, repre=repre, end=end)

            plot_lr_curve(pa.output_filename,
                          max_rew_lr_curve, mean_rew_lr_curve, slow_down_lr_curve,
                          ref_discount_rews, ref_slow_down)

    f.close()
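# ---------------------------------------------------------------------------
# Read-back sketch: the .h5 record written above can be inspected with plain
# h5py calls. The path passed in is whatever timestamped file the training
# loop actually produced.
# ---------------------------------------------------------------------------
def load_re_record(path):
    # returns the learning curves plus the per-policy reference slowdowns
    with h5py.File(path, 'r') as record:
        curves = {name: record[name][:] for name in
                  ('max_rew_lr_curve', 'mean_rew_lr_curve', 'slow_down_lr_curve')}
        ref_sd = {k: record['ref_slow_down'][k][()] for k in record['ref_slow_down']}
    return curves, ref_sd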
def launch(pa, pg_resume=None, render=False, repre='image', end='no_new_job'):

    # ----------------------------
    print("Preparing for workers...")
    # ----------------------------

    pg_learners = []
    envs = []

    plot_maker = plot.PlotMaker(pa)

    for ex in range(pa.num_ex):
        env = environment.Env(pa, render=False, repre=repre, end=end)
        env.seq_no = ex
        envs.append(env)

    # the original per-worker learner pool is disabled:
    # for ex in range(pa.batch_size + 1):  # last worker for updating the parameters
    #     pg_learner = pg_network.PGLearner(pa)
    #     if pg_resume is not None:
    #         net_handle = open(pg_resume, 'rb')
    #         net_params = cPickle.load(net_handle)
    #         pg_learner.set_net_params(net_params)
    #     pg_learners.append(pg_learner)
    # accums = init_accums(pg_learners[pa.batch_size])

    # a single learner stands in for the pool; without this, the
    # pg_learners[...] accesses below would fail on an empty list
    pg_learner = pg_network.PGLearner(pa)
    if pg_resume is not None:
        net_handle = open(pg_resume, 'rb')
        net_params = cPickle.load(net_handle)
        pg_learner.set_net_params(net_params)
    pg_learners.append(pg_learner)

    # --------------------------------------
    print("Preparing for reference data...")
    # --------------------------------------

    # ref_discount_rews, ref_slow_down = slow_down_cdf.launch(pa, pg_resume=None, render=False,
    #                                                         plot=False, repre=repre, end=end)
    # mean_rew_lr_curve = []
    # max_rew_lr_curve = []
    # slow_down_lr_curve = []

    # --------------------------------------
    print("Start training...")
    # --------------------------------------

    for iteration in range(1, pa.num_epochs):

        timer_start = time.time()

        with open('somefile.txt', 'a') as the_file:
            the_file.write("----------------Iteration %d----------------\n" % iteration)

        ps = []              # processes (unused: trajectories now run in-process)
        manager = Manager()  # managing return results
        manager_result = manager.list([])

        ex_indices = np.arange(pa.num_ex)
        np.random.shuffle(ex_indices)

        all_eprews = []
        grads_all = []
        loss_all = []
        eprews = []
        eplens = []
        all_slowdown = []
        all_entropy = []

        ex_counter = 0
        for ex in range(pa.num_ex):

            ex_idx = ex_indices[ex]

            # the original Process-based dispatch is disabled; trajectories
            # are collected synchronously instead:
            # p = Process(target=get_traj,
            #             args=(pg_learners[ex_counter], envs[ex_idx],
            #                   pa.episode_max_length, manager_result,))
            # ps.append(p)
            # ex_counter += 1
            # if ex_counter >= pa.batch_size or ex == pa.num_ex - 1:
            #     ex_counter = 0
            #     for p in ps:
            #         p.start()
            #     for p in ps:
            #         p.join()

            result = []  # convert list from shared memory
            ps = []

            get_traj_worker(pg_learners[ex_counter], envs[ex_idx], pa, manager_result)

            for r in manager_result:
                result.append(r)
            manager_result = manager.list([])

            all_ob = concatenate_all_ob_across_examples([r["all_ob"] for r in result], pa)
            all_action = np.concatenate([r["all_action"] for r in result])
            all_machine = np.concatenate([r["all_machine"] for r in result])
            all_adv = np.concatenate([r["all_adv"] for r in result])

            # train the first (and only) agent
            pg_learners[0].fit(all_ob, all_action, all_machine, all_adv)

            all_eprews.extend([r["all_eprews"] for r in result])
            eprews.extend(np.concatenate([r["all_eprews"] for r in result]))  # episode total rewards
            eplens.extend(np.concatenate([r["all_eplens"] for r in result]))  # episode lengths
            all_slowdown.extend(np.concatenate([r["all_slowdown"] for r in result]))
            all_entropy.extend(np.concatenate([r["all_entropy"] for r in result]))

        timer_end = time.time()

        # MARK: changed
        slowdown_all_in_one = np.concatenate(all_slowdown)
        print(slowdown_all_in_one.shape)

        print("-----------------")
        print("Iteration: \t %i" % iteration)
        print("NumTrajs: \t %i" % len(eprews))
        print("NumTimesteps: \t %i" % np.sum(eplens))
        print("Loss: \t %s" % np.mean(loss_all))  # empty unless a loss is recorded
        print("MaxRew: \t %s" % np.average([np.max(rew) for rew in all_eprews]))
        print("MeanRew: \t %s +- %s" % (np.mean(eprews), np.std(eprews)))
        print("MeanSlowdown: \t %s" % np.mean(slowdown_all_in_one))
        print("MeanLen: \t %s +- %s" % (np.mean(eplens), np.std(eplens)))
        print("MeanEntropy \t %s" % (np.mean(all_entropy)))
        print("Elapsed time\t %s" % (timer_end - timer_start), "seconds")
        print("-----------------")

        plot_maker.slow_down_records.append(all_slowdown)

        with open('somefile.txt', 'a') as the_file:
            the_file.write("MeanRew: \t %s +- %s\n" % (np.mean(eprews), np.std(eprews)))
            the_file.write("MeanSlowdown: \t %s\n-----------------\n\n"
                           % np.mean(slowdown_all_in_one))

        # TODO: set parameters for the other agents once the worker pool is re-enabled
        # for i in xrange(pa.batch_size + 1):
        #     pg_learners[i].set_net_params(params)

        # checkpointing and reference plotting from the original launcher,
        # disabled in this variant:
        # if iteration % pa.output_freq == 0:
        #     param_file = open(pa.output_filename + '_' + str(iteration) + '.pkl', 'wb')
        #     cPickle.dump(pg_learners[pa.batch_size].get_params(), param_file, -1)
        #     param_file.close()
        #     pa.unseen = True  # test on unseen examples
        #     slow_down_cdf.launch(pa, pa.output_filename + '_' + str(iteration) + '.pkl',
        #                          render=False, plot=True, repre=repre, end=end)
        #     pa.unseen = False
        #     plot_lr_curve(pa.output_filename,
        #                   max_rew_lr_curve, mean_rew_lr_curve, slow_down_lr_curve,
        #                   ref_discount_rews, ref_slow_down)

    plot_maker.plot()
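# ---------------------------------------------------------------------------
# Sketch: `plot.PlotMaker` is not shown in this section. A minimal stand-in
# consistent with its use above (a `slow_down_records` list that receives the
# per-iteration slowdown data, and a final `plot()` call) could be:
# ---------------------------------------------------------------------------
import matplotlib
matplotlib.use('Agg')  # headless backend for training servers
import matplotlib.pyplot as plt
import numpy as np


class PlotMaker(object):

    def __init__(self, pa):
        self.pa = pa
        self.slow_down_records = []  # one entry per training iteration

    def plot(self):
        # plot the mean slowdown per iteration and save next to the checkpoints
        means = [np.mean(np.hstack(rec)) for rec in self.slow_down_records]
        plt.figure()
        plt.plot(means)
        plt.xlabel('iteration')
        plt.ylabel('mean slowdown')
        plt.savefig(self.pa.output_filename + '_slowdown.pdf')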
def main():

    pa = parameters.Parameters()

    type_exp = 'pg_re'  # 'pg_su', 'pg_su_compact', 'v_su', 'pg_v_re', 'pg_re', 'q_re', 'test'

    pg_resume = None
    v_resume = None
    q_resume = None
    log = None

    render = False
    plot = False

    try:
        # note: only -h, -i and -o are registered as short options;
        # everything else is long-option only
        opts, args = getopt.getopt(
            sys.argv[1:], "hi:o:",
            ["exp_type=", "num_res=", "num_nw=", "simu_len=", "num_ex=",
             "num_seq_per_batch=", "eps_max_len=", "num_epochs=",
             "time_horizon=", "res_slot=", "max_job_len=", "max_job_size=",
             "new_job_rate=", "dist=", "lr_rate=", "ba_size=", "pg_re=",
             "v_re=", "q_re=", "out_freq=", "ofile=", "log=", "render=",
             "unseen=", "plot="])
    except getopt.GetoptError:
        script_usage()
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            script_usage()
            sys.exit()
        elif opt in ("-e", "--exp_type"):
            type_exp = arg
        elif opt in ("-n", "--num_res"):
            pa.num_resources = int(arg)
        elif opt in ("-w", "--num_nw"):
            pa.num_nw = int(arg)
        elif opt in ("-s", "--simu_len"):
            pa.simu_len = int(arg)
        elif opt in ("-n", "--num_ex"):  # '-n' duplicates --num_res above; only the long form works
            pa.num_ex = int(arg)
        elif opt in ("-sp", "--num_seq_per_batch"):
            pa.num_seq_per_batch = int(arg)
        elif opt in ("-el", "--eps_max_len"):
            pa.episode_max_length = int(arg)
        elif opt in ("-ne", "--num_epochs"):
            pa.num_epochs = int(arg)
        elif opt in ("-t", "--time_horizon"):
            pa.time_horizon = int(arg)
        elif opt in ("-rs", "--res_slot"):
            pa.res_slot = int(arg)
        elif opt in ("-ml", "--max_job_len"):
            pa.max_job_len = int(arg)
        elif opt in ("-ms", "--max_job_size"):
            pa.max_job_size = int(arg)
        elif opt in ("-nr", "--new_job_rate"):
            pa.new_job_rate = float(arg)
        elif opt in ("-d", "--dist"):
            pa.discount = float(arg)
        elif opt in ("-l", "--lr_rate"):
            pa.lr_rate = float(arg)
        elif opt in ("-b", "--ba_size"):
            pa.batch_size = int(arg)
        elif opt in ("-p", "--pg_re"):
            pg_resume = arg
        elif opt in ("-v", "--v_re"):
            v_resume = arg
        elif opt in ("-q", "--q_re"):
            q_resume = arg
        elif opt in ("-f", "--out_freq"):
            pa.output_freq = int(arg)
        elif opt in ("-o", "--ofile"):
            pa.output_filename = arg
        elif opt in ("-lg", "--log"):
            log = arg
        elif opt in ("-r", "--render"):
            render = (arg == 'True')
        elif opt in ("-pl", "--plot"):
            plot = (arg == 'True')
        elif opt in ("-u", "--unseen"):
            pa.generate_unseen = (arg == 'True')
        else:
            script_usage()
            sys.exit()

    if log is not None:
        orig_stdout = sys.stdout
        f = open(log, 'w')
        sys.stdout = f

    if pg_resume is None:
        print("PG resume is empty!")
        sys.exit(1)

    pa.compute_dependent_parameters()

    repre = 'image'
    end = 'all_done'

    env = environment.Env(pa, render=render, repre=repre, end=end)

    pg_learner = pg_network.PGLearner(pa)

    net_handle = open(pg_resume, 'rb')
    net_params = cPickle.load(net_handle)
    pg_learner.set_net_params(net_params)

    outputFileName = pa.output_filename + '_' + ntpath.basename(pg_resume) + '_test.pkl'
    pg_learner.write_net_to_nnet(outputFileName)

    nnetFilename = outputFileName + '.nnet'

    r = nnet.NNet(nnetFilename)

    smallHigherBound = 1.0e-10
    smallLowerBound = -1.0e-10

    for wIdx, w in enumerate(r.weights):
        # rows whose incoming weights are all (near) zero belong to 'dead' neurons
        smallRows = np.all((w <= smallHigherBound) & (w >= smallLowerBound), axis=1)
        smallRowsIndices = np.where(smallRows)

        for row in smallRowsIndices[0]:
            rowBias = r.biases[wIdx][row]
            # fold the dead neuron's constant output into the next layer's
            # biases, since we assume fully connected layers. Assuming ReLU
            # hidden units, the constant output is max(rowBias, 0) scaled by
            # the outgoing weights (the original added the raw scalar bias to
            # every next-layer bias, which is not equivalent).
            if (wIdx + 1) < len(r.biases):
                r.biases[wIdx + 1] = r.biases[wIdx + 1] + \
                    r.weights[wIdx + 1][:, row] * max(rowBias, 0.0)

        # now delete the rows with 'dead' neurons
        r.weights[wIdx] = np.delete(w, smallRowsIndices[0], axis=0)
        if (wIdx + 1) < len(r.weights):
            r.weights[wIdx + 1] = np.delete(r.weights[wIdx + 1], smallRowsIndices[0], axis=1)
        r.biases[wIdx] = np.delete(r.biases[wIdx], smallRowsIndices[0], axis=0)

    # export the file once again after the 'dead' neuron filtration
    for wIdx, w in enumerate(r.weights):
        r.weights[wIdx] = r.weights[wIdx].transpose()

    writeNNet.writeNNet(r.weights, r.biases, r.mins, r.maxes, r.means, r.ranges,
                        outputFileName + '_cleaned_' + '.nnet')

    if log is not None:
        sys.stdout = orig_stdout
        f.close()
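# ---------------------------------------------------------------------------
# Verification sketch: pruning all-zero rows should leave the network function
# unchanged, provided the folded constant is the neuron's post-activation
# output. This check assumes ReLU hidden layers, a linear output layer, and no
# dead rows in the output layer, and it must run on the row-major
# (outputs x inputs) weights BEFORE the final transpose above.
# `weights_before`/`biases_before` are copies taken before the pruning loop.
# ---------------------------------------------------------------------------
def forward_pass(weights, biases, x):
    # plain feed-forward pass: ReLU on hidden layers, linear output layer
    for w, b in zip(weights[:-1], biases[:-1]):
        x = np.maximum(0.0, w.dot(x) + b)
    return weights[-1].dot(x) + biases[-1]


def check_pruning(weights_before, biases_before, weights_after, biases_after):
    # the pruned network must agree with the original on a random input
    x = np.random.rand(weights_before[0].shape[1])
    assert np.allclose(forward_pass(weights_before, biases_before, x),
                       forward_pass(weights_after, biases_after, x), atol=1e-6)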
def launch(pa, pg_resume=None, render=False, repre='image', end='no_new_job'):

    # ----------------------------
    print("Preparing for workers...")
    # ----------------------------

    pg_learners = []
    envs = []

    nw_len_seqs, nw_size_seqs = job_distribution.generate_sequence_work(pa, seed=42)

    for ex in xrange(pa.num_ex):  # number of sequences
        print "-prepare for env-", ex

        env = environment.Env(pa, nw_len_seqs=nw_len_seqs, nw_size_seqs=nw_size_seqs,
                              render=True, repre=repre, end=end)
        env.seq_no = ex
        envs.append(env)

    startIndex = 0
    for ex in xrange(pa.batch_size + 1):  # last worker for updating the parameters
        print "-prepare for worker-", ex

        pg_learner = pg_network.PGLearner(pa)

        if pg_resume is not None:
            net_handle = open(pg_resume, 'rb')
            net_params = cPickle.load(net_handle)
            pg_learner.set_net_params(net_params)

            # recover the iteration index from the checkpoint filename
            # (the original swapped the re.match arguments)
            startIndex = int(re.match(r'.+?(\d+)', pg_resume).group(1))

        pg_learners.append(pg_learner)

    accums = init_accums(pg_learners[pa.batch_size])

    # --------------------------------------
    print("Preparing for reference data...")
    # --------------------------------------

    # reference examples: discounted rewards and slowdowns
    # from the Random, SJF and Tetris baselines
    ref_discount_rews, ref_slow_down = slow_down_cdf.launch(pa, pg_resume=None, render=True,
                                                            plot=False, repre=repre, end=end)

    mean_rew_lr_curve = []
    max_rew_lr_curve = []
    slow_down_lr_curve = []

    # --------------------------------------
    print("Start training...")
    # --------------------------------------

    timer_start = time.time()

    for iteration in xrange(startIndex, pa.num_epochs):

        ps = []              # processes
        manager = Manager()  # managing return results
        manager_result = manager.list([])

        ex_indices = range(pa.num_ex)
        np.random.shuffle(ex_indices)

        all_eprews = []
        grads_all = []
        loss_all = []
        eprews = []
        eplens = []
        all_slowdown = []
        all_entropy = []

        ex_counter = 0
        for ex in xrange(pa.num_ex):

            ex_idx = ex_indices[ex]

            p = Process(target=get_traj_worker,
                        args=(pg_learners[ex_counter], envs[ex_idx], pa, manager_result,))
            ps.append(p)

            ex_counter += 1

            # accumulate processes in ps until a full batch is ready
            if ex_counter >= pa.batch_size or ex == pa.num_ex - 1:
                print ex + 1, "out of", pa.num_ex

                ex_counter = 0

                for p in ps:
                    p.start()
                for p in ps:
                    p.join()

                result = []  # convert list from shared memory
                for r in manager_result:
                    result.append(r)

                ps = []
                manager_result = manager.list([])

                all_ob = concatenate_all_ob_across_examples([r["all_ob"] for r in result], pa)
                all_action = np.concatenate([r["all_action"] for r in result])
                all_adv = np.concatenate([r["all_adv"] for r in result])

                # compute the gradient with the last 'worker' (states, actions, values);
                # the summed update is propagated to all workers at the end of the iteration
                grads = pg_learners[pa.batch_size].get_grad(all_ob, all_action, all_adv)

                grads_all.append(grads)

                all_eprews.extend([r["all_eprews"] for r in result])

                eprews.extend(np.concatenate([r["all_eprews"] for r in result]))  # episode total rewards
                eplens.extend(np.concatenate([r["all_eplens"] for r in result]))  # episode lengths

                all_slowdown.extend(np.concatenate([r["all_slowdown"] for r in result]))
                all_entropy.extend(np.concatenate([r["all_entropy"] for r in result]))

        # assemble gradients
        grads = grads_all[0]
        for i in xrange(1, len(grads_all)):
            for j in xrange(len(grads)):
                grads[j] += grads_all[i][j]

        # propagate network parameters to others
        params = pg_learners[pa.batch_size].get_params()

        rmsprop_updates_outside(grads, params, accums, pa.lr_rate, pa.rms_rho, pa.rms_eps)

        for i in xrange(pa.batch_size + 1):
            pg_learners[i].set_net_params(params)

        timer_end = time.time()

        print "-----------------"
        print "Iteration: \t %i" % iteration
        print "NumTrajs: \t %i" % len(eprews)
        print "NumTimesteps: \t %i" % np.sum(eplens)
        # print "Loss: \t %s" % np.mean(loss_all)
        print "MaxRew: \t %s" % np.average([np.max(rew) for rew in all_eprews])
        print "MeanRew: \t %s +- %s" % (np.mean(eprews), np.std(eprews))
        print "MeanSlowdown: \t %s" % np.mean(all_slowdown)
        print "MeanLen: \t %s +- %s" % (np.mean(eplens), np.std(eplens))
        print "MeanEntropy \t %s" % (np.mean(all_entropy))
        print "Elapsed time\t %s" % (timer_end - timer_start), "seconds"
        print "-----------------"

        f = open('log/re_log_' + datetime.now().strftime('%Y-%m-%d_%H:%M:%S'), 'w+')
        f.write("-----------------\n")
        f.write("Iteration: \t %i\n" % iteration)
        f.write("NumTrajs: \t %i\n" % len(eprews))
        f.write("NumTimesteps: \t %i\n" % np.sum(eplens))
        # f.write("Loss: \t %s\n" % loss)
        f.write("MaxRew: \t %s\n" % np.average([np.max(rew) for rew in all_eprews]))
        f.write("MeanRew: \t %s +- %s\n" % (np.mean(eprews), np.std(eprews)))
        f.write("MeanSlowdown: \t %s\n" % np.mean(all_slowdown))
        f.write("MeanLen: \t %s +- %s\n" % (np.mean(eplens), np.std(eplens)))
        f.write("MeanEntropy \t %s\n" % np.mean(all_entropy))
        f.write("Elapsed time\t %s seconds\n" % (timer_end - timer_start))
        f.write("-----------------\n")
        f.close()

        timer_start = time.time()

        max_rew_lr_curve.append(np.average([np.max(rew) for rew in all_eprews]))
        mean_rew_lr_curve.append(np.mean(eprews))
        slow_down_lr_curve.append(np.mean(all_slowdown))

        if iteration % pa.output_freq == 0:
            param_file = open(pa.output_filename + '_' + str(iteration) + '.pkl', 'wb')
            cPickle.dump(pg_learners[pa.batch_size].get_params(), param_file, -1)
            param_file.close()

            # added by wjchen, to record accuracy and rewards
            sample_file = h5py.File('log/re_record' + str(len(slow_down_lr_curve))
                                    + datetime.now().strftime('%Y-%m-%d_%H:%M') + '.h5', 'w')
            sample_file.create_dataset('max_rew_lr_curve', data=max_rew_lr_curve)
            sample_file.create_dataset('mean_rew_lr_curve', data=mean_rew_lr_curve)
            sample_file.create_dataset('slow_down_lr_curve', data=slow_down_lr_curve)

            ref_dr = sample_file.create_group('ref_discount_rews')
            for k, v in ref_discount_rews.items():
                ref_dr[k] = np.average(v)

            ref_sd = sample_file.create_group('ref_slow_down')
            for k, v in ref_slow_down.items():
                ref_sd[k] = np.average(np.concatenate(v))
            sample_file.close()

            # print ref_slow_down
            # print ref_discount_rews

            print '\n----Reference Slowdown----'
            for k, v in ref_slow_down.items():
                print "{}: {}".format(k, np.average(np.concatenate(v)))

            print '\n----Reference Discount Reward----'
            for k, v in ref_discount_rews.items():
                print "{}: {}".format(k, np.average(v))

            pa.unseen = True  # test on unseen examples
            slow_down_cdf.launch(pa, pa.output_filename + '_' + str(iteration) + '.pkl',
                                 render=True, plot=True, repre=repre, end=end)
            pa.unseen = False

            # draw averages of ref_discount_rews and ref_slow_down
            plot_lr_curve(pa.output_filename,
                          max_rew_lr_curve, mean_rew_lr_curve, slow_down_lr_curve,
                          ref_discount_rews, ref_slow_down)
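# ---------------------------------------------------------------------------
# Sketch: `concatenate_all_ob` and `concatenate_all_ob_across_examples` are
# helpers shared by all of the launchers above. These are minimal versions
# consistent with their use (stacking per-trajectory and per-example
# observation tensors into one training batch); the `traj["ob"]` field is an
# assumption inferred from the callers, and np.float32 stands in for
# theano.config.floatX.
# ---------------------------------------------------------------------------
def concatenate_all_ob(trajs, pa):
    # stack every timestep observation of every trajectory into one 4-D tensor
    timesteps_total = sum(len(traj["reward"]) for traj in trajs)
    all_ob = np.zeros((timesteps_total, 1,
                       pa.network_input_height, pa.network_input_width),
                      dtype=np.float32)
    timesteps = 0
    for traj in trajs:
        for ob in traj["ob"]:
            all_ob[timesteps, 0, :, :] = ob
            timesteps += 1
    return all_ob


def concatenate_all_ob_across_examples(all_ob, pa):
    # concatenate the per-example tensors along the sample axis
    total_samp = sum(ob.shape[0] for ob in all_ob)
    out = np.zeros((total_samp, 1,
                    pa.network_input_height, pa.network_input_width),
                   dtype=np.float32)
    pos = 0
    for ob in all_ob:
        out[pos: pos + ob.shape[0]] = ob
        pos += ob.shape[0]
    return out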
def get_traj_halluc(test_type, pa, env, episode_max_length, pg_resume=None, render=False):
    """
    Run the agent-environment loop for one whole episode (trajectory),
    choosing each real action by scoring hallucinated rollouts.
    Return the per-step rewards and the final episode info.
    """

    if test_type == 'PG':  # load trained parameters
        pg_learner = pg_network.PGLearner(pa)

        net_handle = open(pg_resume, 'rb')
        net_params = cPickle.load(net_handle)
        pg_learner.set_net_params(net_params)

    env.reset()
    rews = []

    rnn_tmp = env.rnn

    for te in xrange(episode_max_length):

        # snapshot the real environment before hallucinating futures;
        # the RNN is detached so deepcopy does not duplicate it
        env.rnn = None
        ori_env = copy.deepcopy(env)

        actions = []
        future = min(episode_max_length - te, pa.simu_len)
        rews_hals = np.zeros((pa.num_hal, future), dtype=float)

        if pa.rnn:
            rnn_tmp.forecast_from_history()

        for h in range(pa.num_hal):

            new_env = copy.deepcopy(ori_env)
            new_env.rnn = rnn_tmp
            if pa.rnn:
                new_env.replace_backlog_from_rnn()

            ob = new_env.observe()

            for th in range(future):

                if test_type == 'PG':
                    a = pg_learner.choose_action(ob)
                elif test_type == 'Tetris':
                    a = other_agents.get_packer_action(new_env.machine, new_env.job_slot)
                elif test_type == 'SJF':
                    a = other_agents.get_sjf_action(new_env.machine, new_env.job_slot)
                elif test_type == 'Random':
                    a = other_agents.get_random_action(new_env.job_slot)

                if th == 0:  # remember the first action of this hallucination
                    actions.append(a)

                ob, rew, done, info = new_env.step(a, repeat=True,
                                                   forecasting=(pa.rnn == True))

                if done:
                    break

                rews_hals[h][th] = rew

        # commit the first action of the best-scoring hallucinated rollout
        sum_rews = rews_hals.sum(axis=1, dtype=float)
        a_best = actions[np.argmax(sum_rews)]

        working_env = copy.deepcopy(ori_env)
        working_env.rnn = rnn_tmp

        if pa.rnn:
            ob, rew, done, info, new_job_list = working_env.step(a_best, repeat=True,
                                                                 return_raw_jobs=True)
            for new_job in new_job_list:
                working_env.rnn.update_history(new_job)
        else:
            ob, rew, done, info = working_env.step(a_best, repeat=True)

        rews.append(rew)

        if done:
            break
        if render:
            working_env.render()

        # carry the stepped copy forward as the real environment
        # (assumed fix: without this, the loop would re-copy the
        # never-advanced env at every timestep)
        env = working_env

    env.rnn = rnn_tmp
    return np.array(rews), info
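# ---------------------------------------------------------------------------
# Usage sketch (hypothetical): running the hallucination-based runner with an
# RNN workload forecaster attached to the environment. `workload_rnn` is a
# placeholder for a forecaster object providing the methods used above
# (forecast_from_history, update_history); pa.num_hal controls how many
# imagined rollouts are scored per real step.
# ---------------------------------------------------------------------------
def demo_halluc_rollout(workload_rnn):
    pa = parameters.Parameters()
    pa.num_hal = 10
    pa.rnn = True

    env = environment.Env(pa, render=False, repre='image', end='no_new_job')
    env.rnn = workload_rnn  # assumed forecaster, not defined in this section

    rews, info = get_traj_halluc('SJF', pa, env, pa.episode_max_length)
    print("episode return: %s" % rews.sum())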