Example #1
def get_traj(test_type, pa, env, episode_max_length, pg_resume=None, render=False):
    """
    Run agent-environment loop for one whole episode (trajectory)
    Return dictionary of results
    """

    if test_type == 'PG':  # load trained parameters

        pg_learner = pg_network.PGLearner(pa)

        net_handle = open(pg_resume, 'rb')
        net_params = cPickle.load(net_handle)
        pg_learner.set_net_params(net_params)

    env.reset()
    rews = []

    ob = env.observe()

    for _ in xrange(episode_max_length):

        if test_type == 'PG':
            a = pg_learner.choose_action(ob)

        elif test_type == 'Tetris':
            a = other_agents.get_packer_action(env.machines, env.job_slot)

        elif test_type == 'SJF':
            a = other_agents.get_sjf_action(env.machines, env.job_slot)

        elif test_type == 'Random':
            a = other_agents.get_random_action(env.machines, env.job_slot)

        elif test_type == 'SJF2':
            a = other_agents.get_sjf_action_for_multiple_machines(env.machines, env.job_slot)

        elif test_type == 'Packer2':
            a = other_agents.get_packer_action_for_multiple_machines(env.machines, env.job_slot)

        elif test_type == 'Tetris2':
            a = other_agents.get_packer_sjf_action_for_multiple_machines(env.machines, env.job_slot, 0.3)

        elif test_type == 'Random2':
            a = other_agents.get_random_action_for_multiple_machines(env.machines, env.job_slot)


        ob, rew, done, info = env.step(a, repeat=True)

        rews.append(rew)

        if done: break
        if render: env.render()
        # env.render()

    return np.array(rews), info
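A minimal usage sketch for the function above, comparing the trained policy against a heuristic on the same environment (assuming pa and env are already constructed; the checkpoint path is hypothetical):

rews_pg, info_pg = get_traj('PG', pa, env, pa.episode_max_length, pg_resume='pg_net_params.pkl')  # hypothetical checkpoint
rews_sjf, info_sjf = get_traj('SJF', pa, env, pa.episode_max_length)
print("PG total reward:", rews_pg.sum())
print("SJF total reward:", rews_sjf.sum())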
Example #2
def get_traj(test_type,
             pa,
             env,
             episode_max_length,
             pg_resume=None,
             render=False,
             q_agent=None):
    """
    Run agent-environment loop for one whole episode (trajectory)
    Return dictionary of results
    """

    if test_type == 'PG':  # load trained parameters

        pg_learner = pg_network.PGLearner(pa)

        net_handle = open(pg_resume, 'rb')
        net_params = cPickle.load(net_handle)
        pg_learner.set_net_params(net_params)

    # Q network
    elif test_type == 'Q':
        assert (q_agent is not None)

    env.reset()
    rews = []

    ob = env.observe()

    for _ in xrange(episode_max_length):

        if test_type == 'PG':
            a = pg_learner.choose_action(ob)

        elif test_type == 'Random':
            a = other_agents.get_random_action(env.machine)

        elif test_type == 'LLQ':
            a = other_agents.get_llq_action(env.machine)

        elif test_type == 'Q':
            state = np.array(list(np.array(ob).flat))
            a = q_agent.greedy_policy(state[np.newaxis, :])

        ob, rew, done, info = env.step(a, repeat=True)
        rews.append(rew)

        if done: break
        if render: env.render()
        # env.render()

    return np.array(rews), info
Example #3
def launch(pa, pg_resume=None, render=False, repre='image', end='no_new_job'):

    # ----------------------------
    print("Preparing for workers...")
    # ----------------------------

    pg_learners = []
    envs = []

    nw_len_seqs, nw_size_seqs = job_distribution.generate_sequence_work(
        pa, seed=42)

    ### create one environment for each of the num_ex job sets/sequences
    for ex in xrange(pa.num_ex):

        print "-prepare for env-", ex

        env = environment.Env(pa,
                              nw_len_seqs=nw_len_seqs,
                              nw_size_seqs=nw_size_seqs,
                              render=False,
                              repre=repre,
                              end=end)
        env.seq_no = ex
        envs.append(env)

    ### generate a sequence of NNs for the batch, each of which is a policy gradient agent
    for ex in xrange(pa.batch_size +
                     1):  # last worker for updating the parameters

        print "-prepare for worker-", ex

        pg_learner = pg_network.PGLearner(pa)

        if pg_resume is not None:
            net_handle = open(pg_resume, 'rb')
            net_params = cPickle.load(net_handle)
            pg_learner.set_net_params(net_params)

        pg_learners.append(pg_learner)

    accums = init_accums(pg_learners[pa.batch_size])

    # --------------------------------------
    print("Preparing for reference data...")
    # --------------------------------------

    ref_discount_rews, ref_slow_down = slow_down_cdf.launch(pa,
                                                            pg_resume=None,
                                                            render=False,
                                                            plot=False,
                                                            repre=repre,
                                                            end=end)
    mean_rew_lr_curve = []
    max_rew_lr_curve = []
    slow_down_lr_curve = []

    # --------------------------------------
    print("Start training...")
    # --------------------------------------

    timer_start = time.time()

    for iteration in xrange(1, pa.num_epochs):

        ### use a worker process for each example; use a Manager to share results across processes
        ps = []  # worker processes
        manager = Manager()  # managing return results
        manager_result = manager.list([])

        ex_indices = range(pa.num_ex)
        np.random.shuffle(ex_indices)

        all_eprews = []
        grads_all = []
        loss_all = []
        eprews = []
        eplens = []
        all_slowdown = []
        all_entropy = []

        ex_counter = 0

        ### for each jobset
        for ex in xrange(pa.num_ex):

            ex_idx = ex_indices[ex]
            ### collect several trajectories for this job set with one of the PG agents
            p = Process(target=get_traj_worker,
                        args=(
                            pg_learners[ex_counter],
                            envs[ex_idx],
                            pa,
                            manager_result,
                        ))
            ps.append(p)

            ex_counter += 1

            ##

            if ex_counter >= pa.batch_size or ex == pa.num_ex - 1:

                print ex, "out of", pa.num_ex

                ex_counter = 0

                for p in ps:
                    p.start()

                for p in ps:
                    p.join()

                result = []  # convert list from shared memory
                for r in manager_result:
                    result.append(r)

                ps = []
                manager_result = manager.list([])

                all_ob = concatenate_all_ob_across_examples(
                    [r["all_ob"] for r in result], pa)
                all_action = np.concatenate([r["all_action"] for r in result])
                all_adv = np.concatenate([r["all_adv"] for r in result])

                # Do the policy gradient update step using the extra (last) learner;
                # put the new parameters in this last 'worker', then propagate the update at the end
                grads = pg_learners[pa.batch_size].get_grad(
                    all_ob, all_action, all_adv)

                grads_all.append(grads)

                all_eprews.extend([r["all_eprews"] for r in result])

                eprews.extend(np.concatenate([r["all_eprews"] for r in result
                                              ]))  # episode total rewards
                eplens.extend(np.concatenate([r["all_eplens"] for r in result
                                              ]))  # episode lengths

                all_slowdown.extend(
                    np.concatenate([r["all_slowdown"] for r in result]))
                all_entropy.extend(
                    np.concatenate([r["all_entropy"] for r in result]))

        # assemble gradients
        grads = grads_all[0]
        for i in xrange(1, len(grads_all)):
            for j in xrange(len(grads)):
                grads[j] += grads_all[i][j]

        # propagate network parameters to others
        params = pg_learners[pa.batch_size].get_params()

        rmsprop_updates_outside(grads, params, accums, pa.lr_rate, pa.rms_rho,
                                pa.rms_eps)

        for i in xrange(pa.batch_size + 1):
            pg_learners[i].set_net_params(params)

        timer_end = time.time()

        print "-----------------"
        print "Iteration: \t %i" % iteration
        print "NumTrajs: \t %i" % len(eprews)
        print "NumTimesteps: \t %i" % np.sum(eplens)
        # print "Loss:     \t %s" % np.mean(loss_all)
        print "MaxRew: \t %s" % np.average([np.max(rew) for rew in all_eprews])
        print "MeanRew: \t %s +- %s" % (np.mean(eprews), np.std(eprews))
        print "MeanSlowdown: \t %s" % np.mean(all_slowdown)
        print "MeanLen: \t %s +- %s" % (np.mean(eplens), np.std(eplens))
        print "MeanEntropy \t %s" % (np.mean(all_entropy))
        print "Elapsed time\t %s" % (timer_end - timer_start), "seconds"
        print "-----------------"

        timer_start = time.time()

        max_rew_lr_curve.append(np.average([np.max(rew)
                                            for rew in all_eprews]))
        mean_rew_lr_curve.append(np.mean(eprews))
        slow_down_lr_curve.append(np.mean(all_slowdown))

        if iteration % pa.output_freq == 0:
            param_file = open(
                pa.output_filename + '_' + str(iteration) + '.pkl', 'wb')
            cPickle.dump(pg_learners[pa.batch_size].get_params(), param_file,
                         -1)
            param_file.close()

            pa.unseen = True
            slow_down_cdf.launch(pa,
                                 pa.output_filename + '_' + str(iteration) +
                                 '.pkl',
                                 render=False,
                                 plot=True,
                                 repre=repre,
                                 end=end)
            pa.unseen = False
            # test on unseen examples

            plot_lr_curve(pa.output_filename, max_rew_lr_curve,
                          mean_rew_lr_curve, slow_down_lr_curve,
                          ref_discount_rews, ref_slow_down)
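The rmsprop_updates_outside helper is not shown in this listing. Below is a minimal sketch of a standard RMSProp step applied outside the network; the in-place update of params and accums (lists of numpy arrays) and the sign of the step are assumptions based on how the call is used above.

import numpy as np

def rmsprop_updates_outside(grads, params, accums, lr, rho, eps):
    # Keep a running average of squared gradients and scale each step by its root.
    # Flip the sign of the step if get_grad returns the gradient of an objective to maximize.
    for g, p, a in zip(grads, params, accums):
        a *= rho
        a += (1.0 - rho) * g * g
        p -= lr * g / np.sqrt(a + eps)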
Example #4
def launch(pa, pg_resume=None, render=False, repre='image', end='no_new_job'):

    env = environment.Env(pa, render=False, repre=repre, end=end)

    pg_learner = pg_network.PGLearner(pa)

    if pg_resume is not None:
        net_handle = open(pg_resume, 'rb')
        net_params = cPickle.load(net_handle)
        pg_learner.set_net_params(net_params)

    if pa.evaluate_policy_name == "SJF":
        evaluate_policy = other_agents.get_sjf_action
    elif pa.evaluate_policy_name == "PACKER":
        evaluate_policy = other_agents.get_packer_action
    else:
        print("Panic: no policy known to evaluate.")
        exit(1)

    # ----------------------------
    print("Preparing for data...")
    # ----------------------------

    nw_len_seqs, nw_size_seqs = job_distribution.generate_sequence_work(
        pa, seed=42)

    # print 'nw_time_seqs=', nw_len_seqs
    # print 'nw_size_seqs=', nw_size_seqs

    mem_alloc = 4

    X = np.zeros([
        pa.simu_len * pa.num_ex * mem_alloc, 1, pa.network_input_height,
        pa.network_input_width
    ],
                 dtype=theano.config.floatX)
    y = np.zeros(pa.simu_len * pa.num_ex * mem_alloc, dtype='int32')

    print 'network_input_height=', pa.network_input_height
    print 'network_input_width=', pa.network_input_width

    counter = 0

    for train_ex in range(pa.num_ex):

        env.reset()

        for _ in xrange(pa.episode_max_length):

            # ---- get current state ----
            ob = env.observe()

            a = evaluate_policy(env.machine, env.job_slot)

            if counter < pa.simu_len * pa.num_ex * mem_alloc:

                add_sample(X, y, counter, ob, a)
                counter += 1

            ob, rew, done, info = env.step(a, repeat=True)

            if done:  # hit void action, exit
                break

        # roll to next example
        env.seq_no = (env.seq_no + 1) % env.pa.num_ex

    num_train = int(0.8 * counter)
    num_test = int(0.2 * counter)

    X_train, X_test = X[:num_train], X[num_train:num_train + num_test]
    y_train, y_test = y[:num_train], y[num_train:num_train + num_test]

    # Normalization, make sure nothing becomes NaN

    # X_mean = np.average(X[:num_train + num_test], axis=0)
    # X_std = np.std(X[:num_train + num_test], axis=0)
    #
    # X_train = (X_train - X_mean) / X_std
    # X_test = (X_test - X_mean) / X_std

    # ----------------------------
    print("Start training...")
    # ----------------------------

    for epoch in xrange(pa.num_epochs):

        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_acc = 0
        train_batches = 0
        start_time = time.time()
        for batch in iterate_minibatches(X_train,
                                         y_train,
                                         pa.batch_size,
                                         shuffle=True):
            inputs, targets = batch
            err, prob_act = pg_learner.su_train(inputs, targets)
            pg_act = np.argmax(prob_act, axis=1)
            train_err += err
            train_acc += np.sum(pg_act == targets)
            train_batches += 1

        # # And a full pass over the test data:
        test_err = 0
        test_acc = 0
        test_batches = 0
        for batch in iterate_minibatches(X_test,
                                         y_test,
                                         pa.batch_size,
                                         shuffle=False):
            inputs, targets = batch
            err, prob_act = pg_learner.su_test(inputs, targets)
            pg_act = np.argmax(prob_act, axis=1)
            test_err += err
            test_acc += np.sum(pg_act == targets)
            test_batches += 1

        # Then we print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(epoch + 1, pa.num_epochs,
                                                   time.time() - start_time))
        print("  training loss:    \t\t{:.6f}".format(train_err /
                                                      train_batches))
        print("  training accuracy:\t\t{:.2f} %".format(
            train_acc / float(num_train) * 100))
        print("  test loss:        \t\t{:.6f}".format(test_err / test_batches))
        print("  test accuracy:    \t\t{:.2f} %".format(test_acc /
                                                        float(num_test) * 100))

        sys.stdout.flush()

        if epoch % pa.output_freq == 0:

            net_file = open(
                pa.output_filename + '_net_file_' + str(epoch) + '.pkl', 'wb')
            cPickle.dump(pg_learner.return_net_params(), net_file, -1)
            net_file.close()

    print("done")
Example #5
def launch(pa, pg_resume=None, render=False, repre='image', end='no_new_job'):

    env = environment.Env(pa, render=render, repre=repre, end=end)

    pg_learner = pg_network.PGLearner(pa)

    if pg_resume is not None:
        net_handle = open(pg_resume, 'rb')
        net_params = cPickle.load(net_handle)
        pg_learner.set_net_params(net_params)

    # ----------------------------
    print("Preparing for data...")
    # ----------------------------

    ref_discount_rews, ref_slow_down = slow_down_cdf.launch(pa,
                                                            pg_resume=None,
                                                            render=False,
                                                            plot=False,
                                                            repre=repre,
                                                            end=end)

    mean_rew_lr_curve = []
    max_rew_lr_curve = []
    slow_down_lr_curve = []

    timer_start = time.time()

    for iteration in range(pa.num_epochs):

        all_ob = []
        all_action = []
        all_adv = []
        all_eprews = []
        all_eplens = []
        all_slowdown = []
        all_entropy = []

        # go through all examples
        for ex in range(pa.num_ex):

            # Collect num_seq_per_batch trajectories for this example
            trajs = []

            for i in range(pa.num_seq_per_batch):
                traj = get_traj(pg_learner, env, pa.episode_max_length)
                trajs.append(traj)

            # roll to next example
            env.seq_no = (env.seq_no + 1) % env.pa.num_ex

            all_ob.append(concatenate_all_ob(trajs, pa))

            # Compute discounted sums of rewards
            rets = [discount(traj["reward"], pa.discount) for traj in trajs]
            maxlen = max(len(ret) for ret in rets)
            padded_rets = [
                np.concatenate([ret, np.zeros(maxlen - len(ret))])
                for ret in rets
            ]

            # Compute time-dependent baseline
            baseline = np.mean(padded_rets, axis=0)

            # Compute advantage function
            advs = [ret - baseline[:len(ret)] for ret in rets]
            all_action.append(
                np.concatenate([traj["action"] for traj in trajs]))
            all_adv.append(np.concatenate(advs))

            all_eprews.append(
                np.array([
                    discount(traj["reward"], pa.discount)[0] for traj in trajs
                ]))  # episode total rewards
            all_eplens.append(np.array([len(traj["reward"])
                                        for traj in trajs]))  # episode lengths

            # All Job Stat
            enter_time, finish_time, job_len = process_all_info(trajs)
            finished_idx = (finish_time >= 0)
            all_slowdown.append(
                (finish_time[finished_idx] - enter_time[finished_idx]) /
                job_len[finished_idx])

            # Action prob entropy
            all_entropy.append(np.concatenate([traj["entropy"] for traj in trajs]))

        all_ob = concatenate_all_ob_across_examples(all_ob, pa)
        all_action = np.concatenate(all_action)
        all_adv = np.concatenate(all_adv)

        # Do policy gradient update step
        loss = pg_learner.train(all_ob, all_action, all_adv)
        eprews = np.concatenate(all_eprews)  # episode total rewards
        eplens = np.concatenate(all_eplens)  # episode lengths

        all_slowdown = np.concatenate(all_slowdown)

        all_entropy = np.concatenate(all_entropy)

        timer_end = time.time()

        print "-----------------"
        print "Iteration: \t %i" % iteration
        print "NumTrajs: \t %i" % len(eprews)
        print "NumTimesteps: \t %i" % np.sum(eplens)
        print "Loss:     \t %s" % loss
        print "MaxRew: \t %s" % np.average([np.max(rew) for rew in all_eprews])
        print "MeanRew: \t %s +- %s" % (eprews.mean(), eprews.std())
        print "MeanSlowdown: \t %s" % np.mean(all_slowdown)
        print "MeanLen: \t %s +- %s" % (eplens.mean(), eplens.std())
        print "MeanEntropy \t %s" % (np.mean(all_entropy))
        print "Elapsed time\t %s" % (timer_end - timer_start), "seconds"
        print "-----------------"

        timer_start = time.time()

        max_rew_lr_curve.append(np.average([np.max(rew)
                                            for rew in all_eprews]))
        mean_rew_lr_curve.append(eprews.mean())
        slow_down_lr_curve.append(np.mean(all_slowdown))

        if iteration % pa.output_freq == 0:
            param_file = open(
                pa.output_filename + '_' + str(iteration) + '.pkl', 'wb')
            cPickle.dump(pg_learner.get_params(), param_file, -1)
            param_file.close()

            slow_down_cdf.launch(pa,
                                 pa.output_filename + '_' + str(iteration) +
                                 '.pkl',
                                 render=False,
                                 plot=True,
                                 repre=repre,
                                 end=end)

            plot_lr_curve(pa.output_filename, max_rew_lr_curve,
                          mean_rew_lr_curve, slow_down_lr_curve,
                          ref_discount_rews, ref_slow_down)
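The discount helper used above is not shown; a minimal sketch implementing the standard discounted cumulative sum (so that discount(r, gamma)[0] is the discounted episode return, as used for all_eprews) could look like this:

import numpy as np

def discount(x, gamma):
    # out[t] = x[t] + gamma * x[t+1] + gamma**2 * x[t+2] + ...
    out = np.zeros(len(x))
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + gamma * running
        out[t] = running
    return out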
Example #6
def launch(pa, pg_resume=None, render=False, repre='image', end='no_new_job'):
    task_dist = Task_Dist()
    workloads = task_dist.gen_seq_workload()

    # ----------------------------
    print("Preparing for workers...")
    # ----------------------------
    #logs = open('/home/shanka/logs_packing_deeprm', 'a')
    pg_learners = []
    envs = []
    job_distribution = Dist()
    nw_len_seqs, nw_size_seqs = job_distribution.generate_sequence_work(
        pa, seed=42)

    for ex in range(pa.num_ex):

        print("-prepare for env-", ex)

        env = environment.Env(pa,
                              nw_len_seqs=nw_len_seqs,
                              nw_size_seqs=nw_size_seqs,
                              render=False,
                              repre=repre,
                              end=end)
        env.seq_no = ex
        envs.append(env)

    for ex in range(pa.batch_size +
                    1):  # last worker for updating the parameters

        print("-prepare for worker-", ex)

        pg_learner = pg_network.PGLearner(pa)

        if pg_resume is not None:
            net_handle = open(pg_resume, 'rb')
            net_params = pickle.load(net_handle)
            pg_learner.set_net_params(net_params)

        pg_learners.append(pg_learner)

    accums = init_accums(pg_learners[pa.batch_size])

    # --------------------------------------
    # print("Preparing for reference data...")
    # --------------------------------------
    # print('Start testing...')

    # for ite in range(10,1000,10):
    # 	pg_resume = pa.output_filename +'_'+str(ite)+'.pkl'

    # 	logline=test(ite,pa, pg_resume,workloads,repre)

    # 	logs.write(logline)
    # 	logs.flush()

    # 	os.fsync(logs.fileno())
    # return

    # ref_discount_rews, ref_slow_down = slow_down_cdf.launch(pa, pg_resume=None, render=False, plot=False, repre=repre, end=end)
    # mean_rew_lr_curve = []
    # max_rew_lr_curve = []
    # slow_down_lr_curve = []

    # --------------------------------------
    print("Start training...")
    # --------------------------------------

    timer_start = time.time()

    for iteration in range(1, pa.num_epochs):

        ps = []  # worker processes
        manager = Manager()  # managing return results
        manager_result = manager.list([])

        ex_indices = range(pa.num_ex)
        #    np.random.shuffle(ex_indices)

        all_eprews = []
        grads_all = []
        loss_all = []
        eprews = []
        eplens = []
        all_slowdown = []
        all_entropy = []

        ex_counter = 0
        for ex in range(pa.num_ex):

            ex_idx = ex_indices[ex]
            p = Process(target=get_traj_worker,
                        args=(
                            pg_learners[ex_counter],
                            envs[ex_idx],
                            pa,
                            manager_result,
                        ))
            ps.append(p)

            ex_counter += 1

            if ex_counter >= pa.batch_size or ex == pa.num_ex - 1:

                print(ex, "out of", pa.num_ex)

                ex_counter = 0

                for p in ps:
                    p.start()

                for p in ps:
                    p.join()

                result = []  # convert list from shared memory
                for r in manager_result:
                    result.append(r)

                ps = []
                manager_result = manager.list([])

                all_ob = concatenate_all_ob_across_examples(
                    [r["all_ob"] for r in result], pa)
                all_action = np.concatenate([r["all_action"] for r in result])
                all_adv = np.concatenate([r["all_adv"] for r in result])

                # Do the policy gradient update step using the extra (last) learner;
                # put the new parameters in this last 'worker', then propagate the update at the end
                grads = pg_learners[pa.batch_size].get_grad(
                    all_ob, all_action, all_adv)

                grads_all.append(grads)

                all_eprews.extend([r["all_eprews"] for r in result])

                eprews.extend(np.concatenate([r["all_eprews"] for r in result
                                              ]))  # episode total rewards
                eplens.extend(np.concatenate([r["all_eplens"] for r in result
                                              ]))  # episode lengths

        #        all_slowdown.extend(np.concatenate([r["all_slowdown"] for r in result]))
        #        all_entropy.extend(np.concatenate([r["all_entropy"] for r in result]))

        # assemble gradients
        grads = grads_all[0]
        for i in range(1, len(grads_all)):
            for j in range(len(grads)):
                grads[j] += grads_all[i][j]

        # propagate network parameters to others
        params = pg_learners[pa.batch_size].get_params()

        rmsprop_updates_outside(grads, params, accums, pa.lr_rate, pa.rms_rho,
                                pa.rms_eps)

        for i in range(pa.batch_size + 1):
            pg_learners[i].set_net_params(params)

        timer_end = time.time()

        print("-----------------")
        print("Iteration: \t %i" % iteration)
        print("NumTrajs: \t %i" % len(eprews))
        print("NumTimesteps: \t %i" % np.sum(eplens))
        #  print "Loss:     \t %s" % np.mean(loss_all)
        print("MaxRew: \t %s" % np.average([np.max(rew)
                                            for rew in all_eprews]))
        print("MeanRew: \t %s +- %s" % (np.mean(eprews), np.std(eprews)))
        # print "MeanSlowdown: \t %s" % np.mean(all_slowdown)
        print("MeanLen: \t %s +- %s" % (np.mean(eplens), np.std(eplens)))
        #     print "MeanEntropy \t %s" % (np.mean(all_entropy))
        print("Elapsed time\t %s" % (timer_end - timer_start), "seconds")
        print("-----------------")

        #    max_rew_lr_curve.append(np.average([np.max(rew) for rew in all_eprews]))
        #    mean_rew_lr_curve.append(np.mean(eprews))
        #    slow_down_lr_curve.append(np.mean(all_slowdown))

        if iteration % pa.output_freq == 0:
            pg_resume = pa.output_filename + '_' + str(iteration) + '.pkl'
            param_file = open(pg_resume, 'wb')
            pickle.dump(pg_learner.get_params(), param_file, -1)
            param_file.close()
            test(pa, pg_resume, workloads, repre)
Example #7
def test(it, pa, pg_resume, workloads, episode_max_length=200):
    repre = 'image'
    end = 'all_done'
    agent = Heuristic_Agents()
    pg_learner = pg_network.PGLearner(pa)
    if pg_resume is not None:
        net_handle = open(pg_resume, 'rb')
        net_params = pickle.load(net_handle)
        pg_learner.set_net_params(net_params)

    new_env = Env1(0, 1)
    job_distribution = Dist()
    nw_len_seqs, nw_size_seqs = job_distribution.generate_sequence_work(
        pa, seed=42)
    logline = str(it) + '\n'
    for ex in range(pa.num_test_ex):
        env = environment.Env(pa,
                              nw_len_seqs=nw_len_seqs,
                              nw_size_seqs=nw_size_seqs,
                              render=False,
                              repre=repre,
                              end=end)
        env.seq_no = ex + pa.num_ex
        new_env.reset()
        new_env.workload_seq = workloads[ex + pa.num_ex]

        new_env.generate_workload()

        print('Testing : ', new_env.workload_seq)
        env.reset()

        obs = []
        new_obs = []
        acts = []
        new_acts = []
        rews = []
        utils = ''
        suffer = []
        new_rews = []
        entropy = []
        finished_episode_len = 0
        crs = [0] * pa.num_machines
        crs_max = [0] * pa.num_machines
        info = []
        new_ob = new_env.observe()
        ob = env.observe()
        counter = 0
        for _ in range(200):

            act_prob = pg_learner.get_one_act_prob(ob)
            a = np.argmax(act_prob)

            act = agent.get_action(new_env, a)

            new_obs.append(new_ob)
            new_acts.append(act)
            new_ob, new_rews, done1, info1 = new_env.step(act, _, new_rews, a)
            #a = (csprob_n > np.random.rand()).argmax()
            #np.set_printoptions(linewidth=40*5, precision = 2, threshold=np.nan)
            #    print('State>>',ob)
            #    print('Action>>',a)
            obs.append(ob)  # store the ob at current decision making step
            acts.append(a)

            ob, rew, done, info = env.step(a, repeat=True)
            counter += 1
            if info == 'Allocation_Success':
                finished_episode_len = _ + 1
        #    print('Reward>>',rew)
            rews.append(rew)
            entropy.append(get_entropy(act_prob))

            if done1: break

            util = ''
            for k, machine in enumerate(new_env.machines):
                if len(machine.running_tasks) > 0:
                    if machine.cpus_left >= 0:
                        util += str(machine.total_cpus -
                                    machine.cpus_left) + ','
                    else:
                        util += str(machine.total_cpus) + ','
                        suffer.append(abs(machine.cpus_left))
                else:
                    util += str(0) + ','

                crs_this_time = [0] * pa.num_machines
                for i in range(len(machine.running_tasks)):
                    for j in range(i + 1, len(machine.running_tasks)):
                        task_i, task_j = machine.running_tasks[
                            i], machine.running_tasks[j]
                        if task_i != task_j:
                            crs[k] += pa.interference_penalty * (
                                task_i.cpu_util[-1] *
                                task_j.cpu_util[-1]) * (-1)
                            crs_this_time[k] += pa.interference_penalty * (
                                task_i.cpu_util[-1] *
                                task_j.cpu_util[-1]) * (-1)
                crs_max[k] = max(crs_max[k], crs_this_time[k])
                #################
            utils += util + '|'

        logline += str(
            str(counter - 1) +
            '|' + str(utils) + str(finished_episode_len)) + '\n' + str(
                sum(new_rews)) + '\n' + str(sum(suffer)) + '\n'
        for i in range(len(new_env.machines)):
            logline += str(crs[i]) + ','
        logline = logline[:-1] + '\n'
        for i in range(len(new_env.machines)):
            logline += str(crs_max[i]) + ','
        logline = logline[:-1]
        logline += '\n'

        print('Iteration number ', it)
        print('Example No:,', ex)
        print('Test Actions : ', new_acts)

        print('Reward : ', new_rews)
        print('Total reward : ', sum(new_rews))
    return logline
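The get_entropy helper used in the loop above is not defined in this listing; a minimal sketch, assuming act_prob is a probability vector over actions:

import numpy as np

def get_entropy(act_prob):
    # Shannon entropy of the action distribution; the small constant guards against log(0).
    p = np.asarray(act_prob, dtype=float).ravel()
    return -np.sum(p * np.log(p + 1e-12))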
Example #8
def launch(pa, pg_resume=None, render=True, repre='image', end='no_new_job'):

    f = open('log/re_log_' + datetime.now().strftime('%Y-%m-%d_%H:%M:%S'), 'a')

    env = environment.Env(pa, render=render, repre=repre, end=end)

    pg_learner = pg_network.PGLearner(pa)

    startIdx = 0
    if pg_resume is not None:  # and 're' in pg_resume:
        net_handle = open(pg_resume, 'rb')
        net_params = cPickle.load(net_handle)
        pg_learner.set_net_params(net_params)
        tmp = re.match(r'.+?(\d+).+', pg_resume)
        startIdx = int(tmp.group(1))

    # ----------------------------
    print("\nPreparing for data...")
    # ----------------------------

    ref_discount_rews, ref_slow_down = slow_down_cdf.launch(pa,
                                                            pg_resume=None,
                                                            render=True,
                                                            plot=False,
                                                            repre=repre,
                                                            end=end)

    mean_rew_lr_curve = []
    max_rew_lr_curve = []
    slow_down_lr_curve = []

    timer_start = time.time()

    print("\nStart reinforcement learning...")

    for iteration in xrange(startIdx, pa.num_epochs):

        all_ob = []
        all_action = []
        all_adv = []
        all_eprews = []
        all_eplens = []
        all_slowdown = []
        all_entropy = []

        # go through all examples
        for ex in xrange(pa.num_ex):

            # Collect num_seq_per_batch trajectories for this example
            trajs = []

            for i in xrange(pa.num_seq_per_batch):
                traj = get_traj(pg_learner, env, pa.episode_max_length)
                trajs.append(traj)

            # roll to next example
            env.seq_no = (env.seq_no + 1) % env.pa.num_ex

            all_ob.append(concatenate_all_ob(trajs, pa))

            # Compute discounted sums of rewards
            rets = [discount(traj["reward"], pa.discount) for traj in trajs]
            maxlen = max(len(ret) for ret in rets)
            padded_rets = [
                np.concatenate([ret, np.zeros(maxlen - len(ret))])
                for ret in rets
            ]

            # Compute time-dependent baseline
            baseline = np.mean(padded_rets, axis=0)

            # Compute advantage function
            advs = [ret - baseline[:len(ret)] for ret in rets]
            all_action.append(
                np.concatenate([traj["action"] for traj in trajs]))
            all_adv.append(np.concatenate(advs))

            all_eprews.append(
                np.array([
                    discount(traj["reward"], pa.discount)[0] for traj in trajs
                ]))  # episode total rewards
            all_eplens.append(np.array([len(traj["reward"])
                                        for traj in trajs]))  # episode lengths

            # All Job Stat
            enter_time, finish_time, job_len = process_all_info(trajs)
            finished_idx = (finish_time >= 0)
            all_slowdown.append(
                (finish_time[finished_idx] - enter_time[finished_idx]) /
                job_len[finished_idx])

            # Action prob entropy
            all_entropy.append(np.concatenate([traj["entropy"] for traj in trajs]))

        all_ob = concatenate_all_ob_across_examples(all_ob, pa)
        all_action = np.concatenate(all_action)
        all_adv = np.concatenate(all_adv)

        # Do policy gradient update step
        loss = pg_learner.train(all_ob, all_action, all_adv)
        eprews = np.concatenate(all_eprews)  # episode total rewards
        eplens = np.concatenate(all_eplens)  # episode lengths

        all_slowdown = np.concatenate(all_slowdown)

        all_entropy = np.concatenate(all_entropy)

        timer_end = time.time()

        print "-----------------"
        print "Iteration: \t %i" % iteration
        print "NumTrajs: \t %i" % len(eprews)
        print "NumTimesteps: \t %i" % np.sum(eplens)
        print "Loss:     \t %s" % loss
        print "MaxRew: \t %s" % np.average([np.max(rew) for rew in all_eprews])
        print "MeanRew: \t %s +- %s" % (eprews.mean(), eprews.std())
        print "MeanSlowdown: \t %s" % np.mean(all_slowdown)
        print "MeanLen: \t %s +- %s" % (eplens.mean(), eplens.std())
        print "MeanEntropy \t %s" % (np.mean(all_entropy))
        print "Elapsed time\t %s" % (timer_end - timer_start), "seconds"
        print "-----------------"

        f.write("-----------------\n")
        f.write("Iteration: \t %i\n" % (iteration))
        f.write("NumTrajs: \t %i\n" % (len(eprews)))
        f.write("NumTimesteps: \t %i\n" % (np.sum(eplens)))
        f.write("Loss:     \t %s\n".format(loss))
        f.write("MaxRew: \t %s\n" %
                (np.average([np.max(rew) for rew in all_eprews])))
        f.write("MeanRew: \t %s +- %s\n" % (np.mean(eprews), np.std(eprews)))
        f.write("MeanSlowdown: \t %s\n" % (np.mean(all_slowdown)))
        f.write("MeanLen: \t %s +- %s\n" % (np.mean(eplens), np.std(eplens)))
        f.write("MeanEntropy \t %s\n" % ((np.mean(all_entropy))))
        f.write("Elapsed time\t %s seconds\n" % ((timer_end - timer_start)))
        f.write("-----------------\n")
        f.flush()  # keep the log open; it is written again in later iterations

        timer_start = time.time()

        max_rew_lr_curve.append(np.average([np.max(rew)
                                            for rew in all_eprews]))
        mean_rew_lr_curve.append(eprews.mean())
        slow_down_lr_curve.append(np.mean(all_slowdown))

        if iteration % pa.output_freq == 0:
            param_file = open(
                pa.output_filename + '_' + str(iteration) + '.pkl', 'wb')
            cPickle.dump(pg_learner.get_params(), param_file, -1)
            param_file.close()

            # added by wjchen, to record accuracy and rewards
            sample_file = h5py.File('log/re_record_iter'+str(len(slow_down_lr_curve))\
                                    + datetime.now().strftime('%Y-%m-%d_%H:%M')+'.h5', 'w+')
            sample_file.create_dataset('max_rew_lr_curve',
                                       data=max_rew_lr_curve)
            sample_file.create_dataset('mean_rew_lr_curve',
                                       data=mean_rew_lr_curve)
            sample_file.create_dataset('slow_down_lr_curve',
                                       data=slow_down_lr_curve)
            #
            ref_dr = sample_file.create_group('ref_discount_rews')
            for k, v in ref_discount_rews.items():
                ref_dr[k] = np.average(v)
            #
            ref_sd = sample_file.create_group('ref_slow_down')
            for k, v in ref_slow_down.items():
                ref_sd[k] = np.average(np.concatenate(v))
            sample_file.close()

            slow_down_cdf.launch(pa,
                                 pa.output_filename + '_' + str(iteration) +
                                 '.pkl',
                                 render=True,
                                 plot=True,
                                 repre=repre,
                                 end=end)

            plot_lr_curve(pa.output_filename, max_rew_lr_curve,
                          mean_rew_lr_curve, slow_down_lr_curve,
                          ref_discount_rews, ref_slow_down)
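For reference, a minimal sketch of reading back one of the .h5 records written above (the file name is a placeholder for the timestamped file actually created):

import h5py
import numpy as np

record_path = 'log/re_record_iter...h5'  # placeholder: substitute the actual timestamped file name
with h5py.File(record_path, 'r') as f:
    slow_down_lr_curve = np.array(f['slow_down_lr_curve'])
    ref_slow_down = {k: float(np.array(v)) for k, v in f['ref_slow_down'].items()}
print(slow_down_lr_curve[-5:], ref_slow_down)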
Example #9
def launch(pa, pg_resume=None, render=False, repre='image', end='no_new_job'):

    # ----------------------------
    print("Preparing for workers...")
    # ----------------------------

    pg_learners = []
    envs = []
    plot_maker = plot.PlotMaker(pa)

    for ex in range(pa.num_ex):
        env = environment.Env(pa, render=False, repre=repre, end=end)
        env.seq_no = ex
        envs.append(env)

        # for ex in range(pa.batch_size + 1):  # last worker for updating the parameters
        #
        #     print "-prepare for worker-", ex
        #
        pg_learner = pg_network.PGLearner(pa)
        #
        #     if pg_resume is not None:
        #         net_handle = open(pg_resume, 'rb')
        #         net_params = cPickle.load(net_handle)
        #         pg_learner.set_net_params(net_params)
        #
        pg_learners.append(pg_learner)

    # accums = init_accums(pg_learners[pa.batch_size])

    # --------------------------------------
    print("Preparing for reference data...")
    # --------------------------------------

    # ref_discount_rews, ref_slow_down = slow_down_cdf.launch(pa, pg_resume=None, render=False, plot=False, repre=repre, end=end)
    # mean_rew_lr_curve = []
    # max_rew_lr_curve = []
    # slow_down_lr_curve = []

    # --------------------------------------
    print("Start training...")
    # --------------------------------------

    for iteration in range(1, pa.num_epochs):
        timer_start = time.time()
        with open('somefile.txt', 'a') as the_file:
            the_file.write("----------------Iteration %d----------------\n" %
                           iteration)
        ps = []  # worker processes
        manager = Manager()  # managing return results
        manager_result = manager.list([])

        ex_indices = np.arange(pa.num_ex)
        np.random.shuffle(ex_indices)

        all_eprews = []
        grads_all = []
        loss_all = []
        eprews = []
        eplens = []
        all_slowdown = []
        all_entropy = []

        ex_counter = 0
        for ex in range(pa.num_ex):
            # print(ex)
            ex_idx = ex_indices[ex]
            # p = Process(target=get_traj,
            #             args=(pg_learners[ex_counter], envs[ex_idx], pa.episode_max_length, manager_result,))
            #
            # ps.append(p)
            # ex_counter += 1
            #
            # if ex_counter >= pa.batch_size or ex == pa.num_ex - 1:
            #
            #     ex_counter = 0
            #     for p in ps:
            #         p.start()
            #
            #     for p in ps:
            #         p.join()

            result = []  # convert list from shared memory

            ps = []
            result = []
            get_traj_worker(
                pg_learners[ex_counter],
                envs[ex_idx],
                pa,
                manager_result,
            )
            for r in manager_result:
                result.append(r)
            # print(len(result))
            manager_result = manager.list([])
            # print("ok")
            all_ob = concatenate_all_ob_across_examples(
                [r["all_ob"] for r in result], pa)
            all_action = np.concatenate([r["all_action"] for r in result])
            all_machine = np.concatenate([r["all_machine"] for r in result])
            all_adv = np.concatenate([r["all_adv"] for r in result])
            pg_learners[0].fit(all_ob, all_action, all_machine, all_adv)

            all_eprews.extend([r["all_eprews"] for r in result])
            # print(all_eprews)
            eprews.extend(np.concatenate([r["all_eprews"] for r in result
                                          ]))  # episode total rewards
            eplens.extend(np.concatenate([r["all_eplens"]
                                          for r in result]))  # episode lengths
            all_slowdown.extend(
                np.concatenate([r["all_slowdown"] for r in result]))
            all_entropy.extend(
                np.concatenate([r["all_entropy"] for r in result]))
            #train the first agent

            timer_end = time.time()
            # print(len(all_slowdown))
            # print(all_slowdown)
            # MARK: changed
            slowdown_all_in_one = np.concatenate(all_slowdown)
            print(slowdown_all_in_one.shape)
            print("-----------------")
            print("Iteration: \t %i" % iteration)
            print("NumTrajs: \t %i" % len(eprews))
            print("NumTimesteps: \t %i" % np.sum(eplens))
            print("Loss:     \t %s" % np.mean(loss_all))
            print("MaxRew: \t %s" %
                  np.average([np.max(rew) for rew in all_eprews]))
            print("MeanRew: \t %s +- %s" % (np.mean(eprews), np.std(eprews)))
            print("MeanSlowdown: \t %s" % np.mean(slowdown_all_in_one))
            print("MeanLen: \t %s +- %s" % (np.mean(eplens), np.std(eplens)))
            print("MeanEntropy \t %s" % (np.mean(all_entropy)))
            print("Elapsed time\t %s" % (timer_end - timer_start), "seconds")
            print("-----------------")

            plot_maker.slow_down_records.append(all_slowdown)
            with open('somefile.txt', 'a') as the_file:

                the_file.write("MeanRew: \t %s +- %s\n" %
                               (np.mean(eprews), np.std(eprews)))
                the_file.write("MeanSlowdown: \t %s\n-----------------\n\n" %
                               np.mean(slowdown_all_in_one))

        # TODO: set parameters for the other agents

        # for i in xrange(pa.batch_size + 1):
        #     pg_learners[i].set_net_params(params)

        timer_end = time.time()

        # print "-----------------"
        # print "Iteration: \t %i" % iteration
        # print "NumTrajs: \t %i" % len(eprews)
        # print "NumTimesteps: \t %i" % np.sum(eplens)
        # # print "Loss:     \t %s" % np.mean(loss_all)
        # print "MaxRew: \t %s" % np.average([np.max(rew) for rew in all_eprews])
        # print "MeanRew: \t %s +- %s" % (np.mean(eprews), np.std(eprews))
        # print "MeanSlowdown: \t %s" % np.mean(all_slowdown)
        # print "MeanLen: \t %s +- %s" % (np.mean(eplens), np.std(eplens))
        # print "MeanEntropy \t %s" % (np.mean(all_entropy))
        # print "Elapsed time\t %s" % (timer_end - timer_start), "seconds"
        # print "-----------------"
        #
        # timer_start = time.time()
        #
        # max_rew_lr_curve.append(np.average([np.max(rew) for rew in all_eprews]))
        # mean_rew_lr_curve.append(np.mean(eprews))
        # slow_down_lr_curve.append(np.mean(all_slowdown))
        #
        # if iteration % pa.output_freq == 0:
        #     param_file = open(pa.output_filename + '_' + str(iteration) + '.pkl', 'wb')
        #     cPickle.dump(pg_learners[pa.batch_size].get_params(), param_file, -1)
        #     param_file.close()
        #
        #     pa.unseen = True
        #     slow_down_cdf.launch(pa, pa.output_filename + '_' + str(iteration) + '.pkl',
        #                          render=False, plot=True, repre=repre, end=end)
        #     pa.unseen = False
        #     # test on unseen examples
        #
        #     plot_lr_curve(pa.output_filename,
        #                   max_rew_lr_curve, mean_rew_lr_curve, slow_down_lr_curve,
        #                   ref_discount_rews, ref_slow_down)
    plot_maker.plot()
Example #10
def main():

    pa = parameters.Parameters()

    type_exp = 'pg_re'  # 'pg_su', 'pg_su_compact', 'v_su', 'pg_v_re', 'pg_re', 'q_re', 'test'

    pg_resume = None
    v_resume = None
    q_resume = None
    log = None

    render = False
    plot = False

    try:
        opts, args = getopt.getopt(sys.argv[1:], "hi:o:", [
            "exp_type=", "num_res=", "num_nw=", "simu_len=", "num_ex=",
            "num_seq_per_batch=", "eps_max_len=", "num_epochs=",
            "time_horizon=", "res_slot=", "max_job_len=", "max_job_size=",
            "new_job_rate=", "dist=", "lr_rate=", "ba_size=", "pg_re=",
            "v_re=", "q_re=", "out_freq=", "ofile=", "log=", "render=",
            "unseen=", "plot="
        ])

    except getopt.GetoptError:
        script_usage()
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            script_usage()
            sys.exit()
        elif opt in ("-e", "--exp_type"):
            type_exp = arg
        elif opt in ("-n", "--num_res"):
            pa.num_resources = int(arg)
        elif opt in ("-w", "--num_nw"):
            pa.num_nw = int(arg)
        elif opt in ("-s", "--simu_len"):
            pa.simu_len = int(arg)
        elif opt in ("-n", "--num_ex"):
            pa.num_ex = int(arg)
        elif opt in ("-sp", "--num_seq_per_batch"):
            pa.num_seq_per_batch = int(arg)
        elif opt in ("-el", "--eps_max_len"):
            pa.episode_max_length = int(arg)
        elif opt in ("-ne", "--num_epochs"):
            pa.num_epochs = int(arg)
        elif opt in ("-t", "--time_horizon"):
            pa.time_horizon = int(arg)
        elif opt in ("-rs", "--res_slot"):
            pa.res_slot = int(arg)
        elif opt in ("-ml", "--max_job_len"):
            pa.max_job_len = int(arg)
        elif opt in ("-ms", "--max_job_size"):
            pa.max_job_size = int(arg)
        elif opt in ("-nr", "--new_job_rate"):
            pa.new_job_rate = float(arg)
        elif opt in ("-d", "--dist"):
            pa.discount = float(arg)
        elif opt in ("-l", "--lr_rate"):
            pa.lr_rate = float(arg)
        elif opt in ("-b", "--ba_size"):
            pa.batch_size = int(arg)
        elif opt in ("-p", "--pg_re"):
            pg_resume = arg
        elif opt in ("-v", "--v_re"):
            v_resume = arg
        elif opt in ("-q", "--q_re"):
            q_resume = arg
        elif opt in ("-f", "--out_freq"):
            pa.output_freq = int(arg)
        elif opt in ("-o", "--ofile"):
            pa.output_filename = arg
        elif opt in ("-lg", "--log"):
            log = arg
        elif opt in ("-r", "--render"):
            render = (arg == 'True')
        elif opt in ("-pl", "--plot"):
            plot = (arg == 'True')
        elif opt in ("-u", "--unseen"):
            pa.generate_unseen = (arg == 'True')
        else:
            script_usage()
            sys.exit()

    if log is not None:
        orig_stdout = sys.stdout
        f = open(log, 'w')
        sys.stdout = f
    if pg_resume is None:
        print("PG resume is empty!")
        sys.exit(1)

    pa.compute_dependent_parameters()
    repre = 'image'
    end = 'all_done'
    env = environment.Env(pa, render=render, repre=repre, end=end)
    pg_learner = pg_network.PGLearner(pa)
    net_handle = open(pg_resume, 'rb')
    net_params = cPickle.load(net_handle)
    pg_learner.set_net_params(net_params)
    outputFileName = pa.output_filename + '_' + \
        ntpath.basename(pg_resume) + '_test.pkl'
    pg_learner.write_net_to_nnet(outputFileName)
    nnetFilename = outputFileName + '.nnet'
    r = nnet.NNet(nnetFilename)
    smallHigherBound = 1.0e-10
    smallLowerBound = -1.0e-10
    for wIdx, w in enumerate(r.weights):
        smallRows = np.all((w <= smallHigherBound) & (w >= smallLowerBound),
                           axis=1)
        smallRowsIndices = np.where(smallRows == True)
        for row in smallRowsIndices[0]:
            rowBias = r.biases[wIdx][row]
            # add its bias to all biases of the next layer, since we assume fully connected layers
            if ((wIdx + 1) < len(r.biases)):
                r.biases[wIdx + 1] = r.biases[wIdx + 1] + rowBias
        # Now we delete the rows corresponding to 'dead' neurons
        r.weights[wIdx] = np.delete(w, smallRowsIndices[0], axis=0)
        if ((wIdx + 1) < len(r.weights)):
            r.weights[wIdx + 1] = np.delete(r.weights[wIdx + 1],
                                            smallRowsIndices[0],
                                            axis=1)
        r.biases[wIdx] = np.delete(r.biases[wIdx], smallRowsIndices[0], axis=0)

    # now we export the file once again after the 'dead' neuron filtration
    for wIdx, w in enumerate(r.weights):
        r.weights[wIdx] = r.weights[wIdx].transpose()
    writeNNet.writeNNet(r.weights, r.biases, r.mins, r.maxes, r.means,
                        r.ranges, outputFileName + '_cleaned_' + '.nnet')
    if log is not None:
        sys.stdout = orig_stdout
        f.close()
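A toy illustration of the dead-neuron pruning above on tiny matrices (a sketch only; it mirrors the loop's behavior, including its approximation of folding a dead row's raw bias into every bias of the next layer):

import numpy as np

weights = [np.array([[0.0, 0.0],    # layer-1 neuron 0: all-zero incoming weights -> 'dead'
                     [1.0, 2.0]]),  # layer-1 neuron 1
           np.array([[3.0, 4.0]])]  # layer 2: columns index layer-1 neurons
biases = [np.array([0.5, 0.1]), np.array([0.2])]

for wIdx, w in enumerate(weights):
    dead = np.where(np.all(np.abs(w) <= 1.0e-10, axis=1))[0]
    for row in dead:
        if wIdx + 1 < len(biases):
            biases[wIdx + 1] = biases[wIdx + 1] + biases[wIdx][row]
    weights[wIdx] = np.delete(w, dead, axis=0)
    if wIdx + 1 < len(weights):
        weights[wIdx + 1] = np.delete(weights[wIdx + 1], dead, axis=1)
    biases[wIdx] = np.delete(biases[wIdx], dead, axis=0)

print(weights)  # layer 1 keeps only neuron 1; layer 2 drops the matching column
print(biases)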
Example #11
def launch(pa, pg_resume=None, render=False, repre='image', end='no_new_job'):

    # ----------------------------
    print("Preparing for workers...")
    # ----------------------------

    pg_learners = []
    envs = []

    nw_len_seqs, nw_size_seqs = job_distribution.generate_sequence_work(
        pa, seed=42)

    for ex in xrange(pa.num_ex):  # number of sequences

        print "-prepare for env-", ex

        env = environment.Env(pa,
                              nw_len_seqs=nw_len_seqs,
                              nw_size_seqs=nw_size_seqs,
                              render=True,
                              repre=repre,
                              end=end)
        env.seq_no = ex
        envs.append(env)

    for ex in xrange(pa.batch_size +
                     1):  # last worker for updating the parameters

        print "-prepare for worker-", ex

        pg_learner = pg_network.PGLearner(pa)

        startIndex = 0
        if pg_resume is not None:
            net_handle = open(pg_resume, 'rb')
            net_params = cPickle.load(net_handle)
            pg_learner.set_net_params(net_params)
            tmp = re.match(r'.+?(\d+).+', pg_resume)
            startIndex = int(tmp.group(1))

        pg_learners.append(pg_learner)

    accums = init_accums(pg_learners[pa.batch_size])

    # --------------------------------------
    print("Preparing for reference data...")
    # --------------------------------------

    # Reference examples, get reference discounted rewards and reference slowdown from random, SJF and Tetris algorithms
    ref_discount_rews, ref_slow_down = slow_down_cdf.launch(pa,
                                                            pg_resume=None,
                                                            render=True,
                                                            plot=False,
                                                            repre=repre,
                                                            end=end)
    mean_rew_lr_curve = []
    max_rew_lr_curve = []
    slow_down_lr_curve = []

    # --------------------------------------
    print("Start training...")
    # --------------------------------------

    timer_start = time.time()

    for iteration in xrange(startIndex, pa.num_epochs):

        ps = []  # worker processes
        manager = Manager()  # managing return results
        manager_result = manager.list([])

        ex_indices = range(pa.num_ex)
        np.random.shuffle(ex_indices)

        all_eprews = []
        grads_all = []
        loss_all = []
        eprews = []
        eplens = []
        all_slowdown = []
        all_entropy = []

        ex_counter = 0
        for ex in xrange(pa.num_ex):

            ex_idx = ex_indices[ex]
            p = Process(target=get_traj_worker,
                        args=(
                            pg_learners[ex_counter],
                            envs[ex_idx],
                            pa,
                            manager_result,
                        ))
            ps.append(p)

            ex_counter += 1
            # append pa.num_ex number of Processes in ps until going inside if
            if ex_counter >= pa.batch_size or ex == pa.num_ex - 1:

                print ex + 1, "out of", pa.num_ex

                ex_counter = 0

                for p in ps:
                    p.start()

                for p in ps:
                    p.join()

                result = []  # convert list from shared memory
                for r in manager_result:
                    result.append(r)

                ps = []
                manager_result = manager.list([])

                all_ob = concatenate_all_ob_across_examples(
                    [r["all_ob"] for r in result], pa)
                all_action = np.concatenate([r["all_action"] for r in result])
                all_adv = np.concatenate([r["all_adv"] for r in result])

                # Do the policy gradient update step using the extra (last) learner;
                # put the new parameters in this last 'worker', then propagate the update at the end
                grads = pg_learners[pa.batch_size].get_grad(
                    all_ob, all_action, all_adv)  #(states, actions, values)

                grads_all.append(grads)

                all_eprews.extend([r["all_eprews"] for r in result])

                eprews.extend(np.concatenate([r["all_eprews"] for r in result
                                              ]))  # episode total rewards
                eplens.extend(np.concatenate([r["all_eplens"] for r in result
                                              ]))  # episode lengths

                all_slowdown.extend(
                    np.concatenate([r["all_slowdown"] for r in result]))
                all_entropy.extend(
                    np.concatenate([r["all_entropy"] for r in result]))

        # assemble gradients
        grads = grads_all[0]
        for i in xrange(1, len(grads_all)):
            for j in xrange(len(grads)):
                grads[j] += grads_all[i][j]

        # propagate network parameters to others
        params = pg_learners[pa.batch_size].get_params()

        rmsprop_updates_outside(grads, params, accums, pa.lr_rate, pa.rms_rho,
                                pa.rms_eps)

        for i in xrange(pa.batch_size + 1):
            pg_learners[i].set_net_params(params)

        timer_end = time.time()

        print "-----------------"
        print "Iteration: \t %i" % iteration
        print "NumTrajs: \t %i" % len(eprews)
        print "NumTimesteps: \t %i" % np.sum(eplens)
        # print "Loss:     \t %s" % np.mean(loss_all)
        print "MaxRew: \t %s" % np.average([np.max(rew) for rew in all_eprews])
        print "MeanRew: \t %s +- %s" % (np.mean(eprews), np.std(eprews))
        print "MeanSlowdown: \t %s" % np.mean(all_slowdown)
        print "MeanLen: \t %s +- %s" % (np.mean(eplens), np.std(eplens))
        print "MeanEntropy \t %s" % (np.mean(all_entropy))
        print "Elapsed time\t %s" % (timer_end - timer_start), "seconds"
        print "-----------------"

        f = open('log/re_log_' + datetime.now().strftime('%Y-%m-%d_%H:%M:%S'),
                 'w+')
        f.write("-----------------\n")
        f.write("Iteration: \t %i\n" % (iteration))
        f.write("NumTrajs: \t %i\n" % (len(eprews)))
        f.write("NumTimesteps: \t %i\n" % (np.sum(eplens)))
        # f.write("Loss:     \t %s\n".format(loss))
        f.write("MaxRew: \t %s\n" %
                (np.average([np.max(rew) for rew in all_eprews])))
        f.write("MeanRew: \t %s +- %s\n" % (np.mean(eprews), np.std(eprews)))
        f.write("MeanSlowdown: \t %s\n" % (np.mean(all_slowdown)))
        f.write("MeanLen: \t %s +- %s\n" % (np.mean(eplens), np.std(eplens)))
        f.write("MeanEntropy \t %s\n" % ((np.mean(all_entropy))))
        f.write("Elapsed time\t %s seconds\n" % ((timer_end - timer_start)))
        f.write("-----------------\n")
        f.close()

        timer_start = time.time()

        max_rew_lr_curve.append(np.average([np.max(rew)
                                            for rew in all_eprews]))
        mean_rew_lr_curve.append(np.mean(eprews))
        slow_down_lr_curve.append(np.mean(all_slowdown))

        if iteration % pa.output_freq == 0:
            param_file = open(
                pa.output_filename + '_' + str(iteration) + '.pkl', 'wb')
            cPickle.dump(pg_learners[pa.batch_size].get_params(), param_file,
                         -1)
            param_file.close()

            # added by wjchen, to record accuracy and rewards
            sample_file = h5py.File('log/re_record'+str(len(slow_down_lr_curve))\
                                    + datetime.now().strftime('%Y-%m-%d_%H:%M')+'.h5', 'w')
            sample_file.create_dataset('max_rew_lr_curve',
                                       data=max_rew_lr_curve)
            sample_file.create_dataset('mean_rew_lr_curve',
                                       data=mean_rew_lr_curve)
            sample_file.create_dataset('slow_down_lr_curve',
                                       data=slow_down_lr_curve)

            ref_dr = sample_file.create_group('ref_discount_rews')
            for k, v in ref_discount_rews.items():
                ref_dr[k] = np.average(v)

            ref_sd = sample_file.create_group('ref_slow_down')
            for k, v in ref_slow_down.items():
                ref_sd[k] = np.average(np.concatenate(v))
            sample_file.close()
            # print ref_slow_down
            # print ref_discount_rews
            #
            print '\n----Reference Slowdown----'
            for k, v in ref_slow_down.items():
                print "{}: {}".format(k, np.average(np.concatenate(v)))

            print '\n----Reference Discount Reward----'
            for k, v in ref_discount_rews.items():
                print "{}: {}".format(k, np.average(v))

            pa.unseen = True
            slow_down_cdf.launch(pa,
                                 pa.output_filename + '_' + str(iteration) +
                                 '.pkl',
                                 render=True,
                                 plot=True,
                                 repre=repre,
                                 end=end)
            pa.unseen = False
            # test on unseen examples

            plot_lr_curve(pa.output_filename, max_rew_lr_curve,
                          mean_rew_lr_curve, slow_down_lr_curve,
                          ref_discount_rews, ref_slow_down
                          )  # draw average of ref_discount_rews, ref_slow_down
Example #12
def get_traj_halluc(test_type,
                    pa,
                    env,
                    episode_max_length,
                    pg_resume=None,
                    render=False):
    """
    Run agent-environment loop for one whole episode (trajectory)
    Return dictionary of results
    """

    if test_type == 'PG':  # load trained parameters

        pg_learner = pg_network.PGLearner(pa)

        net_handle = open(pg_resume, 'rb')
        net_params = cPickle.load(net_handle)
        pg_learner.set_net_params(net_params)

    env.reset()
    rews = []

    rnn_tmp = env.rnn

    for te in xrange(episode_max_length):

        env.rnn = None
        ori_env = copy.deepcopy(env)
        actions = []
        future = min(episode_max_length - te, pa.simu_len)
        rews_hals = np.zeros((pa.num_hal, future), dtype=float)

        if pa.rnn:
            rnn_tmp.forecast_from_history()

        for h in range(pa.num_hal):
            new_env = copy.deepcopy(ori_env)
            new_env.rnn = rnn_tmp

            if pa.rnn:
                new_env.replace_backlog_from_rnn()

            ob = new_env.observe()

            for th in range(future):

                if test_type == 'PG':
                    a = pg_learner.choose_action(ob)

                elif test_type == 'Tetris':
                    a = other_agents.get_packer_action(new_env.machine,
                                                       new_env.job_slot)

                elif test_type == 'SJF':
                    a = other_agents.get_sjf_action(new_env.machine,
                                                    new_env.job_slot)

                elif test_type == 'Random':
                    a = other_agents.get_random_action(new_env.job_slot)

                if th == 0:
                    actions.append(a)

                ob, rew, done, info = new_env.step(
                    a, repeat=True, forecasting=(pa.rnn == True))

                if done: break

                rews_hals[h][th] = rew

        sum_rews = rews_hals.sum(axis=1, dtype=float)

        a_best = actions[np.argmax(sum_rews)]
        working_env = copy.deepcopy(ori_env)
        working_env.rnn = rnn_tmp

        if pa.rnn:
            ob, rew, done, info, new_job_list = working_env.step(
                a_best, repeat=True, return_raw_jobs=True)

            for new_job in new_job_list:
                working_env.rnn.update_history(new_job)

        else:
            ob, rew, done, info = working_env.step(a_best, repeat=True)

        rews.append(rew)

        if done: break
        if render: working_env.render()
        # env.render()

    env.rnn = rnn_tmp
    return np.array(rews), info