Example #1
def launch(pa, pg_resume=None, render=False, repre='image', end='no_new_job'):

    # ----------------------------
    print("Preparing for workers...")
    # ----------------------------

    pg_learners = []
    envs = []

    nw_len_seqs, nw_size_seqs = job_distribution.generate_sequence_work(
        pa, seed=42)

    ### create sequence of environments for each of the num_ex job sets/sequences
    for ex in xrange(pa.num_ex):

        print "-prepare for env-", ex

        env = environment.Env(pa,
                              nw_len_seqs=nw_len_seqs,
                              nw_size_seqs=nw_size_seqs,
                              render=False,
                              repre=repre,
                              end=end)
        env.seq_no = ex
        envs.append(env)

    ### generate sequence of NNs for each batch, each of which is a policy gradient agent
    for ex in xrange(pa.batch_size +
                     1):  # last worker for updating the parameters

        print "-prepare for worker-", ex

        pg_learner = pg_network.PGLearner(pa)

        if pg_resume is not None:
            net_handle = open(pg_resume, 'rb')
            net_params = cPickle.load(net_handle)
            pg_learner.set_net_params(net_params)

        pg_learners.append(pg_learner)

    accums = init_accums(pg_learners[pa.batch_size])

    # --------------------------------------
    print("Preparing for reference data...")
    # --------------------------------------

    ref_discount_rews, ref_slow_down = slow_down_cdf.launch(pa,
                                                            pg_resume=None,
                                                            render=False,
                                                            plot=False,
                                                            repre=repre,
                                                            end=end)
    mean_rew_lr_curve = []
    max_rew_lr_curve = []
    slow_down_lr_curve = []

    # --------------------------------------
    print("Start training...")
    # --------------------------------------

    timer_start = time.time()

    for iteration in xrange(1, pa.num_epochs):

        ### use a separate process for each example; a Manager list shares results across processes
        ps = []  # worker processes
        manager = Manager()  # manages the shared result list
        manager_result = manager.list([])

        ex_indices = range(pa.num_ex)
        np.random.shuffle(ex_indices)

        all_eprews = []
        grads_all = []
        loss_all = []
        eprews = []
        eplens = []
        all_slowdown = []
        all_entropy = []

        ex_counter = 0

        ### for each jobset
        for ex in xrange(pa.num_ex):

            ex_idx = ex_indices[ex]
            ### collect several trajectories for this job set with one of the PG agents
            p = Process(target=get_traj_worker,
                        args=(
                            pg_learners[ex_counter],
                            envs[ex_idx],
                            pa,
                            manager_result,
                        ))
            ps.append(p)

            ex_counter += 1

            ##

            if ex_counter >= pa.batch_size or ex == pa.num_ex - 1:

                print ex, "out of", pa.num_ex

                ex_counter = 0

                for p in ps:
                    p.start()

                for p in ps:
                    p.join()

                result = []  # convert list from shared memory
                for r in manager_result:
                    result.append(r)

                ps = []
                manager_result = manager.list([])

                all_ob = concatenate_all_ob_across_examples(
                    [r["all_ob"] for r in result], pa)
                all_action = np.concatenate([r["all_action"] for r in result])
                all_adv = np.concatenate([r["all_adv"] for r in result])

                # Compute the policy gradient on the aggregated batch using the spare (last) learner;
                # its updated parameters are propagated to all workers at the end of the iteration
                grads = pg_learners[pa.batch_size].get_grad(
                    all_ob, all_action, all_adv)

                grads_all.append(grads)

                all_eprews.extend([r["all_eprews"] for r in result])

                eprews.extend(np.concatenate([r["all_eprews"] for r in result
                                              ]))  # episode total rewards
                eplens.extend(np.concatenate([r["all_eplens"] for r in result
                                              ]))  # episode lengths

                all_slowdown.extend(
                    np.concatenate([r["all_slowdown"] for r in result]))
                all_entropy.extend(
                    np.concatenate([r["all_entropy"] for r in result]))

        # assemble gradients
        grads = grads_all[0]
        for i in xrange(1, len(grads_all)):
            for j in xrange(len(grads)):
                grads[j] += grads_all[i][j]

        # propagate network parameters to others
        params = pg_learners[pa.batch_size].get_params()

        rmsprop_updates_outside(grads, params, accums, pa.lr_rate, pa.rms_rho,
                                pa.rms_eps)

        for i in xrange(pa.batch_size + 1):
            pg_learners[i].set_net_params(params)

        timer_end = time.time()

        print "-----------------"
        print "Iteration: \t %i" % iteration
        print "NumTrajs: \t %i" % len(eprews)
        print "NumTimesteps: \t %i" % np.sum(eplens)
        # print "Loss:     \t %s" % np.mean(loss_all)
        print "MaxRew: \t %s" % np.average([np.max(rew) for rew in all_eprews])
        print "MeanRew: \t %s +- %s" % (np.mean(eprews), np.std(eprews))
        print "MeanSlowdown: \t %s" % np.mean(all_slowdown)
        print "MeanLen: \t %s +- %s" % (np.mean(eplens), np.std(eplens))
        print "MeanEntropy \t %s" % (np.mean(all_entropy))
        print "Elapsed time\t %s" % (timer_end - timer_start), "seconds"
        print "-----------------"

        timer_start = time.time()

        max_rew_lr_curve.append(np.average([np.max(rew)
                                            for rew in all_eprews]))
        mean_rew_lr_curve.append(np.mean(eprews))
        slow_down_lr_curve.append(np.mean(all_slowdown))

        if iteration % pa.output_freq == 0:
            param_file = open(
                pa.output_filename + '_' + str(iteration) + '.pkl', 'wb')
            cPickle.dump(pg_learners[pa.batch_size].get_params(), param_file,
                         -1)
            param_file.close()

            pa.unseen = True
            slow_down_cdf.launch(pa,
                                 pa.output_filename + '_' + str(iteration) +
                                 '.pkl',
                                 render=False,
                                 plot=True,
                                 repre=repre,
                                 end=end)
            pa.unseen = False
            # test on unseen examples

            plot_lr_curve(pa.output_filename, max_rew_lr_curve,
                          mean_rew_lr_curve, slow_down_lr_curve,
                          ref_discount_rews, ref_slow_down)
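
get_traj_worker is not shown in these examples. Below is a minimal sketch, assuming get_traj is a rollout helper that returns per-trajectory arrays keyed "ob", "action", "reward", "slowdown" and "entropy", that discount() computes discounted cumulative returns (sketched after Example #2), and that pa.num_seq_per_batch gives the number of rollouts per job set; the key step is the time-dependent baseline used to turn returns into advantages.

import numpy as np

def get_traj_worker(pg_learner, env, pa, result):
    # roll out several trajectories on this job set with the given PG agent
    trajs = [get_traj(pg_learner, env, pa.episode_max_length)  # assumed rollout helper
             for _ in range(pa.num_seq_per_batch)]

    # discounted return at every timestep of every trajectory
    rets = [discount(traj["reward"], pa.discount) for traj in trajs]

    # time-dependent baseline: mean return across the batch at each timestep
    maxlen = max(len(ret) for ret in rets)
    padded = [np.concatenate([ret, np.zeros(maxlen - len(ret))]) for ret in rets]
    baseline = np.mean(padded, axis=0)

    # advantage = return - baseline, truncated to each trajectory's own length
    advs = [ret - baseline[:len(ret)] for ret in rets]

    result.append({
        "all_ob": np.concatenate([traj["ob"] for traj in trajs]),
        "all_action": np.concatenate([traj["action"] for traj in trajs]),
        "all_adv": np.concatenate(advs),
        "all_eprews": np.array([ret[0] for ret in rets]),  # total discounted reward per episode
        "all_eplens": np.array([len(traj["reward"]) for traj in trajs]),  # episode lengths
        "all_slowdown": np.concatenate([traj["slowdown"] for traj in trajs]),
        "all_entropy": np.concatenate([traj["entropy"] for traj in trajs]),
    })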
Example #2
def launch(pa,
           pg_resume=None,
           render=False,
           plot=False,
           repre='image',
           end='no_new_job',
           q_resume=None):

    # ---- Parameters ----

    test_types = ['LLQ', 'Random']

    if (pg_resume is not None) and (q_resume is None):
        test_types = ['PG'] + test_types

    if q_resume is not None:
        test_types = ['Q'] + test_types

    nw_len_seqs = job_distribution.generate_sequence_work(pa, seed=42)
    env = environment.Env(pa,
                          nw_len_seqs=nw_len_seqs,
                          render=render,
                          repre=repre,
                          end=end)
    # env = environment.Env(pa, render=render, repre=repre, end=end)

    all_discount_rews = {}
    jobs_slow_down = {}
    work_complete = {}
    work_remain = {}
    job_len_remain = {}
    num_job_remain = {}
    job_remain_delay = {}

    for test_type in test_types:
        all_discount_rews[test_type] = []
        jobs_slow_down[test_type] = []
        work_complete[test_type] = []
        work_remain[test_type] = []
        job_len_remain[test_type] = []
        num_job_remain[test_type] = []
        job_remain_delay[test_type] = []

    for seq_idx in xrange(pa.num_ex):
        print('\n\n')
        print("=============== " + str(seq_idx) + " ===============")

        for test_type in test_types:

            rews, info = get_traj(test_type,
                                  pa,
                                  env,
                                  pa.episode_max_length,
                                  pg_resume=pg_resume,
                                  q_agent=q_resume)

            print "---------- " + test_type + " -----------"

            print "total discount reward : \t %s" % (discount(
                rews, pa.discount)[0])

            all_discount_rews[test_type].append(discount(rews, pa.discount)[0])

            # ------------------------
            # ---- per job stat ----
            # ------------------------

            enter_time = np.array(
                [info.record[i].enter_time for i in xrange(len(info.record))])
            finish_time = np.array(
                [info.record[i].finish_time for i in xrange(len(info.record))])
            job_len = np.array(
                [info.record[i].len for i in xrange(len(info.record))])
            # job_total_size = np.array([np.sum(info.record[i].res_vec) for i in xrange(len(info.record))])

            finished_idx = (finish_time >= 0)
            unfinished_idx = (finish_time < 0)

            jobs_slow_down[test_type].append(
                (finish_time[finished_idx] - enter_time[finished_idx]) /
                job_len[finished_idx])
            work_complete[test_type].append(
                np.sum(
                    job_len[finished_idx])  # * job_total_size[finished_idx])
            )
            work_remain[test_type].append(
                np.sum(job_len[unfinished_idx]
                       )  # * job_total_size[unfinished_idx])
            )
            job_len_remain[test_type].append(np.sum(job_len[unfinished_idx]))
            num_job_remain[test_type].append(len(job_len[unfinished_idx]))
            job_remain_delay[test_type].append(
                np.sum(pa.episode_max_length - enter_time[unfinished_idx]))

        env.seq_no = (env.seq_no + 1) % env.pa.num_ex

    # -- matplotlib colormap no overlap --
    if plot:
        num_colors = len(test_types)
        cm = plt.get_cmap('gist_rainbow')
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.set_color_cycle(
            [cm(1. * i / num_colors) for i in range(num_colors)])

        for test_type in test_types:
            slow_down_cdf = np.sort(np.concatenate(jobs_slow_down[test_type]))
            slow_down_yvals = np.arange(len(slow_down_cdf)) / float(
                len(slow_down_cdf))
            ax.plot(slow_down_cdf,
                    slow_down_yvals,
                    linewidth=2,
                    label=test_type)

        plt.legend(loc=4)
        plt.xlabel("job slowdown", fontsize=20)
        plt.ylabel("CDF", fontsize=20)
        # plt.show()
        plt.savefig(pg_resume + "_slowdown_fig" + ".pdf")

    return all_discount_rews, jobs_slow_down
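
The discount() helper used above is not shown; a minimal sketch consistent with how it is called (element 0 is the total discounted reward of the episode):

import numpy as np

def discount(x, gamma):
    # out[i] = x[i] + gamma * x[i+1] + gamma**2 * x[i+2] + ...
    out = np.zeros(len(x))
    out[-1] = x[-1]
    for i in reversed(range(len(x) - 1)):
        out[i] = x[i] + gamma * out[i + 1]
    return out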
Example #3
def launch(pa, pg_resume=None, render=False, repre='image', end='no_new_job'):

    env = environment.Env(pa, render=False, repre=repre, end=end)

    pg_learner = pg_network.PGLearner(pa)

    if pg_resume is not None:
        net_handle = open(pg_resume, 'rb')
        net_params = cPickle.load(net_handle)
        pg_learner.set_net_params(net_params)

    if pa.evaluate_policy_name == "SJF":
        evaluate_policy = other_agents.get_sjf_action
    elif pa.evaluate_policy_name == "PACKER":
        evaluate_policy = other_agents.get_packer_action
    else:
        print("Panic: no policy known to evaluate.")
        exit(1)

    # ----------------------------
    print("Preparing for data...")
    # ----------------------------

    nw_len_seqs, nw_size_seqs = job_distribution.generate_sequence_work(
        pa, seed=42)

    # print 'nw_time_seqs=', nw_len_seqs
    # print 'nw_size_seqs=', nw_size_seqs

    mem_alloc = 4

    X = np.zeros([
        pa.simu_len * pa.num_ex * mem_alloc, 1, pa.network_input_height,
        pa.network_input_width
    ],
                 dtype=theano.config.floatX)
    y = np.zeros(pa.simu_len * pa.num_ex * mem_alloc, dtype='int32')

    print 'network_input_height=', pa.network_input_height
    print 'network_input_width=', pa.network_input_width

    counter = 0

    for train_ex in range(pa.num_ex):

        env.reset()

        for _ in xrange(pa.episode_max_length):

            # ---- get current state ----
            ob = env.observe()

            a = evaluate_policy(env.machine, env.job_slot)

            if counter < pa.simu_len * pa.num_ex * mem_alloc:

                add_sample(X, y, counter, ob, a)
                counter += 1

            ob, rew, done, info = env.step(a, repeat=True)

            if done:  # hit void action, exit
                break

        # roll to next example
        env.seq_no = (env.seq_no + 1) % env.pa.num_ex

    num_train = int(0.8 * counter)
    num_test = int(0.2 * counter)

    X_train, X_test = X[:num_train], X[num_train:num_train + num_test]
    y_train, y_test = y[:num_train], y[num_train:num_train + num_test]

    # Normalization, make sure nothing becomes NaN

    # X_mean = np.average(X[:num_train + num_test], axis=0)
    # X_std = np.std(X[:num_train + num_test], axis=0)
    #
    # X_train = (X_train - X_mean) / X_std
    # X_test = (X_test - X_mean) / X_std

    # ----------------------------
    print("Start training...")
    # ----------------------------

    for epoch in xrange(pa.num_epochs):

        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_acc = 0
        train_batches = 0
        start_time = time.time()
        for batch in iterate_minibatches(X_train,
                                         y_train,
                                         pa.batch_size,
                                         shuffle=True):
            inputs, targets = batch
            err, prob_act = pg_learner.su_train(inputs, targets)
            pg_act = np.argmax(prob_act, axis=1)
            train_err += err
            train_acc += np.sum(pg_act == targets)
            train_batches += 1

        # # And a full pass over the test data:
        test_err = 0
        test_acc = 0
        test_batches = 0
        for batch in iterate_minibatches(X_test,
                                         y_test,
                                         pa.batch_size,
                                         shuffle=False):
            inputs, targets = batch
            err, prob_act = pg_learner.su_test(inputs, targets)
            pg_act = np.argmax(prob_act, axis=1)
            test_err += err
            test_acc += np.sum(pg_act == targets)
            test_batches += 1

        # Then we print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(epoch + 1, pa.num_epochs,
                                                   time.time() - start_time))
        print("  training loss:    \t\t{:.6f}".format(train_err /
                                                      train_batches))
        print("  training accuracy:\t\t{:.2f} %".format(
            train_acc / float(num_train) * 100))
        print("  test loss:        \t\t{:.6f}".format(test_err / test_batches))
        print("  test accuracy:    \t\t{:.2f} %".format(test_acc /
                                                        float(num_test) * 100))

        sys.stdout.flush()

        if epoch % pa.output_freq == 0:

            net_file = open(
                pa.output_filename + '_net_file_' + str(epoch) + '.pkl', 'wb')
            cPickle.dump(pg_learner.return_net_params(), net_file, -1)
            net_file.close()

    print("done")
Example #4
def launch(pa, pg_resume=None, render=False, repre='image', end='no_new_job'):

    # ----------------------------
    print("Preparing for workers...")
    # ----------------------------

    pg_learners = []
    envs = []

    nw_len_seqs, nw_size_seqs = job_distribution.generate_sequence_work(
        pa, seed=42)

    for ex in range(pa.num_ex):

        print("-prepare for env-", ex)

        env = environment.Env(pa,
                              nw_len_seqs=nw_len_seqs,
                              nw_size_seqs=nw_size_seqs,
                              render=False,
                              repre=repre,
                              end=end)
        env.seq_no = ex
        envs.append(env)

    print("-prepare for worker-")

    rl = Policy_Network.PolicyGradient(n_actions=pa.network_output_dim,
                                       n_features=pa.network_input_height *
                                       pa.network_input_width,
                                       learning_rate=0.02)
    print("policy network params count: ", rl.get_num_params())

    # pg_learner = pg_network.PGLearner(pa)

    # if pg_resume is not None:
    # net_handle = open(pg_resume, 'rb')
    # net_params = cPickle.load(net_handle)
    # pg_learner.set_net_params(net_params)

    # pg_learners.append(pg_learner)

    if pg_resume is not None:
        rl.load_data(pg_resume)

    # accums = init_accums(pg_learners[pa.batch_size])

    # --------------------------------------
    print("Preparing for reference data...")
    # --------------------------------------

    ref_discount_rews, ref_slow_down = slow_down_cdf.launch(pa,
                                                            pg_resume=None,
                                                            render=False,
                                                            plot=False,
                                                            repre=repre,
                                                            end=end)
    mean_rew_lr_curve = []
    max_rew_lr_curve = []
    slow_down_lr_curve = []

    # --------------------------------------
    print("Start training...")
    # --------------------------------------

    timer_start = time.time()

    for iteration in range(1, pa.num_epochs):

        ex_indices = list(range(pa.num_ex))
        np.random.shuffle(ex_indices)

        all_eprews = []
        eprews = []
        eplens = []
        all_slowdown = []

        eprewlist = []
        eplenlist = []
        slowdownlist = []
        losslist = []

        ex_counter = 0
        for ex in range(pa.num_ex):

            ex_idx = ex_indices[ex]

            eprew, eplen, slowdown, all_ob, all_action, all_adv = get_traj_worker(
                rl, envs[ex_idx], pa)
            eprewlist.append(eprew)
            eplenlist.append(eplen)
            slowdownlist.append(slowdown)
            loss = rl.learn(all_ob, all_action, all_adv)
            losslist.append(loss)

            ex_counter += 1

            if ex_counter >= pa.batch_size or ex == pa.num_ex - 1:

                print("\n\n")

                ex_counter = 0

                # all_eprews.extend([r["all_eprews"] for r in result])

                # eprews.extend(np.concatenate([r["all_eprews"] for r in result]))  # episode total rewards
                # eplens.extend(np.concatenate([r["all_eplens"] for r in result]))  # episode lengths

                # all_slowdown.extend(np.concatenate([r["all_slowdown"] for r in result]))

                # assemble gradients
                # grads = grads_all[0]
                # for i in range(1, len(grads_all)):
                # for j in range(len(grads)):
                # grads[j] += grads_all[i][j]

                # propagate network parameters to others
                # params = pg_learners[pa.batch_size].get_params()

                # rmsprop_updates_outside(grads, params, accums, pa.lr_rate, pa.rms_rho, pa.rms_eps)

                # for i in range(pa.batch_size + 1):
                # pg_learners[i].set_net_params(params)

        timer_end = time.time()

        print("-----------------")
        print("Iteration: \t %i" % iteration)
        print("NumTrajs: \t %i" % len(eprewlist))
        print("NumTimesteps: \t %i" % np.sum(eplenlist))
        print("Loss:     \t %s" % np.mean(losslist))
        print("MaxRew: \t %s" % np.average([np.max(rew) for rew in eprewlist]))
        print("MeanRew: \t %s +- %s" % (np.mean(eprewlist), np.std(eprewlist)))
        print("MeanSlowdown: \t %s" %
              np.mean([np.mean(sd) for sd in slowdownlist]))
        print("MeanLen: \t %s +- %s" % (np.mean(eplenlist), np.std(eplenlist)))
        print("Elapsed time\t %s" % (timer_end - timer_start), "seconds")
        print("-----------------")

        timer_start = time.time()

        max_rew_lr_curve.append(np.average([np.max(rew) for rew in eprewlist]))
        mean_rew_lr_curve.append(np.mean(eprewlist))
        slow_down_lr_curve.append(np.mean([np.mean(sd)
                                           for sd in slowdownlist]))

        if iteration % pa.output_freq == 0:

            rl.save_data(pa.output_filename + '_' + str(iteration))

            pa.unseen = True
            # slow_down_cdf.launch(pa, pa.output_filename + '_' + str(iteration) + '.ckpt',
            # render=False, plot=True, repre=repre, end=end)
            pa.unseen = False
            # test on unseen examples

            plot_lr_curve(pa.output_filename, max_rew_lr_curve,
                          mean_rew_lr_curve, slow_down_lr_curve,
                          ref_discount_rews, ref_slow_down)
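
Policy_Network.PolicyGradient is not shown above. rl.learn(all_ob, all_action, all_adv) is assumed to perform a REINFORCE update, i.e. minimize the advantage-weighted negative log-likelihood of the actions that were taken. A minimal TensorFlow 1.x sketch of that loss (layer sizes and names are illustrative; observations are assumed flattened to n_features):

import tensorflow as tf

def build_reinforce_update(n_features, n_actions, lr=0.02):
    obs_ph = tf.placeholder(tf.float32, [None, n_features], name="obs")
    act_ph = tf.placeholder(tf.int32, [None], name="act")
    adv_ph = tf.placeholder(tf.float32, [None], name="adv")

    hidden = tf.layers.dense(obs_ph, 32, activation=tf.nn.relu)
    logits = tf.layers.dense(hidden, n_actions)

    # -log pi(a_t | s_t) for the taken actions, weighted by their advantages
    neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=act_ph)
    loss = tf.reduce_mean(neg_log_prob * adv_ph)
    train_op = tf.train.AdamOptimizer(lr).minimize(loss)
    return obs_ph, act_ph, adv_ph, loss, train_op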
Example #5
def launch(pa, pg_resume=None, render=False, repre='image', end='no_new_job'):

    # ----------------------------
    print("Preparing for workers...")
    # ----------------------------

    # dimension of state space
    # NOTE: we have to flatten the images before sending them into the network...
    state_dim   = pa.network_input_height * pa.network_input_width

    # number of actions
    num_actions = pa.network_output_dim

    # initialize the q networks
    sess = tf.Session()
    optimizer = tf.train.RMSPropOptimizer(learning_rate=0.0001, decay=0.9)
    q_learner = DeepQLearner(session=sess, optimizer=optimizer, q_network=build_q_learner, 
                                    state_dim=state_dim, num_actions=num_actions, discount_factor=pa.discount)

    envs = []

    nw_len_seqs = job_distribution.generate_sequence_work(pa, seed=42)

    ### create sequence of environments for each of the num_ex job sets/sequences
    for ex in xrange(pa.num_ex):

        print "-prepare for env-", ex

        env = environment.Env(pa, nw_len_seqs=nw_len_seqs, render=False,
                              repre=repre, end=end)
        env.seq_no = ex
        envs.append(env)

    # ### generate sequence of NNs for each batch, each of which is a a policy gradient agent
    # for ex in xrange(pa.batch_size + 1):  # last worker for updating the parameters

    #     print "-prepare for worker-", ex

    #     pg_learner = pg_network.PGLearner(pa)

    #     if pg_resume is not None:
    #         net_handle = open(pg_resume, 'rb')
    #         net_params = cPickle.load(net_handle)
    #         pg_learner.set_net_params(net_params)

    #     pg_learners.append(pg_learner)

    # accums = init_accums(pg_learners[pa.batch_size])

    # --------------------------------------
    print("Preparing for reference data...")
    # --------------------------------------

    ref_discount_rews, ref_slow_down = slow_down_cdf.launch(pa, pg_resume=None, render=False, plot=False, repre=repre, end=end)
    mean_rew_lr_curve = []
    max_rew_lr_curve = []
    slow_down_lr_curve = []

    # --------------------------------------
    print("Start training...")
    # --------------------------------------

    timer_start = time.time()

    for iteration in xrange(1, pa.num_epochs):

        ### the multiprocessing machinery (Process/Manager) from the PG version is disabled here;
        ### trajectories are collected in the main process instead
        # ps = []  # threads
        # manager = Manager()  # managing return results
        # manager_result = manager.list([])

        ex_indices = range(pa.num_ex)
        np.random.shuffle(ex_indices)

        all_eprews = []
        loss_all = []
        eprews = []
        eplens = []
        all_slowdown = []

        ex_counter = 0

        ### for each jobset
        for ex in xrange(pa.num_ex):

            ex_idx = ex_indices[ex]

            current_env = envs[ex_idx]

            man_result = []
            get_traj_worker(q_learner, current_env, pa, man_result)

            ### evaluate several instances of trajectories for set of PG agents
            # p = Process(target=get_traj_worker,
            #             args=(pg_learners[ex_counter], envs[ex_idx], pa, manager_result, ))
            # ps.append(p)

            ex_counter += 1

            all_eprews.extend([r["all_eprews"] for r in man_result])
            eprews.extend(np.concatenate([r["all_eprews"] for r in man_result]))  # episode total rewards
            eplens.extend(np.concatenate([r["all_eplens"] for r in man_result]))  # episode lengths

            all_slowdown.extend(np.concatenate([r["all_slowdown"] for r in man_result]))
            
            ##

            # if ex_counter >= pa.batch_size or ex == pa.num_ex - 1:

                # print ex, "out of", pa.num_ex

                # ex_counter = 0

                # for p in ps:
                #     p.start()

                # for p in ps:
                #     p.join()

                # result = []  # convert list from shared memory
                # for r in manager_result:
                #     result.append(r)

                # ps = []
                # manager_result = manager.list([])

                # all_ob = concatenate_all_ob_across_examples([r["all_ob"] for r in result], pa)
                # all_action = np.concatenate([r["all_action"] for r in result])
                # all_adv = np.concatenate([r["all_adv"] for r in result])

                # all_eprews.extend([r["all_eprews"] for r in result])

                # eprews.extend(np.concatenate([r["all_eprews"] for r in result]))  # episode total rewards
                # eplens.extend(np.concatenate([r["all_eplens"] for r in result]))  # episode lengths

                # all_slowdown.extend(np.concatenate([r["all_slowdown"] for r in result]))

        # # assemble gradients
        # grads = grads_all[0]
        # for i in xrange(1, len(grads_all)):
        #     for j in xrange(len(grads)):
        #         grads[j] += grads_all[i][j]

        # # propagate network parameters to others
        # params = pg_learners[pa.batch_size].get_params()

        # rmsprop_updates_outside(grads, params, accums, pa.lr_rate, pa.rms_rho, pa.rms_eps)

        # for i in xrange(pa.batch_size + 1):
        #     pg_learners[i].set_net_params(params)

        timer_end = time.time()

        print "-----------------"
        print "Iteration: \t %i" % iteration
        print "NumTrajs: \t %i" % len(eprews)
        print "NumTimesteps: \t %i" % np.sum(eplens)
        # print "Loss:     \t %s" % np.mean(loss_all)
        print "MaxRew: \t %s" % np.average([np.max(rew) for rew in all_eprews])
        print "MeanRew: \t %s +- %s" % (np.mean(eprews), np.std(eprews))
        print "MeanSlowdown: \t %s" % np.mean(all_slowdown)
        print "MeanLen: \t %s +- %s" % (np.mean(eplens), np.std(eplens))
        print "Elapsed time\t %s" % (timer_end - timer_start), "seconds"
        print "-----------------"

        timer_start = time.time()

        max_rew_lr_curve.append(np.average([np.max(rew) for rew in all_eprews]))
        mean_rew_lr_curve.append(np.mean(eprews))
        slow_down_lr_curve.append(np.mean(all_slowdown))

        if iteration % pa.output_freq == 0:
            # param_file = open(pa.output_filename + '_' + str(iteration) + '.pkl', 'wb')
            # cPickle.dump(pg_learners[pa.batch_size].get_params(), param_file, -1)
            # param_file.close()

            pa.unseen = True
            slow_down_cdf.launch(pa, pa.output_filename + '_' + str(iteration) + '.pkl',
                                 render=False, plot=True, repre=repre, end=end, q_resume=q_learner)
            pa.unseen = False
            # test on unseen examples

            plot_lr_curve(pa.output_filename,
                          max_rew_lr_curve, mean_rew_lr_curve, slow_down_lr_curve,
                          ref_discount_rews, ref_slow_down)
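
build_q_learner, passed to DeepQLearner as q_network above, is not shown. A minimal sketch, assuming DeepQLearner hands the constructor a flattened state placeholder and expects one Q-value per action back (the exact callback signature and layer size are assumptions):

import tensorflow as tf

def build_q_learner(states, num_actions):
    # states: [batch, state_dim] placeholder supplied by the learner (assumed)
    hidden = tf.layers.dense(states, 64, activation=tf.nn.relu)
    q_values = tf.layers.dense(hidden, num_actions)  # one Q-value per action
    return q_values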
Example #6
def test(it, pa, pg_resume, workloads, episode_max_length=200):
    repre = 'image'
    end = 'all_done'
    agent = Heuristic_Agents()
    pg_learner = pg_network.PGLearner(pa)
    if pg_resume is not None:
        net_handle = open(pg_resume, 'rb')
        net_params = pickle.load(net_handle)
        pg_learner.set_net_params(net_params)

    new_env = Env1(0, 1)
    job_distribution = Dist()
    nw_len_seqs, nw_size_seqs = job_distribution.generate_sequence_work(
        pa, seed=42)
    logline = str(it) + '\n'
    for ex in range(pa.num_test_ex):
        env = environment.Env(pa,
                              nw_len_seqs=nw_len_seqs,
                              nw_size_seqs=nw_size_seqs,
                              render=False,
                              repre=repre,
                              end=end)
        env.seq_no = ex + pa.num_ex
        new_env.reset()
        new_env.workload_seq = workloads[ex + pa.num_ex]

        new_env.generate_workload()

        print('Testing : ', new_env.workload_seq)
        env.reset()

        obs = []
        new_obs = []
        acts = []
        new_acts = []
        rews = []
        utils = ''
        suffer = []
        new_rews = []
        entropy = []
        finished_episode_len = 0
        crs = [0] * pa.num_machines
        crs_max = [0] * pa.num_machines
        info = []
        new_ob = new_env.observe()
        ob = env.observe()
        counter = 0
        for _ in range(200):

            act_prob = pg_learner.get_one_act_prob(ob)
            a = np.argmax(act_prob)

            act = agent.get_action(new_env, a)

            new_obs.append(new_ob)
            new_acts.append(act)
            new_ob, new_rews, done1, info1 = new_env.step(act, _, new_rews, a)
            #a = (csprob_n > np.random.rand()).argmax()
            #np.set_printoptions(linewidth=40*5, precision = 2, threshold=np.nan)
            #    print('State>>',ob)
            #    print('Action>>',a)
            obs.append(ob)  # store the ob at current decision making step
            acts.append(a)

            ob, rew, done, info = env.step(a, repeat=True)
            counter += 1
            if info == 'Allocation_Success':
                finished_episode_len = _ + 1
        #    print('Reward>>',rew)
            rews.append(rew)
            entropy.append(get_entropy(act_prob))

            if done1: break

            util = ''
            for k, machine in enumerate(new_env.machines):
                if len(machine.running_tasks) > 0:
                    if machine.cpus_left >= 0:
                        util += str(machine.total_cpus -
                                    machine.cpus_left) + ','
                    else:
                        util += str(machine.total_cpus) + ','
                        suffer.append(abs(machine.cpus_left))
                else:
                    util += str(0) + ','

                crs_this_time = [0] * pa.num_machines
                for i in range(len(machine.running_tasks)):
                    for j in range(i + 1, len(machine.running_tasks)):
                        task_i, task_j = machine.running_tasks[
                            i], machine.running_tasks[j]
                        if task_i != task_j:
                            crs[k] += pa.interference_penalty * (
                                task_i.cpu_util[-1] *
                                task_j.cpu_util[-1]) * (-1)
                            crs_this_time[k] += pa.interference_penalty * (
                                task_i.cpu_util[-1] *
                                task_j.cpu_util[-1]) * (-1)
                crs_max[k] = max(crs_max[k], crs_this_time[k])
                #################
            utils += util + '|'

        logline += str(
            str(counter - 1) +
            '|' + str(utils) + str(finished_episode_len)) + '\n' + str(
                sum(new_rews)) + '\n' + str(sum(suffer)) + '\n'
        for i in range(len(new_env.machines)):
            logline += str(crs[i]) + ','
        logline = logline[:-1] + '\n'
        for i in range(len(new_env.machines)):
            logline += str(crs_max[i]) + ','
        logline = logline[:-1]
        logline += '\n'

        print('Iteration number ', it)
        print('Example No:,', ex)
        print('Test Actions : ', new_acts)

        print('Reward : ', new_rews)
        print('Total reward : ', sum(new_rews))
    return logline
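
get_entropy above is assumed to be the entropy of the policy's action distribution at one step; a minimal sketch:

import numpy as np

def get_entropy(act_prob):
    # entropy of a probability vector, guarding against log(0)
    p = np.asarray(act_prob).ravel()
    return -np.sum(p * np.log(p + 1e-12))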
Example #7
def launch(pa, pg_resume=None, render=False, repre='image', end='no_new_job'):
    task_dist = Task_Dist()
    workloads = task_dist.gen_seq_workload()

    # ----------------------------
    print("Preparing for workers...")
    # ----------------------------
    #logs = open('/home/shanka/logs_packing_deeprm', 'a')
    pg_learners = []
    envs = []
    job_distribution = Dist()
    nw_len_seqs, nw_size_seqs = job_distribution.generate_sequence_work(
        pa, seed=42)

    for ex in range(pa.num_ex):

        print("-prepare for env-", ex)

        env = environment.Env(pa,
                              nw_len_seqs=nw_len_seqs,
                              nw_size_seqs=nw_size_seqs,
                              render=False,
                              repre=repre,
                              end=end)
        env.seq_no = ex
        envs.append(env)

    for ex in range(pa.batch_size +
                    1):  # last worker for updating the parameters

        print("-prepare for worker-", ex)

        pg_learner = pg_network.PGLearner(pa)

        if pg_resume is not None:
            net_handle = open(pg_resume, 'rb')
            net_params = pickle.load(net_handle)
            pg_learner.set_net_params(net_params)

        pg_learners.append(pg_learner)

    accums = init_accums(pg_learners[pa.batch_size])

    # --------------------------------------
    # print("Preparing for reference data...")
    # --------------------------------------
    # print('Start testing...')

    # for ite in range(10,1000,10):
    # 	pg_resume = pa.output_filename +'_'+str(ite)+'.pkl'

    # 	logline=test(ite,pa, pg_resume,workloads,repre)

    # 	logs.write(logline)
    # 	logs.flush()

    # 	os.fsync(logs.fileno())
    # return

    # ref_discount_rews, ref_slow_down = slow_down_cdf.launch(pa, pg_resume=None, render=False, plot=False, repre=repre, end=end)
    # mean_rew_lr_curve = []
    # max_rew_lr_curve = []
    # slow_down_lr_curve = []

    # --------------------------------------
    print("Start training...")
    # --------------------------------------

    timer_start = time.time()

    for iteration in range(1, pa.num_epochs):

        ps = []  # threads
        manager = Manager()  # managing return results
        manager_result = manager.list([])

        ex_indices = range(pa.num_ex)
        #    np.random.shuffle(ex_indices)

        all_eprews = []
        grads_all = []
        loss_all = []
        eprews = []
        eplens = []
        all_slowdown = []
        all_entropy = []

        ex_counter = 0
        for ex in range(pa.num_ex):

            ex_idx = ex_indices[ex]
            p = Process(target=get_traj_worker,
                        args=(
                            pg_learners[ex_counter],
                            envs[ex_idx],
                            pa,
                            manager_result,
                        ))
            ps.append(p)

            ex_counter += 1

            if ex_counter >= pa.batch_size or ex == pa.num_ex - 1:

                print(ex, "out of", pa.num_ex)

                ex_counter = 0

                for p in ps:
                    p.start()

                for p in ps:
                    p.join()

                result = []  # convert list from shared memory
                for r in manager_result:
                    result.append(r)

                ps = []
                manager_result = manager.list([])

                all_ob = concatenate_all_ob_across_examples(
                    [r["all_ob"] for r in result], pa)
                all_action = np.concatenate([r["all_action"] for r in result])
                all_adv = np.concatenate([r["all_adv"] for r in result])

                # Compute the policy gradient on the aggregated batch using the spare (last) learner;
                # its updated parameters are propagated to all workers at the end of the iteration
                grads = pg_learners[pa.batch_size].get_grad(
                    all_ob, all_action, all_adv)

                grads_all.append(grads)

                all_eprews.extend([r["all_eprews"] for r in result])

                eprews.extend(np.concatenate([r["all_eprews"] for r in result
                                              ]))  # episode total rewards
                eplens.extend(np.concatenate([r["all_eplens"] for r in result
                                              ]))  # episode lengths

        #        all_slowdown.extend(np.concatenate([r["all_slowdown"] for r in result]))
        #        all_entropy.extend(np.concatenate([r["all_entropy"] for r in result]))

        # assemble gradients
        grads = grads_all[0]
        for i in range(1, len(grads_all)):
            for j in range(len(grads)):
                grads[j] += grads_all[i][j]

        # propagate network parameters to others
        params = pg_learners[pa.batch_size].get_params()

        rmsprop_updates_outside(grads, params, accums, pa.lr_rate, pa.rms_rho,
                                pa.rms_eps)

        for i in range(pa.batch_size + 1):
            pg_learners[i].set_net_params(params)

        timer_end = time.time()

        print("-----------------")
        print("Iteration: \t %i" % iteration)
        print("NumTrajs: \t %i" % len(eprews))
        print("NumTimesteps: \t %i" % np.sum(eplens))
        #  print "Loss:     \t %s" % np.mean(loss_all)
        print("MaxRew: \t %s" % np.average([np.max(rew)
                                            for rew in all_eprews]))
        print("MeanRew: \t %s +- %s" % (np.mean(eprews), np.std(eprews)))
        # print "MeanSlowdown: \t %s" % np.mean(all_slowdown)
        print("MeanLen: \t %s +- %s" % (np.mean(eplens), np.std(eplens)))
        #     print "MeanEntropy \t %s" % (np.mean(all_entropy))
        print("Elapsed time\t %s" % (timer_end - timer_start), "seconds")
        print("-----------------")

        #    max_rew_lr_curve.append(np.average([np.max(rew) for rew in all_eprews]))
        #    mean_rew_lr_curve.append(np.mean(eprews))
        #    slow_down_lr_curve.append(np.mean(all_slowdown))

        if iteration % pa.output_freq == 0:
            pg_resume = pa.output_filename + '_' + str(iteration) + '.pkl'
            param_file = open(pg_resume, 'wb')
            pickle.dump(pg_learner.get_params(), param_file, -1)
            param_file.close()
            test(pa, pg_resume, workloads, repre)
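
init_accums and rmsprop_updates_outside are shared by several of these examples but never shown. A minimal sketch, assuming get_params()/get_grad() return flat lists of numpy arrays and that the gradients point in the ascent direction (so parameters are moved along them):

import numpy as np

def init_accums(pg_learner):
    # one squared-gradient accumulator per parameter tensor
    return [np.zeros_like(param) for param in pg_learner.get_params()]

def rmsprop_updates_outside(grads, params, accums, stepsize, rho=0.9, epsilon=1e-9):
    # plain RMSProp applied outside the learner, updating params and accums in place
    assert len(grads) == len(params) == len(accums)
    for i in range(len(grads)):
        accums[i] = rho * accums[i] + (1.0 - rho) * grads[i] ** 2
        params[i] += stepsize * grads[i] / np.sqrt(accums[i] + epsilon)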
Example #8
    def __init__(self,
                 pa,
                 nw_len_seqs=None,
                 nw_size_seqs=None,
                 seed=None,
                 render=False,
                 repre='image',
                 end='no_new_job'):

        self.pa = pa
        self.render = render
        self.repre = repre  # image or compact representation
        self.end = end  # termination type, 'no_new_job' or 'all_done'

        # rnn stuff
        if self.pa.rnn:
            self.rnn = tf_dist_rnn_object.dist_rnn(pa)
            self.rnn.train()
        else:
            self.rnn = None

        self.rnn_offset = 0  # used to represent how many things have been generated by the rnn

        self.nw_dist = pa.dist.bi_model_dist

        self.curr_time = 0

        # set up random seed
        if self.pa.unseen:
            np.random.seed(None)
        else:
            np.random.seed(seed)

        if self.pa.rnn:
            ori_simu_len = pa.simu_len
            pa.simu_len = pa.simu_len + self.rnn.SEQ_LEN

        if nw_len_seqs is None or nw_size_seqs is None:
            # generate new work
            self.nw_len_seqs, self.nw_size_seqs = \
                job_distribution.generate_sequence_work(pa, seed=None)

            self.workload = np.zeros(pa.num_res)
            for i in xrange(pa.num_res):
                self.workload[i] = \
                    np.sum(np.reshape(self.nw_size_seqs[:, :, i], self.pa.simu_len * self.pa.num_ex) *
                           np.reshape(self.nw_len_seqs[:, :], self.pa.simu_len * self.pa.num_ex)) / \
                    float(pa.res_slot) / \
                    float(len(self.nw_len_seqs))
                print("Load on # " + str(i) + " resource dimension is " +
                      str(self.workload[i]))

            self.nw_len_seqs = np.reshape(self.nw_len_seqs,
                                          [self.pa.num_ex, self.pa.simu_len])
            self.nw_size_seqs = np.reshape(
                self.nw_size_seqs,
                [self.pa.num_ex, self.pa.simu_len, self.pa.num_res])

        else:
            self.nw_len_seqs = nw_len_seqs
            self.nw_size_seqs = nw_size_seqs

        if self.pa.rnn:
            print(self.nw_size_seqs.shape)
            print(self.nw_len_seqs)
            self.len_seeds_for_rnn = self.nw_len_seqs[:, :self.rnn.SEQ_LEN]
            self.res_seeds_for_rnn = self.nw_size_seqs[:, :self.rnn.SEQ_LEN, :]

            self.nw_len_seqs = self.nw_len_seqs[:, self.rnn.SEQ_LEN:]
            self.nw_size_seqs = self.nw_size_seqs[:, self.rnn.SEQ_LEN:, :]

            pa.simu_len = ori_simu_len

        self.seq_no = 0  # which example sequence
        self.seq_idx = 0  # index in that sequence

        # initialize system
        self.machine = Machine(pa)
        self.job_slot = JobSlot(pa)
        self.job_backlog = JobBacklog(pa)
        self.job_record = JobRecord()
        self.extra_info = ExtraInfo(pa)

        if self.pa.rnn:
            self.seed_rnn()
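
job_distribution.generate_sequence_work is used throughout these examples; a minimal sketch of the flat-sequence variant, assuming pa.new_job_rate is the per-slot arrival probability and pa.dist.bi_model_dist() samples one (length, resource-size vector) pair, as referenced in the constructor above. The RNN fork above then reshapes the result into [num_ex, simu_len] and [num_ex, simu_len, num_res].

import numpy as np

def generate_sequence_work(pa, seed=42):
    np.random.seed(seed)
    total_len = pa.simu_len * pa.num_ex
    nw_len_seq = np.zeros(total_len, dtype=int)
    nw_size_seq = np.zeros((total_len, pa.num_res), dtype=int)
    for i in range(total_len):
        if np.random.rand() < pa.new_job_rate:  # a job arrives in this slot
            nw_len_seq[i], nw_size_seq[i, :] = pa.dist.bi_model_dist()
    return nw_len_seq, nw_size_seq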
Example #9
def launch(pa, pg_resume=None, render=False, repre='image', end='no_new_job'):

    # ----------------------------
    print("Preparing for workers...")
    # ----------------------------

    pg_learners = []
    envs = []

    nw_len_seqs, nw_size_seqs = job_distribution.generate_sequence_work(
        pa, seed=42)

    for ex in xrange(pa.num_ex):  # number of sequences

        print "-prepare for env-", ex

        env = environment.Env(pa,
                              nw_len_seqs=nw_len_seqs,
                              nw_size_seqs=nw_size_seqs,
                              render=True,
                              repre=repre,
                              end=end)
        env.seq_no = ex
        envs.append(env)

    for ex in xrange(pa.batch_size +
                     1):  # last worker for updating the parameters

        print "-prepare for worker-", ex

        pg_learner = pg_network.PGLearner(pa)

        startIndex = 0
        if pg_resume is not None:
            net_handle = open(pg_resume, 'rb')
            net_params = cPickle.load(net_handle)
            pg_learner.set_net_params(net_params)
            # resume the iteration count from the checkpoint filename, e.g. '..._120.pkl'
            startIndex = int(re.search(r'(\d+)\.pkl$', pg_resume).group(1))

        pg_learners.append(pg_learner)

    accums = init_accums(pg_learners[pa.batch_size])

    # --------------------------------------
    print("Preparing for reference data...")
    # --------------------------------------

    # Reference examples, get reference discounted rewards and reference slowdown from random, SJF and Tetris algorithms
    ref_discount_rews, ref_slow_down = slow_down_cdf.launch(pa,
                                                            pg_resume=None,
                                                            render=True,
                                                            plot=False,
                                                            repre=repre,
                                                            end=end)
    mean_rew_lr_curve = []
    max_rew_lr_curve = []
    slow_down_lr_curve = []

    # --------------------------------------
    print("Start training...")
    # --------------------------------------

    timer_start = time.time()

    for iteration in xrange(startIndex, pa.num_epochs):

        ps = []  # threads
        manager = Manager()  # managing return results
        manager_result = manager.list([])

        ex_indices = range(pa.num_ex)
        np.random.shuffle(ex_indices)

        all_eprews = []
        grads_all = []
        loss_all = []
        eprews = []
        eplens = []
        all_slowdown = []
        all_entropy = []

        ex_counter = 0
        for ex in xrange(pa.num_ex):

            ex_idx = ex_indices[ex]
            p = Process(target=get_traj_worker,
                        args=(
                            pg_learners[ex_counter],
                            envs[ex_idx],
                            pa,
                            manager_result,
                        ))
            ps.append(p)

            ex_counter += 1
            # append pa.num_ex number of Processes in ps until going inside if
            if ex_counter >= pa.batch_size or ex == pa.num_ex - 1:

                print ex + 1, "out of", pa.num_ex

                ex_counter = 0

                for p in ps:
                    p.start()

                for p in ps:
                    p.join()

                result = []  # convert list from shared memory
                for r in manager_result:
                    result.append(r)

                ps = []
                manager_result = manager.list([])

                all_ob = concatenate_all_ob_across_examples(
                    [r["all_ob"] for r in result], pa)
                all_action = np.concatenate([r["all_action"] for r in result])
                all_adv = np.concatenate([r["all_adv"] for r in result])

                # Compute the policy gradient on the aggregated batch using the spare (last) learner;
                # its updated parameters are propagated to all workers at the end of the iteration
                grads = pg_learners[pa.batch_size].get_grad(
                    all_ob, all_action, all_adv)  #(states, actions, values)

                grads_all.append(grads)

                all_eprews.extend([r["all_eprews"] for r in result])

                eprews.extend(np.concatenate([r["all_eprews"] for r in result
                                              ]))  # episode total rewards
                eplens.extend(np.concatenate([r["all_eplens"] for r in result
                                              ]))  # episode lengths

                all_slowdown.extend(
                    np.concatenate([r["all_slowdown"] for r in result]))
                all_entropy.extend(
                    np.concatenate([r["all_entropy"] for r in result]))

        # assemble gradients
        grads = grads_all[0]
        for i in xrange(1, len(grads_all)):
            for j in xrange(len(grads)):
                grads[j] += grads_all[i][j]

        # propagate network parameters to others
        params = pg_learners[pa.batch_size].get_params()

        rmsprop_updates_outside(grads, params, accums, pa.lr_rate, pa.rms_rho,
                                pa.rms_eps)

        for i in xrange(pa.batch_size + 1):
            pg_learners[i].set_net_params(params)

        timer_end = time.time()

        print "-----------------"
        print "Iteration: \t %i" % iteration
        print "NumTrajs: \t %i" % len(eprews)
        print "NumTimesteps: \t %i" % np.sum(eplens)
        # print "Loss:     \t %s" % np.mean(loss_all)
        print "MaxRew: \t %s" % np.average([np.max(rew) for rew in all_eprews])
        print "MeanRew: \t %s +- %s" % (np.mean(eprews), np.std(eprews))
        print "MeanSlowdown: \t %s" % np.mean(all_slowdown)
        print "MeanLen: \t %s +- %s" % (np.mean(eplens), np.std(eplens))
        print "MeanEntropy \t %s" % (np.mean(all_entropy))
        print "Elapsed time\t %s" % (timer_end - timer_start), "seconds"
        print "-----------------"

        f = open('log/re_log_' + datetime.now().strftime('%Y-%m-%d_%H:%M:%S'),
                 'w+')
        f.write("-----------------\n")
        f.write("Iteration: \t %i\n" % (iteration))
        f.write("NumTrajs: \t %i\n" % (len(eprews)))
        f.write("NumTimesteps: \t %i\n" % (np.sum(eplens)))
        # f.write("Loss:     \t %s\n".format(loss))
        f.write("MaxRew: \t %s\n" %
                (np.average([np.max(rew) for rew in all_eprews])))
        f.write("MeanRew: \t %s +- %s\n" % (np.mean(eprews), np.std(eprews)))
        f.write("MeanSlowdown: \t %s\n" % (np.mean(all_slowdown)))
        f.write("MeanLen: \t %s +- %s\n" % (np.mean(eplens), np.std(eplens)))
        f.write("MeanEntropy \t %s\n" % ((np.mean(all_entropy))))
        f.write("Elapsed time\t %s seconds\n" % ((timer_end - timer_start)))
        f.write("-----------------\n")
        f.close()

        timer_start = time.time()

        max_rew_lr_curve.append(np.average([np.max(rew)
                                            for rew in all_eprews]))
        mean_rew_lr_curve.append(np.mean(eprews))
        slow_down_lr_curve.append(np.mean(all_slowdown))

        if iteration % pa.output_freq == 0:
            param_file = open(
                pa.output_filename + '_' + str(iteration) + '.pkl', 'wb')
            cPickle.dump(pg_learners[pa.batch_size].get_params(), param_file,
                         -1)
            param_file.close()

            # added by wjchen, to record accuracy and rewards
            sample_file = h5py.File('log/re_record'+str(len(slow_down_lr_curve))\
                                    + datetime.now().strftime('%Y-%m-%d_%H:%M')+'.h5', 'w')
            sample_file.create_dataset('max_rew_lr_curve',
                                       data=max_rew_lr_curve)
            sample_file.create_dataset('mean_rew_lr_curve',
                                       data=mean_rew_lr_curve)
            sample_file.create_dataset('slow_down_lr_curve',
                                       data=slow_down_lr_curve)

            ref_dr = sample_file.create_group('ref_discount_rews')
            for k, v in ref_discount_rews.items():
                ref_dr[k] = np.average(v)

            ref_sd = sample_file.create_group('ref_slow_down')
            for k, v in ref_slow_down.items():
                ref_sd[k] = np.average(np.concatenate(v))
            sample_file.close()
            # print ref_slow_down
            # print ref_discount_rews
            #
            print '\n----Reference Slowdown----'
            for k, v in ref_slow_down.items():
                print "{}: {}".format(k, np.average(np.concatenate(v)))

            print '\n----Reference Discount Reward----'
            for k, v in ref_discount_rews.items():
                print "{}: {}".format(k, np.average(v))

            pa.unseen = True
            slow_down_cdf.launch(pa,
                                 pa.output_filename + '_' + str(iteration) +
                                 '.pkl',
                                 render=True,
                                 plot=True,
                                 repre=repre,
                                 end=end)
            pa.unseen = False
            # test on unseen examples

            plot_lr_curve(pa.output_filename, max_rew_lr_curve,
                          mean_rew_lr_curve, slow_down_lr_curve,
                          ref_discount_rews, ref_slow_down
                          )  # draw average of ref_discount_rews, ref_slow_down
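
concatenate_all_ob_across_examples stacks the per-example observation batches before the policy-gradient step; a minimal sketch, assuming each entry of all_ob is shaped [num_samples, 1, network_input_height, network_input_width]:

import numpy as np

def concatenate_all_ob_across_examples(all_ob, pa):
    # stack observations from all examples along the sample axis (pa kept for interface compatibility)
    return np.concatenate(all_ob, axis=0)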
Example #10
def launch(pa, pg_resume=None, render=False, repre='image', end='no_new_job'):

    # ----------------------------
    print("Preparing for workers...")
    # ----------------------------

    envs = []

    nw_len_seqs, nw_size_seqs = job_distribution.generate_sequence_work(pa, seed=42)

    for ex in range(pa.num_ex):

        print("-prepare for env-", ex)

        env = environment.Env(pa, nw_len_seqs=nw_len_seqs, nw_size_seqs=nw_size_seqs,
                              render=False, repre=repre, end=end)
        env.seq_no = ex
        envs.append(env)

    print("-prepare for worker-")

    sess = tf.Session()

    actor = actor_critic_brain.Actor(sess, n_features=pa.network_input_height*pa.network_input_width,
                                     n_actions=pa.network_output_dim, lr=0.001)
    critic = actor_critic_brain.Critic(sess, n_features=pa.network_input_height*pa.network_input_width,
                                       lr=0.01)
    sess.run(tf.global_variables_initializer())

    if pg_resume is not None:
        pass
        # rl.load_data(pg_resume)

    # accums = init_accums(pg_learners[pa.batch_size])

    # --------------------------------------
    print("Preparing for reference data...")
    # --------------------------------------

    ref_discount_rews, ref_slow_down = slow_down_cdf.launch(pa, pg_resume=None, render=False,
                                                            plot=False, repre=repre, end=end)
    mean_rew_lr_curve = []
    max_rew_lr_curve = []
    slow_down_lr_curve = []

    # --------------------------------------
    print("Start training...")
    # --------------------------------------

    timer_start = time.time()

    for iteration in range(1, pa.num_epochs):

        ex_indices = list(range(pa.num_ex))
        np.random.shuffle(ex_indices)

        eprewlist = []
        eplenlist = []
        slowdownlist = []

        ex_counter = 0
        for ex in range(pa.num_ex):

            ex_idx = ex_indices[ex]

            eprew, eplen, slowdown = get_traj_worker(actor, critic, envs[ex_idx], pa)
            eprewlist.append(eprew)
            eplenlist.append(eplen)
            slowdownlist.append(slowdown)

            ex_counter += 1

            if ex_counter >= pa.batch_size or ex == pa.num_ex - 1:

                print("\n\n")

                ex_counter = 0

        timer_end = time.time()

        print("-----------------")
        print("Iteration: \t %i" % iteration)
        # print("NumTrajs: \t %i" % len(eprewlist))
        print("NumTimesteps: \t %i" % np.sum(eplenlist))
        # print("MaxRew: \t %s" % np.average([np.max(rew) for rew in eprewlist]))
        # print("MeanRew: \t %s +- %s" % (np.mean(eprewlist), np.std(eprewlist)))
        print("MeanSlowdown: \t %s" % np.mean([np.mean(sd) for sd in slowdownlist]))
        print("MeanLen: \t %s +- %s" % (np.mean(eplenlist), np.std(eplenlist)))
        print("Elapsed time\t %s" % (timer_end - timer_start), "seconds")
        print("-----------------")

        timer_start = time.time()

        max_rew_lr_curve.append(np.average([np.max(rew) for rew in eprewlist]))
        mean_rew_lr_curve.append(np.mean(eprewlist))
        slow_down_lr_curve.append(np.mean([np.mean(sd) for sd in slowdownlist]))

        if iteration % pa.output_freq == 0:

            # rl.save_data(pa.output_filename + '_' + str(iteration))

            pa.unseen = True
            # slow_down_cdf.launch(pa, pa.output_filename + '_' + str(iteration) + '.ckpt',
                                # render=False, plot=True, repre=repre, end=end)
            pa.unseen = False
            # test on unseen examples

            plot_lr_curve(pa.output_filename,
                          max_rew_lr_curve, mean_rew_lr_curve, slow_down_lr_curve,
                          ref_discount_rews, ref_slow_down)
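
plot_lr_curve is shared by most of these training loops; a minimal sketch, assuming the reference dicts are keyed by scheduler name (e.g. 'Random', 'SJF', 'Tetris') and hold per-example values whose averages are drawn as horizontal reference lines:

import numpy as np
import matplotlib.pyplot as plt

def plot_lr_curve(output_filename, max_rew_lr_curve, mean_rew_lr_curve,
                  slow_down_lr_curve, ref_discount_rews, ref_slow_down):
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(8, 10))

    # discounted-reward learning curves vs. reference schedulers
    ax1.plot(max_rew_lr_curve, label='PG max')
    ax1.plot(mean_rew_lr_curve, label='PG mean')
    for name, vals in ref_discount_rews.items():
        ax1.axhline(np.average(vals), linestyle='--', label=name)
    ax1.set_xlabel('iteration')
    ax1.set_ylabel('discounted total reward')
    ax1.legend(loc='best')

    # slowdown learning curve vs. reference schedulers
    ax2.plot(slow_down_lr_curve, label='PG mean')
    for name, vals in ref_slow_down.items():
        ax2.axhline(np.average(np.concatenate(vals)), linestyle='--', label=name)
    ax2.set_xlabel('iteration')
    ax2.set_ylabel('job slowdown')
    ax2.legend(loc='best')

    fig.savefig(output_filename + '_lr_curve.pdf')
    plt.close(fig)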