Example #1
import sys
import getopt

import parameters
import pg_su
import v_su
import pg_re
import pg_v_re
import slow_down_cdf


def main():

    pa = parameters.Parameters()  # initialize parameters

    type_exp = 'pg_re'  # 'pg_su', 'pg_su_compact', 'v_su', 'pg_v_re', 'pg_re', 'q_re', 'test'

    pg_resume = None
    v_resume = None
    q_resume = None
    log = None

    render = False

    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            "hi:o:", ["exp_type=",
                      "num_res=",
                      "num_nw=",
                      "simu_len=",
                      "num_ex=",
                      "num_seq_per_batch=",
                      "eps_max_len=",
                      "num_epochs=",
                      "time_horizon=",
                      "res_slot=",
                      "max_job_len=",
                      "max_job_size=",
                      "new_job_rate=",
                      "dist=",
                      "lr_rate=",
                      "ba_size=",
                      "pg_re=",
                      "v_re=",
                      "q_re=",
                      "out_freq=",
                      "ofile=",
                      "log=",
                      "render=",
                      "unseen="])

    except getopt.GetoptError:
        script_usage()
        sys.exit(2)

    # note: getopt above only registers -h, -i, -o and the long options, so the
    # other short flags in the branches below are never actually matched
    for opt, arg in opts:
        if opt == '-h':
            script_usage()
            sys.exit()
        elif opt in ("-e", "--exp_type"):
            type_exp = arg
        elif opt in ("-n", "--num_res"):
            pa.num_res = int(arg)
        elif opt in ("-w", "--num_nw"):
            pa.num_nw = int(arg)
        elif opt in ("-s", "--simu_len"):
            pa.simu_len = int(arg)
        elif opt in ("-n", "--num_ex"):
            pa.num_ex = int(arg)
        elif opt in ("-sp", "--num_seq_per_batch"):
            pa.num_seq_per_batch = int(arg)
        elif opt in ("-el", "--eps_max_len"):
            pa.episode_max_length = int(arg)
        elif opt in ("-ne", "--num_epochs"):
            pa.num_epochs = int(arg)
        elif opt in ("-t", "--time_horizon"):
            pa.time_horizon = int(arg)
        elif opt in ("-rs", "--res_slot"):
            pa.res_slot = int(arg)
        elif opt in ("-ml", "--max_job_len"):
            pa.max_job_len = int(arg)
        elif opt in ("-ms", "--max_job_size"):
            pa.max_job_size = int(arg)
        elif opt in ("-nr", "--new_job_rate"):
            pa.new_job_rate = float(arg)
        elif opt in ("-d", "--dist"):
            pa.discount = float(arg)
        elif opt in ("-l", "--lr_rate"):
            pa.lr_rate = float(arg)
        elif opt in ("-b", "--ba_size"):
            pa.batch_size = int(arg)
        elif opt in ("-p", "--pg_re"):
            pg_resume = arg
        elif opt in ("-v", "--v_re"):
            v_resume = arg
        elif opt in ("-q", "--q_re"):
            q_resume = arg
        elif opt in ("-f", "--out_freq"):
            pa.output_freq = int(arg)
        elif opt in ("-o", "--ofile"):
            pa.output_filename = arg
        elif opt in ("-lg", "--log"):
            log = arg
        elif opt in ("-r", "--render"):
            render = (arg == 'True')
        elif opt in ("-u", "--unseen"):
            pa.generate_unseen = (arg == 'True')
        else:
            script_usage()
            sys.exit()

    pa.compute_dependent_parameters()

    if type_exp == 'pg_su':
        pg_su.launch(pa, pg_resume, render, repre='image', end='all_done')
    elif type_exp == 'v_su':
        v_su.launch(pa, v_resume, render)
    elif type_exp == 'pg_re':
        pg_re.launch(pa, pg_resume, render, repre='image', end='all_done')
    elif type_exp == 'pg_v_re':
        pg_v_re.launch(pa, pg_resume, v_resume, render)
    elif type_exp == 'test':
        # quick_test.launch(pa, pg_resume, render)
        slow_down_cdf.launch(pa, pg_resume, render, True)
    # elif type_exp == 'q_re':
    #     q_re.launch(pa, q_resume, render)
    else:
        print("Error: unknown experiment type " + str(type_exp))
        sys.exit(1)
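
The script_usage() helper called above is not shown in this listing. A minimal sketch, assuming it simply prints the long options registered with getopt (the script name is a placeholder):

def script_usage():
    # hypothetical usage printer; the real helper in the project may differ
    print("Usage: python launcher.py --exp_type=<pg_su|pg_su_compact|v_su|pg_v_re|pg_re|test>"
          " [--num_res=N] [--num_nw=N] [--simu_len=N] [--num_ex=N]"
          " [--num_seq_per_batch=N] [--eps_max_len=N] [--num_epochs=N]"
          " [--time_horizon=N] [--res_slot=N] [--max_job_len=N] [--max_job_size=N]"
          " [--new_job_rate=F] [--dist=F] [--lr_rate=F] [--ba_size=N]"
          " [--pg_re=FILE] [--v_re=FILE] [--q_re=FILE] [--out_freq=N]"
          " [--ofile=NAME] [--log=FILE] [--render=True|False] [--unseen=True|False]")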
Example #2
def launch(pa, pg_resume=None, render=False, repre='image', end='no_new_job'):

    # ----------------------------
    print("Preparing for workers...")
    # ----------------------------

    pg_learners = []
    envs = []

    nw_len_seqs, nw_size_seqs = job_distribution.generate_sequence_work(
        pa, seed=42)

    ### create sequence of environments for each of the num_ex job sets/sequences
    for ex in xrange(pa.num_ex):

        print "-prepare for env-", ex

        env = environment.Env(pa,
                              nw_len_seqs=nw_len_seqs,
                              nw_size_seqs=nw_size_seqs,
                              render=False,
                              repre=repre,
                              end=end)
        env.seq_no = ex
        envs.append(env)

    ### generate sequence of NNs for each batch, each of which is a policy gradient agent
    for ex in xrange(pa.batch_size +
                     1):  # last worker for updating the parameters

        print "-prepare for worker-", ex

        pg_learner = pg_network.PGLearner(pa)

        if pg_resume is not None:
            net_handle = open(pg_resume, 'rb')
            net_params = cPickle.load(net_handle)
            pg_learner.set_net_params(net_params)

        pg_learners.append(pg_learner)

    accums = init_accums(pg_learners[pa.batch_size])

    # --------------------------------------
    print("Preparing for reference data...")
    # --------------------------------------

    ref_discount_rews, ref_slow_down = slow_down_cdf.launch(pa,
                                                            pg_resume=None,
                                                            render=False,
                                                            plot=False,
                                                            repre=repre,
                                                            end=end)
    mean_rew_lr_curve = []
    max_rew_lr_curve = []
    slow_down_lr_curve = []

    # --------------------------------------
    print("Start training...")
    # --------------------------------------

    timer_start = time.time()

    for iteration in xrange(1, pa.num_epochs):

        ### use a separate process for each example; a Manager shares results across processes
        ps = []  # processes
        manager = Manager()  # managing return results
        manager_result = manager.list([])

        ex_indices = range(pa.num_ex)
        np.random.shuffle(ex_indices)

        all_eprews = []
        grads_all = []
        loss_all = []
        eprews = []
        eplens = []
        all_slowdown = []
        all_entropy = []

        ex_counter = 0

        ### for each jobset
        for ex in xrange(pa.num_ex):

            ex_idx = ex_indices[ex]
            ### evaluate several instances of trajectories for set of PG agents
            p = Process(target=get_traj_worker,
                        args=(
                            pg_learners[ex_counter],
                            envs[ex_idx],
                            pa,
                            manager_result,
                        ))
            ps.append(p)

            ex_counter += 1

            ##

            if ex_counter >= pa.batch_size or ex == pa.num_ex - 1:

                print ex, "out of", pa.num_ex

                ex_counter = 0

                for p in ps:
                    p.start()

                for p in ps:
                    p.join()

                result = []  # convert list from shared memory
                for r in manager_result:
                    result.append(r)

                ps = []
                manager_result = manager.list([])

                all_ob = concatenate_all_ob_across_examples(
                    [r["all_ob"] for r in result], pa)
                all_action = np.concatenate([r["all_action"] for r in result])
                all_adv = np.concatenate([r["all_adv"] for r in result])

                # Do policy gradient update step, using the first agent
                # put the new parameter in the last 'worker', then propagate the update at the end
                grads = pg_learners[pa.batch_size].get_grad(
                    all_ob, all_action, all_adv)

                grads_all.append(grads)

                all_eprews.extend([r["all_eprews"] for r in result])

                eprews.extend(np.concatenate([r["all_eprews"] for r in result
                                              ]))  # episode total rewards
                eplens.extend(np.concatenate([r["all_eplens"] for r in result
                                              ]))  # episode lengths

                all_slowdown.extend(
                    np.concatenate([r["all_slowdown"] for r in result]))
                all_entropy.extend(
                    np.concatenate([r["all_entropy"] for r in result]))

        # assemble gradients
        grads = grads_all[0]
        for i in xrange(1, len(grads_all)):
            for j in xrange(len(grads)):
                grads[j] += grads_all[i][j]

        # propagate network parameters to others
        params = pg_learners[pa.batch_size].get_params()

        rmsprop_updates_outside(grads, params, accums, pa.lr_rate, pa.rms_rho,
                                pa.rms_eps)

        for i in xrange(pa.batch_size + 1):
            pg_learners[i].set_net_params(params)

        timer_end = time.time()

        print "-----------------"
        print "Iteration: \t %i" % iteration
        print "NumTrajs: \t %i" % len(eprews)
        print "NumTimesteps: \t %i" % np.sum(eplens)
        # print "Loss:     \t %s" % np.mean(loss_all)
        print "MaxRew: \t %s" % np.average([np.max(rew) for rew in all_eprews])
        print "MeanRew: \t %s +- %s" % (np.mean(eprews), np.std(eprews))
        print "MeanSlowdown: \t %s" % np.mean(all_slowdown)
        print "MeanLen: \t %s +- %s" % (np.mean(eplens), np.std(eplens))
        print "MeanEntropy \t %s" % (np.mean(all_entropy))
        print "Elapsed time\t %s" % (timer_end - timer_start), "seconds"
        print "-----------------"

        timer_start = time.time()

        max_rew_lr_curve.append(np.average([np.max(rew)
                                            for rew in all_eprews]))
        mean_rew_lr_curve.append(np.mean(eprews))
        slow_down_lr_curve.append(np.mean(all_slowdown))

        if iteration % pa.output_freq == 0:
            param_file = open(
                pa.output_filename + '_' + str(iteration) + '.pkl', 'wb')
            cPickle.dump(pg_learners[pa.batch_size].get_params(), param_file,
                         -1)
            param_file.close()

            pa.unseen = True
            slow_down_cdf.launch(pa,
                                 pa.output_filename + '_' + str(iteration) +
                                 '.pkl',
                                 render=False,
                                 plot=True,
                                 repre=repre,
                                 end=end)
            pa.unseen = False
            # test on unseen examples

            plot_lr_curve(pa.output_filename, max_rew_lr_curve,
                          mean_rew_lr_curve, slow_down_lr_curve,
                          ref_discount_rews, ref_slow_down)
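
The helpers init_accums and rmsprop_updates_outside used in Example #2 are not shown. A minimal sketch of the manual RMSProp update they appear to perform, assuming each learner exposes get_params() returning a list of numpy arrays and that the update is applied in place:

import numpy as np

def init_accums(pg_learner):
    # one squared-gradient accumulator per parameter array (assumed layout)
    return [np.zeros_like(p) for p in pg_learner.get_params()]

def rmsprop_updates_outside(grads, params, accums, lr_rate, rms_rho, rms_eps):
    # RMSProp applied outside any single learner:
    #   accum <- rho * accum + (1 - rho) * grad^2
    #   param <- param + lr * grad / sqrt(accum + eps)   (ascent on the policy objective)
    for i in range(len(params)):
        accums[i] = rms_rho * accums[i] + (1.0 - rms_rho) * grads[i] ** 2
        params[i] += lr_rate * grads[i] / np.sqrt(accums[i] + rms_eps)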
Example #3
def launch(pa, pg_resume=None, render=False, repre='image', end='no_new_job'):

    env = environment.Env(pa, render=render, repre=repre, end=end)

    pg_learner = pg_network.PGLearner(pa)

    if pg_resume is not None:
        net_handle = open(pg_resume, 'rb')
        net_params = cPickle.load(net_handle)
        pg_learner.set_net_params(net_params)

    # ----------------------------
    print("Preparing for data...")
    # ----------------------------

    ref_discount_rews, ref_slow_down = slow_down_cdf.launch(pa,
                                                            pg_resume=None,
                                                            render=False,
                                                            plot=False,
                                                            repre=repre,
                                                            end=end)

    mean_rew_lr_curve = []
    max_rew_lr_curve = []
    slow_down_lr_curve = []

    timer_start = time.time()

    for iteration in range(pa.num_epochs):

        all_ob = []
        all_action = []
        all_adv = []
        all_eprews = []
        all_eplens = []
        all_slowdown = []
        all_entropy = []

        # go through all examples
        for ex in range(pa.num_ex):

            # Collect pa.num_seq_per_batch trajectories for this example
            trajs = []

            for i in range(pa.num_seq_per_batch):
                traj = get_traj(pg_learner, env, pa.episode_max_length)
                trajs.append(traj)

            # roll to next example
            env.seq_no = (env.seq_no + 1) % env.pa.num_ex

            all_ob.append(concatenate_all_ob(trajs, pa))

            # Compute discounted sums of rewards
            rets = [discount(traj["reward"], pa.discount) for traj in trajs]
            maxlen = max(len(ret) for ret in rets)
            padded_rets = [
                np.concatenate([ret, np.zeros(maxlen - len(ret))])
                for ret in rets
            ]

            # Compute time-dependent baseline
            baseline = np.mean(padded_rets, axis=0)

            # Compute advantage function
            advs = [ret - baseline[:len(ret)] for ret in rets]
            all_action.append(
                np.concatenate([traj["action"] for traj in trajs]))
            all_adv.append(np.concatenate(advs))

            all_eprews.append(
                np.array([
                    discount(traj["reward"], pa.discount)[0] for traj in trajs
                ]))  # episode total rewards
            all_eplens.append(np.array([len(traj["reward"])
                                        for traj in trajs]))  # episode lengths

            # All Job Stat
            enter_time, finish_time, job_len = process_all_info(trajs)
            finished_idx = (finish_time >= 0)
            all_slowdown.append(
                (finish_time[finished_idx] - enter_time[finished_idx]) /
                job_len[finished_idx])

            # Action prob entropy
            all_entropy.append(np.concatenate([traj["entropy"]]))

        all_ob = concatenate_all_ob_across_examples(all_ob, pa)
        all_action = np.concatenate(all_action)
        all_adv = np.concatenate(all_adv)

        # Do policy gradient update step
        loss = pg_learner.train(all_ob, all_action, all_adv)
        eprews = np.concatenate(all_eprews)  # episode total rewards
        eplens = np.concatenate(all_eplens)  # episode lengths

        all_slowdown = np.concatenate(all_slowdown)

        all_entropy = np.concatenate(all_entropy)

        timer_end = time.time()

        print "-----------------"
        print "Iteration: \t %i" % iteration
        print "NumTrajs: \t %i" % len(eprews)
        print "NumTimesteps: \t %i" % np.sum(eplens)
        print "Loss:     \t %s" % loss
        print "MaxRew: \t %s" % np.average([np.max(rew) for rew in all_eprews])
        print "MeanRew: \t %s +- %s" % (eprews.mean(), eprews.std())
        print "MeanSlowdown: \t %s" % np.mean(all_slowdown)
        print "MeanLen: \t %s +- %s" % (eplens.mean(), eplens.std())
        print "MeanEntropy \t %s" % (np.mean(all_entropy))
        print "Elapsed time\t %s" % (timer_end - timer_start), "seconds"
        print "-----------------"

        timer_start = time.time()

        max_rew_lr_curve.append(np.average([np.max(rew)
                                            for rew in all_eprews]))
        mean_rew_lr_curve.append(eprews.mean())
        slow_down_lr_curve.append(np.mean(all_slowdown))

        if iteration % pa.output_freq == 0:
            param_file = open(
                pa.output_filename + '_' + str(iteration) + '.pkl', 'wb')
            cPickle.dump(pg_learner.get_params(), param_file, -1)
            param_file.close()

            slow_down_cdf.launch(pa,
                                 pa.output_filename + '_' + str(iteration) +
                                 '.pkl',
                                 render=False,
                                 plot=True,
                                 repre=repre,
                                 end=end)

            plot_lr_curve(pa.output_filename, max_rew_lr_curve,
                          mean_rew_lr_curve, slow_down_lr_curve,
                          ref_discount_rews, ref_slow_down)
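
The discount helper used for the returns, baseline and advantages above is also not part of the listing. A small sketch of the discounted cumulative sum it computes (the loop below is only one possible implementation):

import numpy as np

def discount(x, gamma):
    # out[i] = x[i] + gamma * x[i+1] + gamma^2 * x[i+2] + ...
    out = np.zeros(len(x))
    running = 0.0
    for i in reversed(range(len(x))):
        running = x[i] + gamma * running
        out[i] = running
    return out

# e.g. discount(np.array([1., 1., 1.]), 0.5) -> [1.75, 1.5, 1.], and
# discount(traj["reward"], pa.discount)[0] is the episode's total discounted reward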
def launch(pa, pg_resume=None, render=False, repre='image', end='no_new_job'):

    # ----------------------------
    print("Preparing for workers...")
    # ----------------------------

    # dimension of state space
    # NOTE: we have to flatten the images before sending them into the network...
    state_dim   = pa.network_input_height * pa.network_input_width

    # number of actions
    num_actions = pa.network_output_dim

    # initialize the q networks
    sess = tf.Session()
    optimizer = tf.train.RMSPropOptimizer(learning_rate=0.0001, decay=0.9)
    q_learner = DeepQLearner(session=sess, optimizer=optimizer, q_network=build_q_learner, 
                                    state_dim=state_dim, num_actions=num_actions, discount_factor=pa.discount)

    envs = []

    nw_len_seqs = job_distribution.generate_sequence_work(pa, seed=42)

    ### create sequence of environments for each of the num_ex job sets/sequences
    for ex in xrange(pa.num_ex):

        print "-prepare for env-", ex

        env = environment.Env(pa, nw_len_seqs=nw_len_seqs, render=False,
                              repre=repre, end=end)
        env.seq_no = ex
        envs.append(env)

    # ### generate sequence of NNs for each batch, each of which is a a policy gradient agent
    # for ex in xrange(pa.batch_size + 1):  # last worker for updating the parameters

    #     print "-prepare for worker-", ex

    #     pg_learner = pg_network.PGLearner(pa)

    #     if pg_resume is not None:
    #         net_handle = open(pg_resume, 'rb')
    #         net_params = cPickle.load(net_handle)
    #         pg_learner.set_net_params(net_params)

    #     pg_learners.append(pg_learner)

    # accums = init_accums(pg_learners[pa.batch_size])

    # --------------------------------------
    print("Preparing for reference data...")
    # --------------------------------------

    ref_discount_rews, ref_slow_down = slow_down_cdf.launch(pa, pg_resume=None, render=False, plot=False, repre=repre, end=end)
    mean_rew_lr_curve = []
    max_rew_lr_curve = []
    slow_down_lr_curve = []

    # --------------------------------------
    print("Start training...")
    # --------------------------------------

    timer_start = time.time()

    for iteration in xrange(1, pa.num_epochs):

        ### the multiprocessing setup (one process per example plus a Manager) is commented out below; trajectories are collected sequentially
        # ps = []  # threads
        # manager = Manager()  # managing return results
        # manager_result = manager.list([])

        ex_indices = range(pa.num_ex)
        np.random.shuffle(ex_indices)

        all_eprews = []
        loss_all = []
        eprews = []
        eplens = []
        all_slowdown = []

        ex_counter = 0

        ### for each jobset
        for ex in xrange(pa.num_ex):

            ex_idx = ex_indices[ex]

            current_env = envs[ex_idx]

            man_result = []
            get_traj_worker(q_learner, current_env, pa, man_result)

            ### evaluate several instances of trajectories for set of PG agents
            # p = Process(target=get_traj_worker,
            #             args=(pg_learners[ex_counter], envs[ex_idx], pa, manager_result, ))
            # ps.append(p)

            ex_counter += 1

            all_eprews.extend([r["all_eprews"] for r in man_result])
            eprews.extend(np.concatenate([r["all_eprews"] for r in man_result]))  # episode total rewards
            eplens.extend(np.concatenate([r["all_eplens"] for r in man_result]))  # episode lengths

            all_slowdown.extend(np.concatenate([r["all_slowdown"] for r in man_result]))
            
            ##

            # if ex_counter >= pa.batch_size or ex == pa.num_ex - 1:

                # print ex, "out of", pa.num_ex

                # ex_counter = 0

                # for p in ps:
                #     p.start()

                # for p in ps:
                #     p.join()

                # result = []  # convert list from shared memory
                # for r in manager_result:
                #     result.append(r)

                # ps = []
                # manager_result = manager.list([])

                # all_ob = concatenate_all_ob_across_examples([r["all_ob"] for r in result], pa)
                # all_action = np.concatenate([r["all_action"] for r in result])
                # all_adv = np.concatenate([r["all_adv"] for r in result])

                # all_eprews.extend([r["all_eprews"] for r in result])

                # eprews.extend(np.concatenate([r["all_eprews"] for r in result]))  # episode total rewards
                # eplens.extend(np.concatenate([r["all_eplens"] for r in result]))  # episode lengths

                # all_slowdown.extend(np.concatenate([r["all_slowdown"] for r in result]))

        # # assemble gradients
        # grads = grads_all[0]
        # for i in xrange(1, len(grads_all)):
        #     for j in xrange(len(grads)):
        #         grads[j] += grads_all[i][j]

        # # propagate network parameters to others
        # params = pg_learners[pa.batch_size].get_params()

        # rmsprop_updates_outside(grads, params, accums, pa.lr_rate, pa.rms_rho, pa.rms_eps)

        # for i in xrange(pa.batch_size + 1):
        #     pg_learners[i].set_net_params(params)

        timer_end = time.time()

        print "-----------------"
        print "Iteration: \t %i" % iteration
        print "NumTrajs: \t %i" % len(eprews)
        print "NumTimesteps: \t %i" % np.sum(eplens)
        # print "Loss:     \t %s" % np.mean(loss_all)
        print "MaxRew: \t %s" % np.average([np.max(rew) for rew in all_eprews])
        print "MeanRew: \t %s +- %s" % (np.mean(eprews), np.std(eprews))
        print "MeanSlowdown: \t %s" % np.mean(all_slowdown)
        print "MeanLen: \t %s +- %s" % (np.mean(eplens), np.std(eplens))
        print "Elapsed time\t %s" % (timer_end - timer_start), "seconds"
        print "-----------------"

        timer_start = time.time()

        max_rew_lr_curve.append(np.average([np.max(rew) for rew in all_eprews]))
        mean_rew_lr_curve.append(np.mean(eprews))
        slow_down_lr_curve.append(np.mean(all_slowdown))

        if iteration % pa.output_freq == 0:
            # param_file = open(pa.output_filename + '_' + str(iteration) + '.pkl', 'wb')
            # cPickle.dump(pg_learners[pa.batch_size].get_params(), param_file, -1)
            # param_file.close()

            pa.unseen = True
            slow_down_cdf.launch(pa, pa.output_filename + '_' + str(iteration) + '.pkl',
                                 render=False, plot=True, repre=repre, end=end, q_resume=q_learner)
            pa.unseen = False
            # test on unseen examples

            plot_lr_curve(pa.output_filename,
                          max_rew_lr_curve, mean_rew_lr_curve, slow_down_lr_curve,
                          ref_discount_rews, ref_slow_down)
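
In this Q-learning variant get_traj_worker fills man_result directly instead of going through a multiprocessing Manager. Its body is not shown; judging only from the keys read above, a simplified sketch could look like the following (get_traj, discount and process_all_info are assumed helpers, and the slowdown formula mirrors the earlier examples):

import numpy as np

def get_traj_worker(learner, env, pa, result):
    # roll out pa.num_seq_per_batch episodes on this environment
    trajs = [get_traj(learner, env, pa.episode_max_length)
             for _ in range(pa.num_seq_per_batch)]

    eprews = np.array([discount(traj["reward"], pa.discount)[0] for traj in trajs])
    eplens = np.array([len(traj["reward"]) for traj in trajs])

    # slowdown of finished jobs: (finish - enter) / job length
    enter_time, finish_time, job_len = process_all_info(trajs)
    finished = finish_time >= 0
    slowdown = (finish_time[finished] - enter_time[finished]) / job_len[finished]

    result.append({"all_eprews": eprews,
                   "all_eplens": eplens,
                   "all_slowdown": slowdown})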
Example #5
def launch(pa, pg_resume=None, render=False, repre='image', end='no_new_job'):

    # ----------------------------
    print("Preparing for workers...")
    # ----------------------------

    pg_learners = []
    envs = []

    nw_len_seqs, nw_size_seqs = job_distribution.generate_sequence_work(
        pa, seed=42)

    for ex in range(pa.num_ex):

        print("-prepare for env-", ex)

        env = environment.Env(pa,
                              nw_len_seqs=nw_len_seqs,
                              nw_size_seqs=nw_size_seqs,
                              render=False,
                              repre=repre,
                              end=end)
        env.seq_no = ex
        envs.append(env)

    print("-prepare for worker-")

    rl = Policy_Network.PolicyGradient(n_actions=pa.network_output_dim,
                                       n_features=pa.network_input_height *
                                       pa.network_input_width,
                                       learning_rate=0.02)
    print("policy network params count: ", rl.get_num_params())

    # pg_learner = pg_network.PGLearner(pa)

    # if pg_resume is not None:
    # net_handle = open(pg_resume, 'rb')
    # net_params = cPickle.load(net_handle)
    # pg_learner.set_net_params(net_params)

    # pg_learners.append(pg_learner)

    if pg_resume is not None:
        rl.load_data(pg_resume)

    # accums = init_accums(pg_learners[pa.batch_size])

    # --------------------------------------
    print("Preparing for reference data...")
    # --------------------------------------

    ref_discount_rews, ref_slow_down = slow_down_cdf.launch(pa,
                                                            pg_resume=None,
                                                            render=False,
                                                            plot=False,
                                                            repre=repre,
                                                            end=end)
    mean_rew_lr_curve = []
    max_rew_lr_curve = []
    slow_down_lr_curve = []

    # --------------------------------------
    print("Start training...")
    # --------------------------------------

    timer_start = time.time()

    for iteration in range(1, pa.num_epochs):

        ex_indices = list(range(pa.num_ex))
        np.random.shuffle(ex_indices)

        all_eprews = []
        eprews = []
        eplens = []
        all_slowdown = []

        eprewlist = []
        eplenlist = []
        slowdownlist = []
        losslist = []

        ex_counter = 0
        for ex in range(pa.num_ex):

            ex_idx = ex_indices[ex]

            eprew, eplen, slowdown, all_ob, all_action, all_adv = get_traj_worker(
                rl, envs[ex_idx], pa)
            eprewlist.append(eprew)
            eplenlist.append(eplen)
            slowdownlist.append(slowdown)
            loss = rl.learn(all_ob, all_action, all_adv)
            losslist.append(loss)

            ex_counter += 1

            if ex_counter >= pa.batch_size or ex == pa.num_ex - 1:

                print("\n\n")

                ex_counter = 0

                # all_eprews.extend([r["all_eprews"] for r in result])

                # eprews.extend(np.concatenate([r["all_eprews"] for r in result]))  # episode total rewards
                # eplens.extend(np.concatenate([r["all_eplens"] for r in result]))  # episode lengths

                # all_slowdown.extend(np.concatenate([r["all_slowdown"] for r in result]))

                # assemble gradients
                # grads = grads_all[0]
                # for i in range(1, len(grads_all)):
                # for j in range(len(grads)):
                # grads[j] += grads_all[i][j]

                # propagate network parameters to others
                # params = pg_learners[pa.batch_size].get_params()

                # rmsprop_updates_outside(grads, params, accums, pa.lr_rate, pa.rms_rho, pa.rms_eps)

                # for i in range(pa.batch_size + 1):
                # pg_learners[i].set_net_params(params)

        timer_end = time.time()

        print("-----------------")
        print("Iteration: \t %i" % iteration)
        print("NumTrajs: \t %i" % len(eprewlist))
        print("NumTimesteps: \t %i" % np.sum(eplenlist))
        print("Loss:     \t %s" % np.mean(losslist))
        print("MaxRew: \t %s" % np.average([np.max(rew) for rew in eprewlist]))
        print("MeanRew: \t %s +- %s" % (np.mean(eprewlist), np.std(eprewlist)))
        print("MeanSlowdown: \t %s" %
              np.mean([np.mean(sd) for sd in slowdownlist]))
        print("MeanLen: \t %s +- %s" % (np.mean(eplenlist), np.std(eplenlist)))
        print("Elapsed time\t %s" % (timer_end - timer_start), "seconds")
        print("-----------------")

        timer_start = time.time()

        max_rew_lr_curve.append(np.average([np.max(rew) for rew in eprewlist]))
        mean_rew_lr_curve.append(np.mean(eprewlist))
        slow_down_lr_curve.append(np.mean([np.mean(sd)
                                           for sd in slowdownlist]))

        if iteration % pa.output_freq == 0:

            rl.save_data(pa.output_filename + '_' + str(iteration))

            pa.unseen = True
            # slow_down_cdf.launch(pa, pa.output_filename + '_' + str(iteration) + '.ckpt',
            # render=False, plot=True, repre=repre, end=end)
            pa.unseen = False
            # test on unseen examples

            plot_lr_curve(pa.output_filename, max_rew_lr_curve,
                          mean_rew_lr_curve, slow_down_lr_curve,
                          ref_discount_rews, ref_slow_down)
def launch(pa, pg_resume=None, render=True, repre='image', end='no_new_job'):

    f = open('log/re_log_' + datetime.now().strftime('%Y-%m-%d_%H:%M:%S'), 'a')

    env = environment.Env(pa, render=render, repre=repre, end=end)

    pg_learner = pg_network.PGLearner(pa)

    startIdx = 0
    if pg_resume is not None:  # and 're' in pg_resume:
        net_handle = open(pg_resume, 'rb')
        net_params = cPickle.load(net_handle)
        pg_learner.set_net_params(net_params)
        tmp = re.match(r'.+?(\d+).+', pg_resume)
        startIdx = int(tmp.group(1))

    # ----------------------------
    print("\nPreparing for data...")
    # ----------------------------

    ref_discount_rews, ref_slow_down = slow_down_cdf.launch(pa,
                                                            pg_resume=None,
                                                            render=True,
                                                            plot=False,
                                                            repre=repre,
                                                            end=end)

    mean_rew_lr_curve = []
    max_rew_lr_curve = []
    slow_down_lr_curve = []

    timer_start = time.time()

    print("\nStart reinforcement learning...")

    for iteration in xrange(startIdx, pa.num_epochs):

        all_ob = []
        all_action = []
        all_adv = []
        all_eprews = []
        all_eplens = []
        all_slowdown = []
        all_entropy = []

        # go through all examples
        for ex in xrange(pa.num_ex):

            # Collect pa.num_seq_per_batch trajectories for this example
            trajs = []

            for i in xrange(pa.num_seq_per_batch):
                traj = get_traj(pg_learner, env, pa.episode_max_length)
                trajs.append(traj)

            # roll to next example
            env.seq_no = (env.seq_no + 1) % env.pa.num_ex

            all_ob.append(concatenate_all_ob(trajs, pa))

            # Compute discounted sums of rewards
            rets = [discount(traj["reward"], pa.discount) for traj in trajs]
            maxlen = max(len(ret) for ret in rets)
            padded_rets = [
                np.concatenate([ret, np.zeros(maxlen - len(ret))])
                for ret in rets
            ]

            # Compute time-dependent baseline
            baseline = np.mean(padded_rets, axis=0)

            # Compute advantage function
            advs = [ret - baseline[:len(ret)] for ret in rets]
            all_action.append(
                np.concatenate([traj["action"] for traj in trajs]))
            all_adv.append(np.concatenate(advs))

            all_eprews.append(
                np.array([
                    discount(traj["reward"], pa.discount)[0] for traj in trajs
                ]))  # episode total rewards
            all_eplens.append(np.array([len(traj["reward"])
                                        for traj in trajs]))  # episode lengths

            # All Job Stat
            enter_time, finish_time, job_len = process_all_info(trajs)
            finished_idx = (finish_time >= 0)
            all_slowdown.append(
                (finish_time[finished_idx] - enter_time[finished_idx]) /
                job_len[finished_idx])

            # Action prob entropy
            all_entropy.append(np.concatenate([traj["entropy"]]))

        all_ob = concatenate_all_ob_across_examples(all_ob, pa)
        all_action = np.concatenate(all_action)
        all_adv = np.concatenate(all_adv)

        # Do policy gradient update step
        loss = pg_learner.train(all_ob, all_action, all_adv)
        eprews = np.concatenate(all_eprews)  # episode total rewards
        eplens = np.concatenate(all_eplens)  # episode lengths

        all_slowdown = np.concatenate(all_slowdown)

        all_entropy = np.concatenate(all_entropy)

        timer_end = time.time()

        print "-----------------"
        print "Iteration: \t %i" % iteration
        print "NumTrajs: \t %i" % len(eprews)
        print "NumTimesteps: \t %i" % np.sum(eplens)
        print "Loss:     \t %s" % loss
        print "MaxRew: \t %s" % np.average([np.max(rew) for rew in all_eprews])
        print "MeanRew: \t %s +- %s" % (eprews.mean(), eprews.std())
        print "MeanSlowdown: \t %s" % np.mean(all_slowdown)
        print "MeanLen: \t %s +- %s" % (eplens.mean(), eplens.std())
        print "MeanEntropy \t %s" % (np.mean(all_entropy))
        print "Elapsed time\t %s" % (timer_end - timer_start), "seconds"
        print "-----------------"

        f.write("-----------------\n")
        f.write("Iteration: \t %i\n" % (iteration))
        f.write("NumTrajs: \t %i\n" % (len(eprews)))
        f.write("NumTimesteps: \t %i\n" % (np.sum(eplens)))
        f.write("Loss:     \t %s\n".format(loss))
        f.write("MaxRew: \t %s\n" %
                (np.average([np.max(rew) for rew in all_eprews])))
        f.write("MeanRew: \t %s +- %s\n" % (np.mean(eprews), np.std(eprews)))
        f.write("MeanSlowdown: \t %s\n" % (np.mean(all_slowdown)))
        f.write("MeanLen: \t %s +- %s\n" % (np.mean(eplens), np.std(eplens)))
        f.write("MeanEntropy \t %s\n" % ((np.mean(all_entropy))))
        f.write("Elapsed time\t %s seconds\n" % ((timer_end - timer_start)))
        f.write("-----------------\n")
        f.flush()  # keep the log file (opened in append mode above) open across iterations

        timer_start = time.time()

        max_rew_lr_curve.append(np.average([np.max(rew)
                                            for rew in all_eprews]))
        mean_rew_lr_curve.append(eprews.mean())
        slow_down_lr_curve.append(np.mean(all_slowdown))

        if iteration % pa.output_freq == 0:
            param_file = open(
                pa.output_filename + '_' + str(iteration) + '.pkl', 'wb')
            cPickle.dump(pg_learner.get_params(), param_file, -1)
            param_file.close()

            # added by wjchen, to record accuracy and rewards
            sample_file = h5py.File('log/re_record_iter'+str(len(slow_down_lr_curve))\
                                    + datetime.now().strftime('%Y-%m-%d_%H:%M')+'.h5', 'w')
            sample_file.create_dataset('max_rew_lr_curve',
                                       data=max_rew_lr_curve)
            sample_file.create_dataset('mean_rew_lr_curve',
                                       data=mean_rew_lr_curve)
            sample_file.create_dataset('slow_down_lr_curve',
                                       data=slow_down_lr_curve)
            #
            ref_dr = sample_file.create_group('ref_discount_rews')
            for k, v in ref_discount_rews.items():
                ref_dr[k] = np.average(v)
            #
            ref_sd = sample_file.create_group('ref_slow_down')
            for k, v in ref_slow_down.items():
                ref_sd[k] = np.average(np.concatenate(v))
            sample_file.close()

            slow_down_cdf.launch(pa,
                                 pa.output_filename + '_' + str(iteration) +
                                 '.pkl',
                                 render=True,
                                 plot=True,
                                 repre=repre,
                                 end=end)

            plot_lr_curve(pa.output_filename, max_rew_lr_curve,
                          mean_rew_lr_curve, slow_down_lr_curve,
                          ref_discount_rews, ref_slow_down)
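
The per-iteration records written above end up as HDF5 files under log/. A short sketch of reading one of them back (the file name below is a placeholder for whatever re_record_iter... file was actually produced):

import h5py

# hypothetical file name; the real names embed the curve length and a timestamp
with h5py.File('log/re_record_iter10_2020-01-01_12:00.h5', 'r') as record:
    slow_down_lr_curve = record['slow_down_lr_curve'][:]
    mean_rew_lr_curve = record['mean_rew_lr_curve'][:]
    max_rew_lr_curve = record['max_rew_lr_curve'][:]
    # the reference baselines are stored as one scalar per scheduler
    for name, value in record['ref_slow_down'].items():
        print("%s: %s" % (name, value[()]))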
Example #7
import slow_down_cdf
import parameters
import numpy as np
pa = parameters.Parameters()
pa.simu_len = 20
pa.num_ex = 5
ref_rewards, ref_slow_down = slow_down_cdf.launch(pa, render=False)
print '\n---------- Total Discount Rewards ----------'
print 'Random2: ' + str(np.average(ref_rewards['Random2']))
print 'SJF2: ' + str(np.average(ref_rewards['SJF2']))
print 'Packer2: ' + str(np.average(ref_rewards['Packer2']))
print 'Tetris2: ' + str(np.average(ref_rewards['Tetris2']))

# print sd[1]['Random2']
# print np.average(np.concatenate(sd[1]['Random2']))
print '\n---------- Average Job Slowdown ----------'
print 'Random2: ' + str(np.average(np.concatenate(ref_slow_down['Random2'])))
print 'SJF2: ' + str(np.average(np.concatenate(ref_slow_down['SJF2'])))
print 'Packer2: ' + str(np.average(np.concatenate(ref_slow_down['Packer2'])))
print 'Tetris2: ' + str(np.average(np.concatenate(ref_slow_down['Tetris2'])))
Example #8
def launch(pa, pg_resume=None, render=False, repre='image', end='no_new_job'):

    # ----------------------------
    print("Preparing for workers...")
    # ----------------------------

    pg_learners = []
    envs = []

    nw_len_seqs, nw_size_seqs = job_distribution.generate_sequence_work(
        pa, seed=42)

    for ex in xrange(pa.num_ex):  # number of sequences

        print "-prepare for env-", ex

        env = environment.Env(pa,
                              nw_len_seqs=nw_len_seqs,
                              nw_size_seqs=nw_size_seqs,
                              render=True,
                              repre=repre,
                              end=end)
        env.seq_no = ex
        envs.append(env)

    for ex in xrange(pa.batch_size +
                     1):  # last worker for updating the parameters

        print "-prepare for worker-", ex

        pg_learner = pg_network.PGLearner(pa)

        startIndex = 0
        if pg_resume is not None:
            net_handle = open(pg_resume, 'rb')
            net_params = cPickle.load(net_handle)
            pg_learner.set_net_params(net_params)
            startIndex = int(re.search(r'\d+', pg_resume).group())

        pg_learners.append(pg_learner)

    accums = init_accums(pg_learners[pa.batch_size])

    # --------------------------------------
    print("Preparing for reference data...")
    # --------------------------------------

    # Reference runs: get reference discounted rewards and slowdowns from the Random, SJF and Tetris schedulers
    ref_discount_rews, ref_slow_down = slow_down_cdf.launch(pa,
                                                            pg_resume=None,
                                                            render=True,
                                                            plot=False,
                                                            repre=repre,
                                                            end=end)
    mean_rew_lr_curve = []
    max_rew_lr_curve = []
    slow_down_lr_curve = []

    # --------------------------------------
    print("Start training...")
    # --------------------------------------

    timer_start = time.time()

    for iteration in xrange(startIndex, pa.num_epochs):

        ps = []  # threads
        manager = Manager()  # managing return results
        manager_result = manager.list([])

        ex_indices = range(pa.num_ex)
        np.random.shuffle(ex_indices)

        all_eprews = []
        grads_all = []
        loss_all = []
        eprews = []
        eplens = []
        all_slowdown = []
        all_entropy = []

        ex_counter = 0
        for ex in xrange(pa.num_ex):

            ex_idx = ex_indices[ex]
            p = Process(target=get_traj_worker,
                        args=(
                            pg_learners[ex_counter],
                            envs[ex_idx],
                            pa,
                            manager_result,
                        ))
            ps.append(p)

            ex_counter += 1
            # accumulate up to pa.batch_size processes in ps before starting them below
            if ex_counter >= pa.batch_size or ex == pa.num_ex - 1:

                print ex + 1, "out of", pa.num_ex

                ex_counter = 0

                for p in ps:
                    p.start()

                for p in ps:
                    p.join()

                result = []  # convert list from shared memory
                for r in manager_result:
                    result.append(r)

                ps = []
                manager_result = manager.list([])

                all_ob = concatenate_all_ob_across_examples(
                    [r["all_ob"] for r in result], pa)
                all_action = np.concatenate([r["all_action"] for r in result])
                all_adv = np.concatenate([r["all_adv"] for r in result])

                # Do policy gradient update step, using the first agent
                # put the new parameter in the last 'worker', then propagate the update at the end
                grads = pg_learners[pa.batch_size].get_grad(
                    all_ob, all_action, all_adv)  #(states, actions, values)

                grads_all.append(grads)

                all_eprews.extend([r["all_eprews"] for r in result])

                eprews.extend(np.concatenate([r["all_eprews"] for r in result
                                              ]))  # episode total rewards
                eplens.extend(np.concatenate([r["all_eplens"] for r in result
                                              ]))  # episode lengths

                all_slowdown.extend(
                    np.concatenate([r["all_slowdown"] for r in result]))
                all_entropy.extend(
                    np.concatenate([r["all_entropy"] for r in result]))

        # assemble gradients
        grads = grads_all[0]
        for i in xrange(1, len(grads_all)):
            for j in xrange(len(grads)):
                grads[j] += grads_all[i][j]

        # propagate network parameters to others
        params = pg_learners[pa.batch_size].get_params()

        rmsprop_updates_outside(grads, params, accums, pa.lr_rate, pa.rms_rho,
                                pa.rms_eps)

        for i in xrange(pa.batch_size + 1):
            pg_learners[i].set_net_params(params)

        timer_end = time.time()

        print "-----------------"
        print "Iteration: \t %i" % iteration
        print "NumTrajs: \t %i" % len(eprews)
        print "NumTimesteps: \t %i" % np.sum(eplens)
        # print "Loss:     \t %s" % np.mean(loss_all)
        print "MaxRew: \t %s" % np.average([np.max(rew) for rew in all_eprews])
        print "MeanRew: \t %s +- %s" % (np.mean(eprews), np.std(eprews))
        print "MeanSlowdown: \t %s" % np.mean(all_slowdown)
        print "MeanLen: \t %s +- %s" % (np.mean(eplens), np.std(eplens))
        print "MeanEntropy \t %s" % (np.mean(all_entropy))
        print "Elapsed time\t %s" % (timer_end - timer_start), "seconds"
        print "-----------------"

        f = open('log/re_log_' + datetime.now().strftime('%Y-%m-%d_%H:%M:%S'),
                 'w+')
        f.write("-----------------\n")
        f.write("Iteration: \t %i\n" % (iteration))
        f.write("NumTrajs: \t %i\n" % (len(eprews)))
        f.write("NumTimesteps: \t %i\n" % (np.sum(eplens)))
        # f.write("Loss:     \t %s\n".format(loss))
        f.write("MaxRew: \t %s\n" %
                (np.average([np.max(rew) for rew in all_eprews])))
        f.write("MeanRew: \t %s +- %s\n" % (np.mean(eprews), np.std(eprews)))
        f.write("MeanSlowdown: \t %s\n" % (np.mean(all_slowdown)))
        f.write("MeanLen: \t %s +- %s\n" % (np.mean(eplens), np.std(eplens)))
        f.write("MeanEntropy \t %s\n" % ((np.mean(all_entropy))))
        f.write("Elapsed time\t %s seconds\n" % ((timer_end - timer_start)))
        f.write("-----------------\n")
        f.close()

        timer_start = time.time()

        max_rew_lr_curve.append(np.average([np.max(rew)
                                            for rew in all_eprews]))
        mean_rew_lr_curve.append(np.mean(eprews))
        slow_down_lr_curve.append(np.mean(all_slowdown))

        if iteration % pa.output_freq == 0:
            param_file = open(
                pa.output_filename + '_' + str(iteration) + '.pkl', 'wb')
            cPickle.dump(pg_learners[pa.batch_size].get_params(), param_file,
                         -1)
            param_file.close()

            # added by wjchen, to record accuracy and rewards
            sample_file = h5py.File('log/re_record'+str(len(slow_down_lr_curve))\
                                    + datetime.now().strftime('%Y-%m-%d_%H:%M')+'.h5', 'w')
            sample_file.create_dataset('max_rew_lr_curve',
                                       data=max_rew_lr_curve)
            sample_file.create_dataset('mean_rew_lr_curve',
                                       data=mean_rew_lr_curve)
            sample_file.create_dataset('slow_down_lr_curve',
                                       data=slow_down_lr_curve)

            ref_dr = sample_file.create_group('ref_discount_rews')
            for k, v in ref_discount_rews.items():
                ref_dr[k] = np.average(v)

            ref_sd = sample_file.create_group('ref_slow_down')
            for k, v in ref_slow_down.items():
                ref_sd[k] = np.average(np.concatenate(v))
            sample_file.close()
            # print ref_slow_down
            # print ref_discount_rews
            #
            print '\n----Reference Slowdown----'
            for k, v in ref_slow_down.items():
                print "{}: {}".format(k, np.average(np.concatenate(v)))

            print '\n----Reference Discount Reward----'
            for k, v in ref_discount_rews.items():
                print "{}: {}".format(k, np.average(v))

            pa.unseen = True
            slow_down_cdf.launch(pa,
                                 pa.output_filename + '_' + str(iteration) +
                                 '.pkl',
                                 render=True,
                                 plot=True,
                                 repre=repre,
                                 end=end)
            pa.unseen = False
            # test on unseen examples

            plot_lr_curve(pa.output_filename, max_rew_lr_curve,
                          mean_rew_lr_curve, slow_down_lr_curve,
                          ref_discount_rews, ref_slow_down
                          )  # draw average of ref_discount_rews, ref_slow_down
def launch(pa, pg_resume=None, render=False, repre='image', end='no_new_job'):

    # ----------------------------
    print("Preparing for workers...")
    # ----------------------------

    envs = []

    nw_len_seqs, nw_size_seqs = job_distribution.generate_sequence_work(pa, seed=42)

    for ex in range(pa.num_ex):

        print("-prepare for env-", ex)

        env = environment.Env(pa, nw_len_seqs=nw_len_seqs, nw_size_seqs=nw_size_seqs,
                              render=False, repre=repre, end=end)
        env.seq_no = ex
        envs.append(env)

    print("-prepare for worker-")

    sess = tf.Session()

    actor = actor_critic_brain.Actor(sess, n_features=pa.network_input_height*pa.network_input_width,
                                     n_actions=pa.network_output_dim, lr=0.001)
    critic = actor_critic_brain.Critic(sess, n_features=pa.network_input_height*pa.network_input_width,
                                       lr=0.01)
    sess.run(tf.global_variables_initializer())

    if pg_resume is not None:
        pass
        # rl.load_data(pg_resume)

    # accums = init_accums(pg_learners[pa.batch_size])

    # --------------------------------------
    print("Preparing for reference data...")
    # --------------------------------------

    ref_discount_rews, ref_slow_down = slow_down_cdf.launch(pa, pg_resume=None, render=False,
                                                            plot=False, repre=repre, end=end)
    mean_rew_lr_curve = []
    max_rew_lr_curve = []
    slow_down_lr_curve = []

    # --------------------------------------
    print("Start training...")
    # --------------------------------------

    timer_start = time.time()

    for iteration in range(1, pa.num_epochs):

        ex_indices = list(range(pa.num_ex))
        np.random.shuffle(ex_indices)

        eprewlist = []
        eplenlist = []
        slowdownlist = []

        ex_counter = 0
        for ex in range(pa.num_ex):

            ex_idx = ex_indices[ex]

            eprew, eplen, slowdown = get_traj_worker(actor, critic, envs[ex_idx], pa)
            eprewlist.append(eprew)
            eplenlist.append(eplen)
            slowdownlist.append(slowdown)

            ex_counter += 1

            if ex_counter >= pa.batch_size or ex == pa.num_ex - 1:

                print("\n\n")

                ex_counter = 0

        timer_end = time.time()

        print("-----------------")
        print("Iteration: \t %i" % iteration)
        # print("NumTrajs: \t %i" % len(eprewlist))
        print("NumTimesteps: \t %i" % np.sum(eplenlist))
        # print("MaxRew: \t %s" % np.average([np.max(rew) for rew in eprewlist]))
        # print("MeanRew: \t %s +- %s" % (np.mean(eprewlist), np.std(eprewlist)))
        print("MeanSlowdown: \t %s" % np.mean([np.mean(sd) for sd in slowdownlist]))
        print("MeanLen: \t %s +- %s" % (np.mean(eplenlist), np.std(eplenlist)))
        print("Elapsed time\t %s" % (timer_end - timer_start), "seconds")
        print("-----------------")

        timer_start = time.time()

        max_rew_lr_curve.append(np.average([np.max(rew) for rew in eprewlist]))
        mean_rew_lr_curve.append(np.mean(eprewlist))
        slow_down_lr_curve.append(np.mean([np.mean(sd) for sd in slowdownlist]))

        if iteration % pa.output_freq == 0:

            # rl.save_data(pa.output_filename + '_' + str(iteration))

            pa.unseen = True
            # slow_down_cdf.launch(pa, pa.output_filename + '_' + str(iteration) + '.ckpt',
                                # render=False, plot=True, repre=repre, end=end)
            pa.unseen = False
            # test on unseen examples

            plot_lr_curve(pa.output_filename,
                          max_rew_lr_curve, mean_rew_lr_curve, slow_down_lr_curve,
                          ref_discount_rews, ref_slow_down)