Example #1
def train(num=1000):
    agent = PolicyGradient(env.observation_space.shape[0], env.action_space.n)

    # agent.load_model()
    steps = []
    outputs = []

    states = []
    actions = []
    rewards = []

    for i_episode in range(num):
        old_observation = env.reset()
        old_action = agent.get_action(
            np.reshape(old_observation, [1, env.observation_space.shape[0]]))

        step = 0
        while True:

            step = step + 1
            # env.render()
            observation, reward, done, info = env.step(old_action)

            states.append(old_observation)
            actions.append(old_action)
            rewards.append(reward)

            old_observation = observation
            old_action = agent.get_action(
                np.reshape(observation, [1, env.observation_space.shape[0]]))

            if step > 50000:
                steps.append(step)
                break

            if done:
                print("{}:{} steps: {}".format(i_episode, step, reward))
                agent.train(np.array(rewards), np.array(actions),
                            np.array(states))
                steps.append(step)
                agent.save_model()
                break
        # once the average number of steps over the last 100 episodes drops below
        # the threshold, we consider the environment solved and stop training

        score = sum(steps[-100:]) / 100
        if len(steps) >= 100 and score < 275:
            print(
                "---------------------------------------------------------------"
            )
            print("done")
            break
Example #2
def train(num=2000):
    agent = PolicyGradient(env.observation_space.shape[0], env.action_space.n)

    # agent.load_model()
    steps = []
    for i_episode in range(num):
        old_observation = env.reset()
        old_action = agent.get_action(
            np.reshape(old_observation, [1, env.observation_space.shape[0]]))
        done = False
        step = 0

        states = []
        actions = []
        rewards = []

        while not done:
            step = step + 1
            # env.render()
            observation, reward, done, info = env.step(old_action)

            states.append(old_observation)
            actions.append(old_action)
            rewards.append(reward)

            old_observation = observation
            old_action = agent.get_action(
                np.reshape(old_observation,
                           [1, env.observation_space.shape[0]]))

            if done:
                steps.append(step)
                print("{}:{} steps".format(i_episode, step))
                agent.train(np.array(rewards), np.array(actions),
                            np.array(states))
                agent.save_model()
                break

        # once the average number of steps over the last 200 episodes reaches the
        # threshold (195), we consider the environment solved and stop training
        if len(steps) > 200 and sum(steps[-200:]) / 200 >= 195:
            print(sum(steps[-200:]) / 200)
            break
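Both train() variants above assume a prebuilt gym environment and a PolicyGradient agent exposing get_action, train, save_model and load_model. Below is a minimal, hypothetical stub of that interface, for orientation only (the real implementation is not shown here):

import gym
import numpy as np


class PolicyGradient:
    """Hypothetical stand-in showing only the interface the snippets above rely on."""

    def __init__(self, n_features, n_actions):
        self.n_features = n_features
        self.n_actions = n_actions

    def get_action(self, state):
        return np.random.randint(self.n_actions)   # a real agent samples from its policy network

    def train(self, rewards, actions, states):
        pass                                        # a real agent runs a REINFORCE-style update here

    def save_model(self):
        pass

    def load_model(self):
        pass


env = gym.make('CartPole-v0')                       # any discrete-action environment works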
Example #4
import gym
from PolicyGradient import PolicyGradient
import matplotlib.pyplot as plt

env = gym.make('MountainCar-v0')
env.seed(1)
env = env.unwrapped
RENDER = False

RL = PolicyGradient(n_actions=env.action_space.n,
                    n_features=env.observation_space.shape[0],
                    learning_rate=0.02,
                    reward_decay=0.995,
                    print_graph=True)

total_steps = 0

for i_episode in range(1000):

    observation = env.reset()

    while True:
        if RENDER:
            env.render()

        action = RL.choose_action(observation)

        observation_, reward, done, info = env.step(action)

        RL.store_transition(observation, action, reward)
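        # The example is cut off here. A hedged sketch of how this loop is usually
        # completed, assuming the agent buffers rewards in ep_rs and that learn()
        # performs the policy update and returns the discounted, normalized returns
        # (these names are assumptions, not part of the original snippet):
        if done:
            ep_rs_sum = sum(RL.ep_rs)
            print('episode:', i_episode, '  reward:', int(ep_rs_sum))
            vt = RL.learn()                 # update the policy once per episode
            if i_episode == 30:
                plt.plot(vt)                # inspect the return curve of one episode
                plt.xlabel('episode steps')
                plt.ylabel('normalized state-action value')
                plt.show()
            break

        observation = observation_
        total_steps += 1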
Example #6
class RLift:
    def __init__(self,
                 trains,
                 validates,
                 tests,
                 n_feature,
                 n_action,
                 name_data,
                 metric,
                 Zero_is_Action,
                 hidden_layers,
                 train_eval_func,
                 test_eval_func,
                 reward_design='action_depend_baseline',
                 model_base=None,
                 validate_max_steps=1000,
                 size_bag=5000,
                 parallel=False,
                 learner=None,
                 sess=None,
                 coor=None,
                 n_bags=1,
                 isLoad=False):
        self.name_data = name_data
        self.label = 'visit'
        self.n_action = n_action
        self.n_feature = n_feature
        self.Zero_is_Action = Zero_is_Action

        self.isLoad = isLoad
        # self.model_save
        self.saved_model_dir = None

        self.trains = trains
        self.tests = tests
        self.validates = validates
        self.treatment_weight = [1.0]
        self.treatment_keys = ['reward']
        # if self.isLoad is True:
        #     self.sess = tf.Session()
        #
        #     ans = self.load_model(
        #         self.sess, self.trains[0][0].reshape((-1, 8)))
        #     print('ans', ans)
        #     print('..', self.trains[0][0])
        #     exit()
        #
        # else:
        if self.Zero_is_Action is True:
            self.learner = PolicyGradient(
                n_action=self.n_action,
                n_feature=self.n_feature,
                hidden_layers=hidden_layers)
        else:
            self.learner = PolicyGradient(
                n_action=self.n_action - 1,
                n_feature=self.n_feature,
                hidden_layers=hidden_layers)
        self.sess = self.learner.sess

        self.n_train = len(self.trains)
        self.n_test = len(self.tests)
        self.n_validate = len(self.validates)
        self.max_epoch = 1000000
        self.validate_max_steps = validate_max_steps
        self.n_bags = n_bags
        self.output_steps = 10
        self.parallel = parallel
        self.batch_size = 512
        self.size_bag = size_bag
        self.train_eval_func = train_eval_func
        self.test_eval_func = test_eval_func
        self.n_step_repeat = n_bags
        self.metric = metric
        self.reward_design = reward_design
        self.start_time = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())

        # self.saved_model_dir
        # os.mknod("test.txt")
        # self.log_file =
        self.log = open(
            'logs/log_' + time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime()), 'a')

        print('train eval_func', getattr(self.train_eval_func, '__name__'))
        print('test eval_func', getattr(self.test_eval_func, '__name__'))
        # self.start_state = [np.zeros_like(self.trains[0][0]), 0, 0]
        # self.end_state = [np.ones_like(self.trains[0][0]), 0, 0]

        # print('train', self.trains)
        # print('test', self.tests)
        # print('train', self.trains)

    def split_bags(self):
        bags = [[]] * self.n_bags
        self.n_train = len(self.trains)
        indexs = np.arange(self.n_train)
        np.random.shuffle(indexs)
        for i in range(self.n_bags):
            bags[i] = [self.trains[j]
                       for j in range(self.n_train) if (j % self.n_bags) == i]
        return bags

    def next_batch(self):
        '''
        Return:
        batch = [state, real_action, response]
        '''
        indexs = np.arange(self.n_train)
        np.random.shuffle(indexs)
        batch = [self.trains[i] for i in indexs[:self.size_bag]]
        return batch

    def stat_actions(self, actions):
        p = [np.sum(actions == i) for i in
             range(self.n_action)]
        p = np.array(p)
        p = p / np.sum(p)
        return p

    def train(self):
        uplift_validate_max = -1
        max_result_record = None
        uplift_train = -1
        uplift_train_actions = [-1] * self.n_action
        validate_max_steps = self.validate_max_steps
        validate_cur_steps = 0
        for epoch in range(self.max_epoch):
            self.log.write('Epoch' + str(epoch) + '\n')
            bag_rewards = np.zeros((self.n_step_repeat,))
            trans = []
            for eid in range(self.n_step_repeat):
                bag = self.next_batch()
                datas = np.array([data[0] for data in bag])
                # lifts = data[4]
                actions, probs = self.learner.choose_action(
                    datas, mode='random', greedy=0.1)

                if self.Zero_is_Action is False:
                    actions = [x + 1 for x in actions]
                    tmp_probs = []
                    for prob in probs:
                        tmp = [0.0]
                        tmp.extend(prob.tolist())
                        tmp_probs.append(tmp)
                    probs = np.array(tmp_probs)
                records = []
                real_probs = np.ones(self.n_action) * 0.2
                for a, data, p in zip(actions, bag, probs):
                    # Record: [Algo Action, Real Action, {Reaction}, Prob_sample, Prob_Algo]
                    records.append(
                        [int(a), data[1], {'reward': data[2]}, real_probs, p])
                    # exit()
                # record = [[a, bag[i][1], bag[i][2], probs[i][1]]
                #           for i, a in enumerate(actions)]
                # if self.metric == 'same_diff':
                # bag_rewards[eid], lifts_actions, pro_actions, lift_treatment, algo_treatment, algo_control, algo_treatment = self.eval_func(
                #     record=records, n_action=self.n_action)
                # elif self.metric == 'qini':
                # bag_rewards_qini[eid] = qini_Q(
                #     record=record, n_action=self.n_action)
                eval_res = self.train_eval_func(records=records, treatment_weight=self.treatment_weight,
                                                treatment_keys=self.treatment_keys, n_action=self.n_action)

                bag_rewards[eid] = eval_res['reward']
                algo_treatment = eval_res['response']
                algo_control = eval_res['control']
                # variance = eval_res['variance']
                algo_probs = eval_res['algo_action_prob']
                # print('variance', variance)
                algo_action_base = eval_res['algo_action_base']
                # print('eval_res', eval_res['reward'], eval_res['response'],
                #       eval_res['control'], eval_res['algo_action_prob'])
                tran = Transition()
                # print('actions', actions)
                for i, (data, algo_prob) in enumerate(zip(bag, probs)):
                    feature = data[0]
                    algo_action = actions[i]
                    real_action = data[1]
                    response = data[2]
                    lifts = data[4]

                    val_next = bag_rewards[eid]

                    if self.reward_design == 'UMG':
                        # print('reward_design is same diff')
                        # print('algo_action', algo_action, 'real_action', real_action)
                        if algo_action == real_action or real_action == 0:
                            if algo_action == real_action:
                                rwd = (response - algo_control) + val_next
                            elif real_action == 0:
                                rwd = -(response - algo_control) + val_next

                            if self.Zero_is_Action is False:
                                algo_action -= 1

                            tran.append(
                                state=data[0], real_action=real_action, algo_action=algo_action, reward=rwd)

                    elif self.reward_design == 'action_depend_baseline':
                        # if algo_action == real_action or real_action == 0:
                            # print('algo_action_base[0]', algo_action_base[0])
                        if algo_action == real_action or real_action == 0:
                            if algo_action == real_action:
                                rwd = (response -
                                       algo_action_base[0][algo_action]) + val_next
                            elif real_action == 0:
                                rwd = -(response -
                                        algo_action_base[0][algo_action]) + val_next

                            if self.Zero_is_Action is False:
                                algo_action -= 1
                            tran.append(
                                state=data[0], real_action=real_action, algo_action=algo_action, reward=rwd)
                    else:
                        print('Error! Reward Design is not found!')
                        exit()

                trans.append(tran)
            # print('bag_rewards', bag_rewards)
            reward_mean = float(np.mean(bag_rewards))
            reward_std = 1
            print('mean', reward_mean, 'std', max(1e-4, reward_std))
            self.log.write('mean:' + str(reward_mean) +
                           ' std:' + str(max(1e-4, reward_std)) + '\n')
            for tran in trans:
                tran.avg_reward(reward_mean, reward_std)
                self.learner.learn(tran)
            # if we only use RLift, then it need to record the best result by itself.
            # if self.parallel is False:
            eval_res = self.test(eval_func=self.test_eval_func,
                                 datas=self.validates, epoch=epoch, result_output=False)

            print('validate eval_res', eval_res['reward'], eval_res['response'],
                  eval_res['control'], eval_res['algo_action_prob'], eval_res['algo_action_nums'])
            uplift_validate = eval_res['reward']

            print('uplift_validate', uplift_validate)
            if uplift_validate > uplift_validate_max:
                uplift_validate_max = uplift_validate
                if self.saved_model_dir is not None:
                    self.save_model(sess=self.learner.sess)
                    # exit()

                print('uplift_validate_max', uplift_validate_max)
                self.log.write('uplift_validate_max:' +
                               str(uplift_validate_max) + '\n')
                validate_cur_steps = 0
                max_result_record = self.results_calc(eval_func=self.test_eval_func,
                                                      epoch=epoch, result_output=True,
                                                      outputs_list=['test'],
                                                      isMax=True)
                print('max result test')
                uplift_test_max = max_result_record['test']
                # print('uplift_test_max', uplift_test_max)
                print('tests', uplift_test_max['reward'], uplift_test_max['reward'],
                      uplift_test_max['response'], uplift_test_max['control'], uplift_test_max['algo_action_prob'])
                print('max result train')
                # uplift_test_max = max_result_record['train']
                # # print('uplift_test_max', uplift_test_max)
                # print('trains', uplift_test_max['reward'], uplift_test_max['reward'],
                #       uplift_test_max['response'], uplift_test_max['control'], uplift_test_max['algo_action_prob'])
                self.log.write('max_result_record:' +
                               str(max_result_record) + '\n')

            else:
                validate_cur_steps += 1

            if validate_cur_steps >= validate_max_steps:
                print('Training Finished', epoch)
                print('uplift_test_max', uplift_test_max,
                      'uplift_validate_max', uplift_validate_max)
                print('max result', max_result_record)
                self.log.write('Training Finished:' + str(epoch) + '\n')
                self.log.write('uplift_validate_max:' +
                               str(uplift_validate_max) + '\n')
                self.sess.close()
                sys.exit()

            if epoch % self.output_steps == 0 and epoch > 0:
                print('Epoch', epoch)
                print('uplift_test_max', uplift_test_max)
                print('uplift_validate_max', uplift_validate_max)
                print('max result', max_result_record)
                # self.results_calc(eval_func=self.eval_func, outputs_list=[
                #                   'test', 'validate'], epoch=epoch, result_output=False)

    # def results_store(self):
    #     self.results_trains =
    def save_model(self, sess):
        builder = tf.saved_model.builder.SavedModelBuilder(
            self.saved_model_dir)
        # input_x: the observation placeholder tensor of the policy network
        inputs = {'input_x': tf.saved_model.utils.build_tensor_info(
            self.learner.tf_obs)}

        # output: the action-probability tensor we want to serve
        outputs = {'output': tf.saved_model.utils.build_tensor_info(
            self.learner.all_act_prob)}

        signature = tf.saved_model.signature_def_utils.build_signature_def(
            inputs, outputs, 'test_sig_name')
        # signature = None

        # signature_def_map expects a dict; the key must match the one used in load_model
        builder.add_meta_graph_and_variables(
            sess, [tf.saved_model.tag_constants.SERVING],
            signature_def_map={'test_signature': signature})
        builder.save()

    def load_model(self, sess, _x):
        signature_key = 'test_signature'
        input_key = 'input_x'
        output_key = 'output'

        meta_graph_def = tf.saved_model.loader.load(
            sess, [tf.saved_model.tag_constants.SERVING], self.saved_model_dir)
        # Pull the SignatureDef map out of meta_graph_def
        signature = meta_graph_def.signature_def

        # Look up the concrete input/output tensor names from the signature
        x_tensor_name = signature[signature_key].inputs[input_key].name
        y_tensor_name = signature[signature_key].outputs[output_key].name

        # Fetch the tensors and run inference
        x = sess.graph.get_tensor_by_name(x_tensor_name)
        y = sess.graph.get_tensor_by_name(y_tensor_name)

        res = sess.run(y, feed_dict={x: _x})
        print('res shape', res.shape)

        return res

    def results_calc(self, epoch, eval_func, outputs_list=['train', 'test', 'validate'], result_output=False, isMax=False):
        res = {}
        if isMax is True:
            suffix = 'max'
        else:
            suffix = str(epoch).zfill(5)
        if 'train' in outputs_list:
            uplift_train = self.test(eval_func=eval_func,
                                     datas=self.trains, epoch=epoch, output_filename='train_' + suffix, result_output=result_output)
            res['train'] = uplift_train

        if 'validate' in outputs_list:
            uplift_validate = self.test(eval_func=eval_func,
                                        datas=self.validates, epoch=epoch, output_filename='validate_' + suffix, result_output=result_output)
            res['validate'] = uplift_validate

        if 'test' in outputs_list:
            uplift_test = self.test(eval_func=eval_func,
                                    datas=self.tests, epoch=epoch, output_filename='test_' + suffix, result_output=result_output)
            res['test'] = uplift_test

        print('Epoch', epoch)
        self.log.write('Epoch:' + str(epoch) + '\n')
        for name in res.keys():
            print(name, res[name])
        return res

    def test(self, datas, epoch, eval_func, output_filename=None, result_output=False):
        '''
        Test on the datas = [feature, action_real, reaction]
        '''
        real_probs = np.ones(self.n_action) / self.n_action
        records = []
        features = [data[0] for data in datas]
        actions_algo, probs = self.learner.choose_action(
            features, mode='random', greedy=None)

        # print('probs', probs)
        # for i, a in enumerate(actions_algo):
        #     record.append([int(a), datas[i][1], datas[i][2], None])

        # for i, (a, data, prob) in enumerate(zip(actions_algo, datas, probs)):
        for i, (data, action, prob) in enumerate(zip(datas, actions_algo, probs)):
            # action = int(actions_algo[0])
            # prob = probs[0]
            if self.Zero_is_Action is False:
                action = action + 1
                tmp = [0.0]
                tmp.extend(prob.tolist())
                prob = np.array(tmp)

            records.append(
                [action, data[1], {'reward': data[2]}, real_probs, prob])

        # reactions = [datas[i][2] for i, a in enumerate(actions_algo)]
        # reactions = np.array(reactions)
        # reactions = np.reshape(reactions, (len(datas), 1))

        # if output_filename is not None and result_output is True:
        #     name_func = getattr(self.eval_func, '__name__')
        #     np.save('../output/' + output_filename + '_' + name_func + '_' + self.start_time,
        #             np.hstack((probs, reactions, actions_real)))
        #     print(output_filename + '_' + name_func + ' saved')
        ans = eval_func(records=records, treatment_weight=self.treatment_weight,
                        treatment_keys=self.treatment_keys, n_action=self.n_action)
        return ans
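For orientation, a minimal wiring sketch for RLift. The sample format ([feature, real_action, response, _, lift]) and the eval-function signature are inferred from the methods above; the dummy eval function, the layer sizes and every concrete value are placeholders, not the authors' configuration (note the constructor also expects a logs/ directory to exist):

import numpy as np


def dummy_eval(records, treatment_weight, treatment_keys, n_action):
    # Returns the keys that train()/test() read from the evaluation result above.
    return {'reward': 0.0, 'response': 0.0, 'control': 0.0,
            'algo_action_prob': np.ones(n_action) / n_action,
            'algo_action_base': np.zeros((1, n_action)),
            'algo_action_nums': np.zeros(n_action)}


# Each sample: [feature_vector, real_action, response, <unused>, lift]
data = [[np.random.rand(8), np.random.randint(5), float(np.random.rand()), None, 0.0]
        for _ in range(100)]

lift = RLift(trains=data, validates=data, tests=data,
             n_feature=8, n_action=5, name_data='demo', metric='same_diff',
             Zero_is_Action=True, hidden_layers=[32, 32],
             train_eval_func=dummy_eval, test_eval_func=dummy_eval)
# lift.train()   # runs until the validation early-stopping criterion fires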
Example #7
import gym
from PolicyGradient import PolicyGradient
import matplotlib.pyplot as plt

DISPLAY_REWARD_THRESHOLD = -2000
RENDER = False

env = gym.make('MountainCar-v0')
env.seed(1)
env = env.unwrapped

print("action space:", env.action_space)
print("observation space:", env.observation_space, " , high:", env.observation_space.high, " , low:", env.observation_space.low)

pg = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=env.observation_space.shape[0],
    learning_rate=0.02,
    reward_decay=0.995,
)

for i_episode in range(1000):
    observation = env.reset()
    while True:
        if RENDER:
            env.render()
        action = pg.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        pg.store_transition(observation, action, reward)

        if done:
            ep_rs_sum = sum(pg.ep_rs)
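            # Truncated in the original. A hedged continuation, assuming the usual
            # running-reward bookkeeping and that pg.learn() performs the update and
            # returns the discounted, normalized returns (assumptions, not shown above):
            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = True               # start rendering once returns look good enough
            print("episode:", i_episode, "  reward:", int(running_reward))
            vt = pg.learn()
            break

        observation = observation_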
Example #8
                    metavar='G',
                    help='learning rate (default: 1e-4)')
parser.add_argument('--batch_size',
                    type=int,
                    default=5,
                    metavar='G',
                    help='Every how many episodes to do a param update')
parser.add_argument('--seed',
                    type=int,
                    default=87,
                    metavar='N',
                    help='random seed (default: 87)')

args = parser.parse_args()

policy = PolicyGradient()

env = gym.make('trade-v0')
env.seed(args.seed)
torch.manual_seed(args.seed)

optimizer = optim.RMSprop(policy.parameters(),
                          lr=args.learning_rate,
                          weight_decay=args.decay_rate)

# # check & load pretrain model
# if os.path.isfile('pg_params.pkl'):
#     print('Load Policy Network parameters ...')
#     policy.load_state_dict(torch.load('pg_params.pkl'))
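The snippet stops after the optimizer is created. A minimal sketch of the REINFORCE-style update such a setup typically feeds into; finish_episode, rewards, log_probs and gamma are illustrative names, not part of the original code:

def finish_episode(rewards, log_probs, gamma=0.99):
    # Compute discounted returns, normalize them, then apply the policy-gradient loss.
    returns, R = [], 0.0
    for r in reversed(rewards):
        R = r + gamma * R
        returns.insert(0, R)
    returns = torch.tensor(returns)
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)

    loss = torch.stack([-lp * ret for lp, ret in zip(log_probs, returns)]).sum()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()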

Example #9
class subScheduler():
    def __init__(self, path_name, surffix, path_surffix):
        """
        parameters set
        """
        self.NUM_NODES = params['number of nodes in the cluster']
        self.NUM_APPS = 7
        # self.NUM_CONTAINERS = params['number of containers']

        # self.sim = Simulator()
        # self.env = LraClusterEnv(num_nodes=self.NUM_NODES)

        ckpt_path_1 = path_surffix + path_name + "_1" + "/model.ckpt"
        ckpt_path_2 = path_surffix + path_name + "_2" + "/model.ckpt"
        ckpt_path_3 = path_surffix + path_name + "_3" + "/model.ckpt"
        self.nodes_per_group = int(params['nodes per group'])
        # self.number_of_node_groups = int(self.NUM_NODES / self.nodes_per_group)
        """
        Build Network
        """
        self.n_actions = self.nodes_per_group  #: 3 nodes per group
        self.n_features = int(self.n_actions * self.NUM_APPS + 1 +
                              self.NUM_APPS)  #: 29

        self.RL_1 = PolicyGradient(n_actions=self.n_actions,
                                   n_features=self.n_features,
                                   learning_rate=params['learning rate'],
                                   suffix=surffix + '1')

        self.RL_2 = PolicyGradient(n_actions=self.n_actions,
                                   n_features=self.n_features,
                                   learning_rate=params['learning rate'],
                                   suffix=surffix + '2')

        self.RL_3 = PolicyGradient(n_actions=self.n_actions,
                                   n_features=self.n_features,
                                   learning_rate=params['learning rate'],
                                   suffix=surffix + '3')

        self.RL_1.restore_session(ckpt_path_1)
        self.RL_2.restore_session(ckpt_path_2)
        self.RL_3.restore_session(ckpt_path_3)

    def batch_data(self, rnd_array):
        index_data = []
        for i in range(7):
            index_data.extend([i] * rnd_array[i])
        return rnd_array, index_data

    def get_total_tput(self, rnd_array):

        # assert sum(rnd_array) == 81
        source_batch, index_data = self.batch_data(
            rnd_array.astype(int))  # index_data = [0,1,2,0,1,2]
        env = LraClusterEnv(num_nodes=self.NUM_NODES, ifSimulator=False)
        observation = env.reset().copy()  # (9,9)
        """
        Episode
        """
        for inter_episode_index in range(int(sum(rnd_array))):
            # observation_new_list = []
            # observation[:, index_data[inter_episode_index]] += 1
            source_batch[index_data[inter_episode_index]] -= 1
            observation, mapping_index = handle_constraint(
                observation, self.NUM_NODES)

            assert len(mapping_index) > 0

            observation_first_layer = np.empty([0, env.NUM_APPS], int)
            number_of_first_layer_nodes = int(self.NUM_NODES /
                                              self.nodes_per_group)  # 9
            for i in range(self.nodes_per_group):
                observation_new = np.sum(
                    observation[i * number_of_first_layer_nodes:(i + 1) *
                                number_of_first_layer_nodes],
                    0).reshape(1, -1)
                observation_first_layer = np.append(observation_first_layer,
                                                    observation_new, 0)
            observation_first_layer[:, index_data[inter_episode_index]] += 1
            observation_first_layer = np.append(
                np.append(observation_first_layer,
                          index_data[inter_episode_index]),
                np.array(source_batch)).reshape(1, -1)
            # observation_first_layer = np.array(observation_first_layer).reshape(1, -1)
            # observation_first_layer = np.append(observation_first_layer, index_data[inter_episode_index]).reshape(1, -1)
            # observation_first_layer = np.append(observation_first_layer, np.array(source_batch)).reshape(1, -1)  # (1,29)

            action_1, prob_weights = self.RL_1.choose_action_determine(
                observation_first_layer)

            observation_copy = observation
            observation_copy = observation_copy[action_1 *
                                                number_of_first_layer_nodes:
                                                (action_1 + 1) *
                                                number_of_first_layer_nodes]
            number_of_second_layer_nodes = int(number_of_first_layer_nodes /
                                               self.nodes_per_group)  # 9/3 = 3
            observation_second_layer = np.empty([0, env.NUM_APPS], int)
            for i in range(self.nodes_per_group):
                observation_new = np.sum(
                    observation_copy[i * number_of_second_layer_nodes:(i + 1) *
                                     number_of_second_layer_nodes],
                    0).reshape(1, -1)
                observation_second_layer = np.append(observation_second_layer,
                                                     observation_new, 0)
            observation_second_layer[:, index_data[inter_episode_index]] += 1
            observation_second_layer = np.append(
                np.append(observation_second_layer,
                          index_data[inter_episode_index]),
                np.array(source_batch)).reshape(1, -1)

            # observation_second_layer = np.array(observation_second_layer).reshape(1, -1)
            # observation_second_layer = np.append(observation_second_layer, index_data[inter_episode_index]).reshape(1, -1)
            # observation_second_layer = np.append(observation_second_layer, np.array(source_batch)).reshape(1, -1)
            action_2, prob_weights = self.RL_2.choose_action_determine(
                observation_second_layer)

            observation_copy = observation_copy[action_2 *
                                                number_of_second_layer_nodes:
                                                (action_2 + 1) *
                                                number_of_second_layer_nodes]
            number_of_third_layer_nodes = int(number_of_second_layer_nodes /
                                              self.nodes_per_group)  # 3/3 = 1
            observation_third_layer = np.empty([0, env.NUM_APPS], int)
            for i in range(self.nodes_per_group):
                observation_new = np.sum(
                    observation_copy[i * number_of_third_layer_nodes:(i + 1) *
                                     number_of_third_layer_nodes],
                    0).reshape(1, -1)
                observation_third_layer = np.append(observation_third_layer,
                                                    observation_new, 0)
            observation_third_layer[:, index_data[inter_episode_index]] += 1
            observation_third_layer = np.append(
                np.append(observation_third_layer,
                          index_data[inter_episode_index]),
                np.array(source_batch)).reshape(1, -1)

            # observation_third_layer = np.array(observation_third_layer).reshape(1, -1)
            # observation_third_layer = np.append(observation_third_layer, index_data[inter_episode_index]).reshape(1, -1)
            # observation_third_layer = np.append(observation_third_layer, np.array(source_batch)).reshape(1, -1)

            action_3, prob_weights = self.RL_3.choose_action_determine(
                observation_third_layer)

            final_decision = action_1 * number_of_first_layer_nodes + action_2 * number_of_second_layer_nodes + action_3 * number_of_third_layer_nodes

            appid = index_data[inter_episode_index]
            # observation_ = env.step(action*nodes_per_group + Node_index[action], appid)
            observation_ = env.step(mapping_index[final_decision], appid)
            observation = observation_.copy()  # (9,9)
        """
        After an entire allocation, calculate total throughput, reward
        """
        state = env.get_tput_total_env()
        # assert sum(sum(self.env.state)) == 81

        return state
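A minimal usage sketch for subScheduler, assuming params is the module-level dict the class reads and that checkpoints in the style of Example #10 already exist under path_surffix; the path name and the container counts here are placeholders:

import numpy as np

params = {'number of nodes in the cluster': 27,
          'nodes per group': 3,
          'learning rate': 0.001}

scheduler = subScheduler(path_name='my_experiment',
                         surffix='0',
                         path_surffix='./checkpoint/')
rnd_array = np.random.multinomial(81, [1 / 7.0] * 7)   # 81 containers spread over the 7 apps
total_tput = scheduler.get_total_tput(rnd_array)
print('total throughput:', total_tput)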
Example #10
def train(params):
    """
    parameters set
    """
    NUM_NODES = params['number of nodes in the cluster']
    env = LraClusterEnv(num_nodes=NUM_NODES)
    batch_size = params['batch_size']
    ckpt_path_1 = "./checkpoint/" + params['path'] + "_1" + "/model.ckpt"
    ckpt_path_2 = "./checkpoint/" + params['path'] + "_2" + "/model.ckpt"
    ckpt_path_3 = "./checkpoint/" + params['path'] + "_3" + "/model.ckpt"
    np_path = "./checkpoint/" + params['path'] + "/optimal_file_name.npz"
    Recover = params['recover']
    nodes_per_group = int(params['nodes per group'])
    replay_size = params['replay size']
    training_times_per_episode = 1
    UseExperienceReplay = False
    """
    Build Network
    """
    n_actions = nodes_per_group  #: 3 nodes per group
    n_features = int(n_actions * env.NUM_APPS + 1 +
                     env.NUM_APPS)  #: 3*7+1+7 = 29
    RL_1 = PolicyGradient(n_actions=n_actions,
                          n_features=n_features,
                          learning_rate=params['learning rate'],
                          suffix=str(params['NUM_CONTAINERS_start']) + '1')

    RL_2 = PolicyGradient(n_actions=n_actions,
                          n_features=n_features,
                          learning_rate=params['learning rate'],
                          suffix=str(params['NUM_CONTAINERS_start']) + '2')

    RL_3 = PolicyGradient(n_actions=n_actions,
                          n_features=n_features,
                          learning_rate=params['learning rate'],
                          suffix=str(params['NUM_CONTAINERS_start']) + '3')
    sim = Simulator()
    """
    Training
    """
    start_time = time.time()
    global_start_time = start_time

    observation_episode_1, action_episode_1, reward_episode_1 = [], [], []
    observation_episode_2, action_episode_2, reward_episode_2 = [], [], []
    observation_episode_3, action_episode_3, reward_episode_3 = [], [], []
    epoch_i = 0
    entropy_weight = 0.01
    names = locals()
    for i in range(0, 10):
        names['highest_tput_' + str(i)] = 0.1
        names['observation_optimal_1_' + str(i)] = []
        names['action_optimal_1_' + str(i)] = []
        names['reward_optimal_1_' + str(i)] = []
        names['number_optimal_' + str(i)] = []
        names['optimal_range_' + str(i)] = 1.2

    for i in range(0, 10):
        names['observation_optimal_2_' + str(i)] = []
        names['action_optimal_2_' + str(i)] = []
        names['reward_optimal_2_' + str(i)] = []

    for i in range(0, 10):
        names['observation_optimal_3_' + str(i)] = []
        names['action_optimal_3_' + str(i)] = []
        names['reward_optimal_3_' + str(i)] = []

    # TODO: delete this range

    def store_episode_1(observations, actions):
        observation_episode_1.append(observations)
        action_episode_1.append(actions)

    def store_episode_2(observations, actions):
        observation_episode_2.append(observations)
        action_episode_2.append(actions)

    def store_episode_3(observations, actions):
        observation_episode_3.append(observations)
        action_episode_3.append(actions)

    NUM_CONTAINERS_start = params['NUM_CONTAINERS_start']

    while epoch_i < params['epochs']:

        NUM_CONTAINERS = np.random.randint(NUM_CONTAINERS_start + 1,
                                           NUM_CONTAINERS_start + 11)
        tput_origimal_class = int(NUM_CONTAINERS - NUM_CONTAINERS_start - 1)
        source_batch_, index_data = batch_data(
            NUM_CONTAINERS, env.NUM_APPS)  # index_data = [0,1,2,0,1,2]
        observation = env.reset().copy()  # (9,9)
        source_batch = source_batch_.copy()

        for inter_episode_index in range(NUM_CONTAINERS):

            appid = index_data[inter_episode_index]
            observation_ = env.step(inter_episode_index % NUM_NODES,
                                    appid)  # load-balancing
            observation = observation_.copy()  # (9,9)
        tput_state = env.get_tput_total_env()
        tput_baseline = (sim.predict(tput_state.reshape(-1, env.NUM_APPS)) *
                         tput_state).sum() / NUM_CONTAINERS
        """
        Episode
        """
        observation = env.reset().copy()
        for inter_episode_index in range(NUM_CONTAINERS):
            source_batch[index_data[inter_episode_index]] -= 1
            observation, mapping_index = handle_constraint(
                observation.copy(), NUM_NODES)
            assert len(mapping_index) > 0

            observation_first_layer = np.empty([0, env.NUM_APPS], int)
            number_of_first_layer_nodes = int(NUM_NODES / nodes_per_group)  # 9
            for i in range(nodes_per_group):
                observation_new = np.sum(
                    observation[i * number_of_first_layer_nodes:(i + 1) *
                                number_of_first_layer_nodes],
                    0).reshape(1, -1)
                observation_first_layer = np.append(observation_first_layer,
                                                    observation_new, 0)
            observation_first_layer[:, index_data[inter_episode_index]] += 1
            observation_first_layer = np.array(
                observation_first_layer).reshape(1, -1)
            observation_first_layer = np.append(
                observation_first_layer,
                index_data[inter_episode_index]).reshape(1, -1)
            observation_first_layer = np.append(
                observation_first_layer,
                np.array(source_batch)).reshape(1, -1)  # (1,29)
            action_1, prob_weights = RL_1.choose_action(
                observation_first_layer.copy())

            observation_copy = observation.copy()
            observation_copy = observation_copy[action_1 *
                                                number_of_first_layer_nodes:
                                                (action_1 + 1) *
                                                number_of_first_layer_nodes]
            number_of_second_layer_nodes = int(number_of_first_layer_nodes /
                                               nodes_per_group)  # 9/3 = 3
            observation_second_layer = np.empty([0, env.NUM_APPS], int)
            for i in range(nodes_per_group):
                observation_new = np.sum(
                    observation_copy[i * number_of_second_layer_nodes:(i + 1) *
                                     number_of_second_layer_nodes],
                    0).reshape(1, -1)
                observation_second_layer = np.append(observation_second_layer,
                                                     observation_new, 0)
            observation_second_layer[:, index_data[inter_episode_index]] += 1
            observation_second_layer = np.array(
                observation_second_layer).reshape(1, -1)
            observation_second_layer = np.append(
                observation_second_layer,
                index_data[inter_episode_index]).reshape(1, -1)
            observation_second_layer = np.append(
                observation_second_layer,
                np.array(source_batch)).reshape(1, -1)
            action_2, prob_weights = RL_2.choose_action(
                observation_second_layer.copy())

            observation_copy = observation_copy[action_2 *
                                                number_of_second_layer_nodes:
                                                (action_2 + 1) *
                                                number_of_second_layer_nodes]
            number_of_third_layer_nodes = int(number_of_second_layer_nodes /
                                              nodes_per_group)  # 3/3 = 1
            observation_third_layer = np.empty([0, env.NUM_APPS], int)
            for i in range(nodes_per_group):
                observation_new = np.sum(
                    observation_copy[i * number_of_third_layer_nodes:(i + 1) *
                                     number_of_third_layer_nodes],
                    0).reshape(1, -1)
                observation_third_layer = np.append(observation_third_layer,
                                                    observation_new, 0)
            observation_third_layer[:, index_data[inter_episode_index]] += 1
            observation_third_layer = np.array(
                observation_third_layer).reshape(1, -1)
            observation_third_layer = np.append(
                observation_third_layer,
                index_data[inter_episode_index]).reshape(1, -1)
            observation_third_layer = np.append(
                observation_third_layer,
                np.array(source_batch)).reshape(1, -1)
            action_3, prob_weights = RL_3.choose_action(
                observation_third_layer.copy())

            final_decision = action_1 * number_of_first_layer_nodes + action_2 * number_of_second_layer_nodes + action_3 * number_of_third_layer_nodes

            appid = index_data[inter_episode_index]
            observation_ = env.step(mapping_index[final_decision], appid)

            store_episode_1(observation_first_layer, action_1)
            store_episode_2(observation_second_layer, action_2)
            store_episode_3(observation_third_layer, action_3)
            observation = observation_.copy()  # (9,9)
        """
        After an entire allocation, calculate total throughput, reward
        """
        tput_state = env.get_tput_total_env()
        tput = (sim.predict(tput_state.reshape(-1, env.NUM_APPS)) *
                tput_state).sum() / NUM_CONTAINERS

        RL_1.store_tput_per_episode(tput, epoch_i)
        assert (np.sum(env.state, axis=1) <=
                params['container_limitation per node']).all()
        assert sum(sum(env.state)) == NUM_CONTAINERS

        reward_ratio = (tput - tput_baseline)

        reward_episode_1 = [reward_ratio] * len(observation_episode_1)
        reward_episode_2 = [reward_ratio] * len(observation_episode_2)
        reward_episode_3 = [reward_ratio] * len(observation_episode_3)

        RL_1.store_training_samples_per_episode(observation_episode_1,
                                                action_episode_1,
                                                reward_episode_1)
        RL_2.store_training_samples_per_episode(observation_episode_2,
                                                action_episode_2,
                                                reward_episode_2)
        RL_3.store_training_samples_per_episode(observation_episode_3,
                                                action_episode_3,
                                                reward_episode_3)
        """
        check_tput_quality(tput)
        """
        if names['highest_tput_' + str(tput_origimal_class)] < tput:
            highest_tput_original = names['highest_tput_' +
                                          str(tput_origimal_class)]
            optimal_range_original = names['optimal_range_' +
                                           str(tput_origimal_class)]
            names['highest_tput_' + str(tput_origimal_class)] = tput
            names['number_optimal_' + str(tput_origimal_class)] = []
            names['observation_optimal_1_' + str(tput_origimal_class)], names[
                'action_optimal_1_' + str(tput_origimal_class)], names[
                    'reward_optimal_1_' +
                    str(tput_origimal_class)] = [], [], []
            names['observation_optimal_2_' + str(tput_origimal_class)], names[
                'action_optimal_2_' + str(tput_origimal_class)], names[
                    'reward_optimal_2_' +
                    str(tput_origimal_class)] = [], [], []
            names['observation_optimal_3_' + str(tput_origimal_class)], names[
                'action_optimal_3_' + str(tput_origimal_class)], names[
                    'reward_optimal_3_' +
                    str(tput_origimal_class)] = [], [], []
            if UseExperienceReplay:
                names['observation_optimal_1_' +
                      str(tput_origimal_class)].extend(observation_episode_1)
                names['action_optimal_1_' +
                      str(tput_origimal_class)].extend(action_episode_1)
                names['reward_optimal_1_' +
                      str(tput_origimal_class)].extend(reward_episode_1)

                names['observation_optimal_2_' +
                      str(tput_origimal_class)].extend(observation_episode_2)
                names['action_optimal_2_' +
                      str(tput_origimal_class)].extend(action_episode_2)
                names['reward_optimal_2_' +
                      str(tput_origimal_class)].extend(reward_episode_2)

                names['observation_optimal_3_' +
                      str(tput_origimal_class)].extend(observation_episode_3)
                names['action_optimal_3_' +
                      str(tput_origimal_class)].extend(action_episode_3)
                names['reward_optimal_3_' +
                      str(tput_origimal_class)].extend(reward_episode_3)

            names['number_optimal_' +
                  str(tput_origimal_class)].append(NUM_CONTAINERS)
            names['optimal_range_' + str(tput_origimal_class)] = min(
                1.2, tput / (highest_tput_original / optimal_range_original))
        elif names['highest_tput_' + str(tput_origimal_class)] < tput * names[
                'optimal_range_' + str(tput_origimal_class)]:

            if UseExperienceReplay:

                names['observation_optimal_1_' +
                      str(tput_origimal_class)].extend(observation_episode_1)
                names['action_optimal_1_' +
                      str(tput_origimal_class)].extend(action_episode_1)
                names['reward_optimal_1_' +
                      str(tput_origimal_class)].extend(reward_episode_1)

                names['observation_optimal_2_' +
                      str(tput_origimal_class)].extend(observation_episode_2)
                names['action_optimal_2_' +
                      str(tput_origimal_class)].extend(action_episode_2)
                names['reward_optimal_2_' +
                      str(tput_origimal_class)].extend(reward_episode_2)

                names['observation_optimal_3_' +
                      str(tput_origimal_class)].extend(observation_episode_3)
                names['action_optimal_3_' +
                      str(tput_origimal_class)].extend(action_episode_3)
                names['reward_optimal_3_' +
                      str(tput_origimal_class)].extend(reward_episode_3)

            names['number_optimal_' +
                  str(tput_origimal_class)].append(NUM_CONTAINERS)

        observation_episode_1, action_episode_1, reward_episode_1 = [], [], []
        observation_episode_2, action_episode_2, reward_episode_2 = [], [], []
        observation_episode_3, action_episode_3, reward_episode_3 = [], [], []
        """
        Each batch, RL.learn()
        """
        if (epoch_i % batch_size == 0) and (epoch_i > 1):
            if UseExperienceReplay:
                for replay_class in range(0, 10):

                    reward_optimal_1 = names['reward_optimal_1_' +
                                             str(replay_class)]
                    observation_optimal_1 = names['observation_optimal_1_' +
                                                  str(replay_class)]
                    action_optimal_1 = names['action_optimal_1_' +
                                             str(replay_class)]

                    reward_optimal_2 = names['reward_optimal_2_' +
                                             str(replay_class)]
                    observation_optimal_2 = names['observation_optimal_2_' +
                                                  str(replay_class)]
                    action_optimal_2 = names['action_optimal_2_' +
                                             str(replay_class)]

                    reward_optimal_3 = names['reward_optimal_3_' +
                                             str(replay_class)]
                    observation_optimal_3 = names['observation_optimal_3_' +
                                                  str(replay_class)]
                    action_optimal_3 = names['action_optimal_3_' +
                                             str(replay_class)]

                    number_optimal = names['number_optimal_' +
                                           str(replay_class)]

                    buffer_size = int(len(number_optimal))
                    assert sum(
                        number_optimal) * training_times_per_episode == len(
                            action_optimal_1)

                    if buffer_size < replay_size:
                        # TODO: if layers changes, training_times_per_episode should be modified
                        RL_1.ep_obs.extend(observation_optimal_1)
                        RL_1.ep_as.extend(action_optimal_1)
                        RL_1.ep_rs.extend(reward_optimal_1)

                        RL_2.ep_obs.extend(observation_optimal_2)
                        RL_2.ep_as.extend(action_optimal_2)
                        RL_2.ep_rs.extend(reward_optimal_2)

                        RL_3.ep_obs.extend(observation_optimal_3)
                        RL_3.ep_as.extend(action_optimal_3)
                        RL_3.ep_rs.extend(reward_optimal_3)

                    else:
                        replay_index = np.random.choice(range(buffer_size),
                                                        size=replay_size,
                                                        replace=False)
                        for replay_id in range(replay_size):
                            replace_start = replay_index[replay_id]
                            start_location = sum(number_optimal[:replace_start]
                                                 ) * training_times_per_episode
                            stop_location = sum(
                                number_optimal[:replace_start +
                                               1]) * training_times_per_episode

                            RL_1.ep_obs.extend(observation_optimal_1[
                                start_location:stop_location])
                            RL_1.ep_as.extend(
                                action_optimal_1[start_location:stop_location])
                            RL_1.ep_rs.extend(
                                reward_optimal_1[start_location:stop_location])

                            RL_2.ep_obs.extend(observation_optimal_2[
                                start_location:stop_location])
                            RL_2.ep_as.extend(
                                action_optimal_2[start_location:stop_location])
                            RL_2.ep_rs.extend(
                                reward_optimal_2[start_location:stop_location])

                            RL_3.ep_obs.extend(observation_optimal_3[
                                start_location:stop_location])
                            RL_3.ep_as.extend(
                                action_optimal_3[start_location:stop_location])
                            RL_3.ep_rs.extend(
                                reward_optimal_3[start_location:stop_location])

            # entropy_weight=0.1
            RL_1.learn(epoch_i, entropy_weight, True)
            RL_2.learn(epoch_i, entropy_weight, False)
            RL_3.learn(epoch_i, entropy_weight, False)
        """
        checkpoint, per 1000 episodes
        """
        if (epoch_i % 1000 == 0) and (epoch_i > 1):
            highest_value = 0
            for class_replay in range(0, 10):
                highest_value = names['highest_tput_' + str(class_replay)]
                optimal_number = len(names['number_optimal_' +
                                           str(class_replay)])
                print("\n epoch: %d, highest tput: %f, optimal_number: %d" %
                      (epoch_i, highest_value, optimal_number))
            RL_1.save_session(ckpt_path_1)
            RL_2.save_session(ckpt_path_2)
            RL_3.save_session(ckpt_path_3)

            np.savez(np_path,
                     tputs=np.array(RL_1.tput_persisit),
                     candidate=np.array(RL_1.episode))
            """
            optimal range adaptively change
            """
            print(prob_weights)
            entropy_weight *= 0.5
            entropy_weight = max(entropy_weight, 0.002)

        epoch_i += 1
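For reference, a sketch of the params dict this train() reads, listing only the keys that appear above; the values are illustrative placeholders, not the authors' settings:

params = {
    'number of nodes in the cluster': 27,
    'batch_size': 20,
    'path': 'my_experiment',
    'recover': False,
    'nodes per group': 3,
    'replay size': 10,
    'learning rate': 0.001,
    'NUM_CONTAINERS_start': 70,
    'epochs': 20000,
    'container_limitation per node': 8,
}

# train(params)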