コード例 #1
0
class ConstrainedDDPG(CMDPAgent):

    def init_parameters(self, sess):
        """Initialise parameters and hard-copy the online nets into the targets.

        Nothing is done when ``self.has_target_net`` is falsy.

        :param sess: session object used to run the replacement ops.
        """
        if not self.has_target_net:
            return
        # NOTE(review): super(CMDPAgent, self) starts the method lookup *after*
        # CMDPAgent in the MRO, so an init_parameters defined on CMDPAgent
        # itself would be bypassed -- confirm this is intended.
        super(CMDPAgent, self).init_parameters(sess)
        # Critic first, then actor (same order as before).
        for sync_op in (self.target_replace_op, self.a_target_replace_op):
            sess.run(sync_op)

    def __init__(self, user_num, action_dim, action_bound, cvr_n_features, ddpg_n_features, init_roi, budget,
                 use_budget_control,
                 use_prioritized_experience_replay,
                 max_trajectory_length,
                 update_times_per_train=1, use_predict_cvr=False):
        """Store hyper-parameters, create the replay buffers and build the graph.

        ``init_roi`` and ``budget`` are accepted but not read by this
        constructor.  Both replay buffers are sized
        ``1000 * max_trajectory_length``.
        """
        # Values taken straight from the constructor arguments.
        self.user_num = user_num
        self.use_budget_control = use_budget_control
        self.update_times_per_train = update_times_per_train
        self.action_dim = action_dim
        self.action_bound = action_bound
        self.cvr_n_features = cvr_n_features
        self.ddpg_n_features = ddpg_n_features
        self.use_predict_cvr = use_predict_cvr
        self.use_prioritized_experience_replay = use_prioritized_experience_replay

        # Fixed hyper-parameters.
        self.n_actions = 1
        self.lr = 0.001
        self.user_based_adjust_times = 40

        # Exploration schedule: epsilon drops by epsilon_dec every
        # epsilon_dec_iter training calls, never below epsilon_min.
        self.epsilon, self.epsilon_min, self.epsilon_dec = 0.9, 0.05, 0.3
        self.epsilon_dec_iter = 5000 // self.user_based_adjust_times
        self.epsilon_dec_iter_min = 500 // self.user_based_adjust_times

        # Target-network synchronisation settings.
        self.replace_target_iter = 1
        self.soft_update_iter = 1
        self.softupdate = True

        self.scope_name = "CDDPG-model"
        self.epoch = 0

        self.exploration_noise = OUNoise(self.action_dim)

        capacity = 1000 * max_trajectory_length

        # Buffer feeding the CVR regression network.
        self.cvr_buffer_size = capacity
        self.cvr_batch_size = 512
        self.cvr_replay_buffer = ReplayBuffer(self.cvr_buffer_size, save_return=False)

        # Prioritized-replay coefficients.
        self.alpha = 0.6
        self.beta = 0.4

        # Buffer feeding the DDPG updates; only one of the two kinds is created.
        self.ddpg_buffer_size = capacity
        self.ddpg_batch_size = 256
        if not self.use_prioritized_experience_replay:
            self.replay_buffer = ReplayBuffer(self.ddpg_buffer_size, save_return=True)
        else:
            self.prioritized_replay_buffer = PrioritizedReplayBuffer(self.ddpg_buffer_size, alpha=self.alpha,
                                                                     max_priority=20.)

        with tf.variable_scope(self.scope_name):
            self._build_net()
            self.build_model_saver(self.scope_name)

    def _build_cvr_net(self, state, variable_scope, reuse=False):
        """Build the conversion-rate network and return its sigmoid output.

        Column 0 of ``state`` is cast to int32 and looked up in a
        ``[user_num, 10]`` embedding table; the embedding is concatenated with
        the remaining columns and passed through three ReLU layers and a
        one-unit sigmoid head.

        :param state: 2-D input tensor.
        :param variable_scope: variable scope holding the network variables.
        :param reuse: forwarded to ``tf.variable_scope``.
        :return: the output tensor of the one-unit sigmoid head.
        """
        with tf.variable_scope(variable_scope, reuse=reuse):
            embedding_table = tf.get_variable(
                name="user_id", shape=[self.user_num, 10], initializer=initializers.xavier_initializer(),
                trainable=True, dtype=tf.float32)
            ids = tf.cast(state[:, 0], dtype=tf.int32)
            embedded_ids = tf.nn.embedding_lookup(embedding_table, ids=ids, name="user_id_embedding")
            features = tf.concat([embedded_ids, state[:, 1:]], axis=1)
            width = features.get_shape()[1]

            hidden = features
            for layer_name, units in (('fc1', width), ('fc2', width // 2), ('fc3', width // 2)):
                hidden = tf.layers.dense(hidden, units=units, activation=tf.nn.relu, name=layer_name,
                                         kernel_initializer=initializers.xavier_initializer())

            logit = tf.layers.dense(hidden, units=1, name='cvr',
                                    kernel_initializer=initializers.xavier_initializer())
            return tf.sigmoid(logit)

    def _build_q_net(self, state, action, variable_scope, reuse=False):
        """Build a critic and return column 0 of its output layer.

        The user id (column 0 of ``state``) is embedded, concatenated with the
        remaining state columns and with ``action`` (expanded to a column),
        then passed through two ReLU layers and a linear layer of
        ``self.action_dim`` units.  Hidden widths are derived from the state
        width *before* the action is appended.

        :param state: 2-D state tensor.
        :param action: 1-D action tensor.
        :param variable_scope: variable scope holding the network variables.
        :param reuse: forwarded to ``tf.variable_scope``.
        """
        with tf.variable_scope(variable_scope, reuse=reuse):
            embedding_table = tf.get_variable(
                name="user_id", shape=[self.user_num, 10], initializer=initializers.xavier_initializer(),
                trainable=True, dtype=tf.float32)
            ids = tf.cast(state[:, 0], dtype=tf.int32)
            embedded_ids = tf.nn.embedding_lookup(embedding_table, ids=ids, name="user_id_embedding")
            features = tf.concat([embedded_ids, state[:, 1:]], axis=1)
            width = features.get_shape()[1]

            action_column = tf.expand_dims(action, axis=1, name="2d-action")
            hidden = tf.concat([features, action_column], axis=1)
            for layer_name, units in (('fc1', width), ('fc2', width // 2)):
                hidden = tf.layers.dense(hidden, units=units, activation=tf.nn.relu, name=layer_name)

            q_values = tf.layers.dense(hidden, units=self.action_dim, name='q')
            return q_values[:, 0]

    def _build_action_net(self, state, variable_scope):
        """Build an actor and return column 0 of its sigmoid output layer.

        The user id (column 0 of ``state``) is embedded and concatenated with
        the remaining columns, followed by two ReLU layers and a sigmoid layer
        of ``self.action_dim`` units.

        :param state: 2-D state tensor.
        :param variable_scope: variable scope holding the network variables.
        """
        with tf.variable_scope(variable_scope):
            embedding_table = tf.get_variable(
                name="user_id", shape=[self.user_num, 10], initializer=initializers.xavier_initializer(),
                trainable=True, dtype=tf.float32)
            ids = tf.cast(state[:, 0], dtype=tf.int32)
            embedded_ids = tf.nn.embedding_lookup(embedding_table, ids=ids, name="user_id_embedding")
            features = tf.concat([embedded_ids, state[:, 1:]], axis=1)
            width = features.get_shape()[1]

            hidden = features
            for layer_name, units in (('fc1', width), ('fc2', width // 2)):
                hidden = tf.layers.dense(hidden, units=units, activation=tf.nn.relu, name=layer_name)

            squashed = tf.layers.dense(hidden, self.action_dim, activation=tf.nn.sigmoid, name='a')
            return squashed[:, 0]

    def __make_update_exp__(self, vals, target_vals):
        """Return one grouped op that Polyak-averages ``vals`` into ``target_vals``.

        Both lists are sorted by variable name and paired positionally; each
        target becomes ``0.99 * target + 0.01 * source``.
        """
        polyak = 1.0 - 1e-2

        def by_name(variables):
            return sorted(variables, key=lambda v: v.name)

        assignments = [
            target.assign(polyak * target + (1.0 - polyak) * source)
            for source, target in zip(by_name(vals), by_name(target_vals))
        ]
        return tf.group(*assignments)

    def _build_net(self):
        """Create placeholders, the CVR/actor/critic networks, losses and train ops.

        Builds, in order: the placeholders, the CVR network, the online and
        target actors, the online critic (once on the fed action ``self.a`` and
        once, sharing variables, on the online actor's action), the target
        critic, the hard/soft target-sync ops, the TD(0) and Monte-Carlo
        targets, the losses and one Adam optimiser per network.
        """
        self.s_cvr = tf.placeholder(tf.float32, [None, self.cvr_n_features], name='s_cvr')
        # Was named 'r', which collided with the reward placeholder below and
        # silently turned the latter into 'r_1'.
        self.cvr = tf.placeholder(tf.float32, [None, ], name='cvr')

        self.s = tf.placeholder(tf.float32, [None, self.ddpg_n_features], name='s')
        self.s_ = tf.placeholder(tf.float32, [None, self.ddpg_n_features], name='s_')
        self.r = tf.placeholder(tf.float32, [None, ], name='r')
        self.a = tf.placeholder(tf.float32, [None, ], name='a')
        self.gamma = 1.
        self.done = tf.placeholder(tf.float32, [None, ], name='done')
        self.return_value = tf.placeholder(tf.float32, [None, ], name='return')
        self.important_sampling_weight_ph = tf.placeholder(tf.float32, [None], name="important_sampling_weight")

        self.cvr_net = self._build_cvr_net(self.s_cvr, variable_scope="cvr_net")
        self.predicted_cvr = self.cvr_net[:, 0]
        self.a_eval = self._build_action_net(self.s, variable_scope="actor_eval_net")
        self.a_target = self._build_action_net(self.s_, variable_scope="actor_target_net")
        self.critic_eval = self._build_q_net(self.s, self.a, variable_scope="eval_q_net")
        self.critic_eval_for_loss = self._build_q_net(self.s, self.a_eval, variable_scope="eval_q_net",
                                                      reuse=True)
        # The bootstrap value must be Q'(s', mu'(s')): evaluate the target
        # critic on the *target actor's* action for the next state.  The
        # previous code fed the replayed action ``self.a`` (taken in ``s``),
        # leaving ``self.a_target`` unused.
        self.critic_target = self._build_q_net(self.s_, self.a_target, variable_scope="target_q_net")

        t_gmv_params = scope_vars(absolute_scope_name("target_q_net"))
        e_gmv_params = scope_vars(absolute_scope_name("eval_q_net"))

        ae_params = scope_vars(absolute_scope_name("actor_eval_net"))
        at_params = scope_vars(absolute_scope_name("actor_target_net"))

        cvr_params = scope_vars(absolute_scope_name("cvr_net"))

        with tf.variable_scope('hard_replacement'):
            self.a_target_replace_op = tf.group([tf.assign(t, e) for t, e in zip(at_params, ae_params)])
            self.target_replace_op = tf.group([tf.assign(t, e) for t, e in zip(t_gmv_params, e_gmv_params)])

        with tf.variable_scope('soft_update'):
            self.a_update_target_q = self.__make_update_exp__(ae_params, at_params)
            self.update_target_q = self.__make_update_exp__(e_gmv_params, t_gmv_params)

        with tf.variable_scope('q_target'):
            # stop_gradient keeps the critic loss from back-propagating into
            # the target networks.
            self.td0_q_target = tf.stop_gradient(self.r + self.gamma * (1. - self.done) * self.critic_target)

            self.montecarlo_target = self.return_value

        with tf.variable_scope('loss'):
            self.cvr_loss = tf.reduce_mean(tf.squared_difference(self.predicted_cvr, self.cvr))

            self._build_loss()

            self._pick_loss()

        with tf.variable_scope('train'):
            # Each optimiser only touches the variables of its own network.
            self._train_cvr_op = tf.train.AdamOptimizer(self.lr).minimize(self.cvr_loss, var_list=cvr_params)
            self._train_ddpg_critic_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss, var_list=e_gmv_params)
            self._train_ddpg_a_op = tf.train.AdamOptimizer(self.lr).minimize(self.actor_loss, var_list=ae_params)

    def _pick_loss(self):
        self.has_target_net = True

        self.loss = self.ddpg_loss
        self.priority_values = self.td0_error
        self.actor_loss = self.a_loss

    def _build_loss(self):
        """Define the critic losses, the actor loss and the absolute error tensors.

        The critic losses are mean squared differences between
        ``critic_eval`` and the TD(0) / Monte-Carlo targets; with prioritized
        replay each squared difference is first multiplied by the
        importance-sampling weights.  The actor loss is the negated mean of
        the critic evaluated on the actor's own action.
        """
        td_squared = tf.squared_difference(self.td0_q_target, self.critic_eval, name='TD0_loss')
        mc_squared = tf.squared_difference(self.montecarlo_target, self.critic_eval, name='MonteCarlo_error')

        if self.use_prioritized_experience_replay:
            td_squared = self.important_sampling_weight_ph * td_squared
            mc_squared = self.important_sampling_weight_ph * mc_squared

        self.ddpg_loss = tf.reduce_mean(td_squared)
        self.montecarlo_loss = tf.reduce_mean(mc_squared)

        self.a_loss = - tf.reduce_mean(self.critic_eval_for_loss)

        # Per-sample absolute errors.
        self.td0_error = tf.abs(self.td0_q_target - self.critic_eval)
        self.montecarlo_error = tf.abs(self.montecarlo_target - self.critic_eval)

    def build_model_saver(self, var_scope):
        """Create ``self.model_saver`` over the global variables under ``var_scope``.

        The saver keeps at most one checkpoint.
        """
        self.model_saver = tf.train.Saver(
            var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=var_scope),
            max_to_keep=1)

    def save(self, sess, path, step):
        if not os.path.exists(os.path.dirname(path)):
            os.makedirs(os.path.dirname(path))
        self.model_saver.save(sess, save_path=path, global_step=step)

    def restore(self, sess, path):
        self.model_saver.restore(sess, save_path=path)
        print('%s model reloaded from %s' % (self.scope_name, path))

    def experience(self, new_trajectory, other_info=None):
        cvr_trajectory = other_info["cvr"]
        for ele in cvr_trajectory:
            state, cvr = ele
            self.cvr_replay_buffer.add(state, 0, cvr, state, 0, 0, 0)

    def experience_cmdp(self, new_trajectory, other_info=None):
        """Add ``new_trajectory`` to whichever DDPG replay buffer is in use.

        ``other_info`` is not used.
        """
        if self.use_prioritized_experience_replay:
            memory = self.prioritized_replay_buffer
        else:
            memory = self.replay_buffer
        add_episode(memory, new_trajectory, gamma=self.gamma)

    def get_agent_name(self):
        return self.scope_name

    def get_action(self, sess, obs, is_test=False, other_info=None):
        item_price = other_info["proxy_ad_price"]
        ground_truth_cvr = other_info["cvr"]
        user_alpha = other_info["user_alpha"]
        roi_thr = other_info["roi_thr"]

        observations = obs[np.newaxis, :]
        cvr = sess.run(self.predicted_cvr, feed_dict={
            self.s_cvr: observations
        })[0]
        if self.use_predict_cvr:
            bid = cvr * item_price / roi_thr
        else:
            bid = ground_truth_cvr * item_price / roi_thr
        return bid, {"cvr_over_estimate": [user_alpha, ground_truth_cvr, cvr]}

    def get_cmdp_action(self, sess, obs, is_test=False, other_info=None):
        if is_test:
            discrete_action = self.__greedy__(sess, obs)
        else:
            discrete_action = self.__epsilon_greedy__(sess, obs)

        return discrete_action

    def __greedy__(self, sess, observation):
        observation = observation[np.newaxis, :]
        greedy_action = sess.run(self.a_eval, feed_dict={self.s: observation})

        return greedy_action[0]

    def __epsilon_greedy__(self, sess, observation):
        if np.random.uniform() < self.epsilon:
            observation = observation[np.newaxis, :]
            actions_value = sess.run(self.a_eval, feed_dict={self.s: observation})

            action_noise = self.exploration_noise.noise()

            action = actions_value + action_noise

            action = action[0]


        else:
            action = self.__greedy__(sess, observation)
        return action

    def _is_exploration_enough(self, buffer, min_pool_size):
        return len(buffer) >= min_pool_size

    def train_cvr(self, sess):
        if not self._is_exploration_enough(self.cvr_replay_buffer, self.cvr_batch_size):
            return False, [0, 0, 0]

        cvr_loss, predicted_cvrs, cvr_targets = 0, 0, 0
        for idx in range(self.update_times_per_train):
            sample_indices = self.cvr_replay_buffer.make_index(self.cvr_batch_size)
            obs, act, cvr_targets, obs_next, done, dis_2_end, returns = self.cvr_replay_buffer.sample_index(
                sample_indices)

            _, cvr_loss, predicted_cvrs = sess.run(
                [self._train_cvr_op, self.cvr_loss, self.predicted_cvr],
                feed_dict={
                    self.s_cvr: obs,
                    self.cvr: cvr_targets
                }
            )
        return True, [cvr_loss, np.average(predicted_cvrs), np.average(cvr_targets)]

    def get_memory_returns(self):
        if self.use_prioritized_experience_replay:
            return self.prioritized_replay_buffer.current_mean_return
        else:
            return self.replay_buffer.current_mean_return

    def update_target(self, sess):
        if self.softupdate:

            if self.epoch % self.soft_update_iter == 0:
                sess.run(self.update_target_q)
                sess.run(self.a_update_target_q)
        else:

            if self.epoch % self.replace_target_iter == 0:
                sess.run(self.target_replace_op)
                sess.run(self.a_target_replace_op)

    def train(self, sess):
        if self.has_target_net:
            self.update_target(sess)
        self.epoch += 1

        buffer = self.prioritized_replay_buffer if self.use_prioritized_experience_replay else self.replay_buffer
        if not self._is_exploration_enough(buffer, self.ddpg_batch_size):
            return False, [0, 0, 0, 0], 0, 0

        if self.use_prioritized_experience_replay:

            loss, montecarlo_loss, q_eval, returns = self.train_prioritized(sess)
        else:

            loss, montecarlo_loss, q_eval, returns = self.train_normal(sess)

        if self.epoch % self.epsilon_dec_iter == 0:
            self.epsilon = max(self.epsilon - self.epsilon_dec, self.epsilon_min)

            print("update epsilon:", self.epsilon)
        return True, [loss, montecarlo_loss, q_eval, returns], self.get_memory_returns(), self.epsilon

    def train_prioritized(self, sess):
        loss, montecarlo_loss, q_eval, returns = 0, 0, 0, 0
        for idx in range(self.update_times_per_train):
            sample_indices = self.prioritized_replay_buffer.make_index(self.ddpg_batch_size)
            obs, act, rew, obs_next, done, dis_2_end, returns, weights, ranges = self.prioritized_replay_buffer.sample_index(
                sample_indices)
            _, loss, montecarlo_loss, q_eval, \
            priority_values = sess.run(
                [self._train_ddpg_critic_op, self.loss, self.montecarlo_loss, self.critic_eval,
                 self.priority_values],
                feed_dict={
                    self.s: obs,
                    self.a: act,
                    self.r: rew,
                    self.s_: obs_next,
                    self.done: done,
                    self.return_value: returns,
                    self.important_sampling_weight_ph: weights,
                })

            priorities = priority_values + 1e-6
            self.prioritized_replay_buffer.update_priorities(sample_indices, priorities)
        return loss, montecarlo_loss, np.average(q_eval), np.average(returns)

    def train_normal(self, sess):
        loss, montecarlo_loss, q_eval, returns = 0, 0, 0, 0
        for idx in range(self.update_times_per_train):
            sample_indices = self.replay_buffer.make_index(self.ddpg_batch_size)

            obs, act, rew, obs_next, done, dis_2_end, returns = self.replay_buffer.sample_index(
                sample_indices)

            _, loss, montecarlo_loss, q_eval = sess.run(
                [self._train_ddpg_critic_op, self.loss, self.montecarlo_loss, self.critic_eval],
                feed_dict={
                    self.s: obs,
                    self.a: act,
                    self.r: rew,
                    self.s_: obs_next,
                    self.done: done,
                    self.return_value: returns,
                })
            _, actor_loss = sess.run(
                [self._train_ddpg_a_op, self.actor_loss],
                feed_dict={
                    self.s: obs,
                    self.a: act,
                    self.r: rew,
                    self.s_: obs_next,
                    self.done: done,
                    self.return_value: returns,
                })

        return loss, montecarlo_loss, np.average(q_eval), np.average(returns)
コード例 #2
0
class ConstrainedPPO(CMDPAgent):

    def init_parameters(self, sess):
        """Initialise parameters and hard-copy the online actor into the old-policy net.

        Nothing is done when ``self.has_target_net`` is falsy.

        :param sess: session object used to run the replacement op.
        """
        if not self.has_target_net:
            return
        # NOTE(review): super(CMDPAgent, self) starts the method lookup *after*
        # CMDPAgent in the MRO, so an init_parameters defined on CMDPAgent
        # itself would be bypassed -- confirm this is intended.
        super(CMDPAgent, self).init_parameters(sess)
        sess.run(self.a_target_replace_op)

    def __init__(self, user_num, n_actions, cvr_n_features, ppo_n_features, init_roi, budget, use_budget_control,
                 use_prioritized_experience_replay,
                 max_trajectory_length,
                 update_times_per_train=1, use_predict_cvr=False):
        """Store hyper-parameters, create the replay buffers and build the graph.

        ``init_roi`` and ``budget`` are accepted but not read by this
        constructor.  Both replay buffers are sized
        ``1000 * max_trajectory_length``.
        """
        # Values taken straight from the constructor arguments.
        self.user_num = user_num
        self.use_budget_control = use_budget_control
        self.update_times_per_train = update_times_per_train
        self.n_actions = n_actions
        self.cvr_n_features = cvr_n_features
        self.ppo_n_features = ppo_n_features
        self.use_predict_cvr = use_predict_cvr
        self.use_prioritized_experience_replay = use_prioritized_experience_replay

        # Fixed hyper-parameters.
        self.action_dim = 1
        self.lr = 0.001
        self.user_based_adjust_times = 40

        # Exploration schedule: epsilon drops by epsilon_dec every
        # epsilon_dec_iter training calls, never below epsilon_min.
        self.epsilon, self.epsilon_min, self.epsilon_dec = 0.4, 0.05, 0.1
        self.epsilon_dec_iter = 5000 // self.user_based_adjust_times
        self.epsilon_dec_iter_min = 500 // self.user_based_adjust_times

        # PPO settings; ``method`` selects the surrogate built in _build_loss.
        self.epsilon_clip = 0.2
        self.lam = 0.5
        self.update_step = 1
        self.kl_target = 0.01
        self.gamma = 1.
        self.method = 'clip'

        self.policy_logvar = 1e-7

        # Old-policy synchronisation settings.
        self.replace_target_iter = 1
        self.soft_update_iter = 1
        self.softupdate = False

        self.scope_name = "CPPO-model"
        self.epoch = 0

        capacity = 1000 * max_trajectory_length

        # Buffer feeding the CVR regression network.
        self.cvr_buffer_size = capacity
        self.cvr_batch_size = 512
        self.cvr_replay_buffer = ReplayBuffer(self.cvr_buffer_size, save_return=False)

        # Prioritized-replay coefficients.
        self.alpha = 0.6
        self.beta = 0.4

        # Buffer feeding the PPO updates; only one of the two kinds is created.
        self.ppo_buffer_size = capacity
        self.ppo_batch_size = 250
        if not self.use_prioritized_experience_replay:
            self.replay_buffer = ReplayBuffer(self.ppo_buffer_size, save_return=True)
        else:
            self.prioritized_replay_buffer = PrioritizedReplayBuffer(self.ppo_buffer_size, alpha=self.alpha,
                                                                     max_priority=20.)

        with tf.variable_scope(self.scope_name):
            self._build_net()
            self.build_model_saver(self.scope_name)

    def _build_cvr_net(self, state, variable_scope, reuse=False):
        """Build the conversion-rate network and return its sigmoid output.

        Column 0 of ``state`` is cast to int32 and looked up in a
        ``[user_num, 10]`` embedding table; the embedding is concatenated with
        the remaining columns and passed through three ReLU layers and a
        one-unit sigmoid head.

        :param state: 2-D input tensor.
        :param variable_scope: variable scope holding the network variables.
        :param reuse: forwarded to ``tf.variable_scope``.
        :return: the output tensor of the one-unit sigmoid head.
        """
        with tf.variable_scope(variable_scope, reuse=reuse):
            embedding_table = tf.get_variable(
                name="user_id", shape=[self.user_num, 10], initializer=initializers.xavier_initializer(),
                trainable=True, dtype=tf.float32)
            ids = tf.cast(state[:, 0], dtype=tf.int32)
            embedded_ids = tf.nn.embedding_lookup(embedding_table, ids=ids, name="user_id_embedding")
            features = tf.concat([embedded_ids, state[:, 1:]], axis=1)
            width = features.get_shape()[1]

            hidden = features
            for layer_name, units in (('fc1', width), ('fc2', width // 2), ('fc3', width // 2)):
                hidden = tf.layers.dense(hidden, units=units, activation=tf.nn.relu, name=layer_name,
                                         kernel_initializer=initializers.xavier_initializer())

            logit = tf.layers.dense(hidden, units=1, name='cvr',
                                    kernel_initializer=initializers.xavier_initializer())
            return tf.sigmoid(logit)

    def _build_action_net(self, state, variable_scope):
        """Build a policy network and return its softmax over ``n_actions``.

        The user id (column 0 of ``state``) is embedded and concatenated with
        the remaining columns, followed by three ReLU layers (widths n, n//2,
        n//4) and an unnamed softmax layer.

        :param state: 2-D state tensor.
        :param variable_scope: variable scope holding the network variables.
        """
        with tf.variable_scope(variable_scope):
            embedding_table = tf.get_variable(
                name="user_id", shape=[self.user_num, 10], initializer=initializers.xavier_initializer(),
                trainable=True, dtype=tf.float32)
            ids = tf.cast(state[:, 0], dtype=tf.int32)
            embedded_ids = tf.nn.embedding_lookup(embedding_table, ids=ids, name="user_id_embedding")
            features = tf.concat([embedded_ids, state[:, 1:]], axis=1)
            width = features.get_shape()[1]

            hidden = features
            for layer_name, units in (('fc1', width), ('fc2', width // 2), ('fc3', width // 4)):
                hidden = tf.layers.dense(hidden, units=units, activation=tf.nn.relu, name=layer_name,
                                         kernel_initializer=initializers.xavier_initializer())

            action_probs = tf.layers.dense(hidden, self.n_actions, tf.nn.softmax,
                                           kernel_initializer=initializers.xavier_initializer())
        return action_probs

    def _build_q_net(self, state, variable_scope, reuse=False):
        """Build the state-value network and return column 0 of its 1-unit output.

        The user id (column 0 of ``state``) is embedded and concatenated with
        the remaining columns, followed by three ReLU layers (widths n, n//2,
        n//4) and an unnamed linear layer with one unit.

        :param state: 2-D state tensor.
        :param variable_scope: variable scope holding the network variables.
        :param reuse: forwarded to ``tf.variable_scope``.
        """
        with tf.variable_scope(variable_scope, reuse=reuse):
            embedding_table = tf.get_variable(
                name="user_id", shape=[self.user_num, 10], initializer=initializers.xavier_initializer(),
                trainable=True, dtype=tf.float32)
            ids = tf.cast(state[:, 0], dtype=tf.int32)
            embedded_ids = tf.nn.embedding_lookup(embedding_table, ids=ids, name="user_id_embedding")
            features = tf.concat([embedded_ids, state[:, 1:]], axis=1)
            width = features.get_shape()[1]

            hidden = features
            for layer_name, units in (('fc1', width), ('fc2', width // 2), ('fc3', width // 4)):
                hidden = tf.layers.dense(hidden, units=units, activation=tf.nn.relu, name=layer_name,
                                         kernel_initializer=initializers.xavier_initializer())

            value = tf.layers.dense(hidden, 1, kernel_initializer=initializers.xavier_initializer())
        return value[:, 0]

    def __make_update_exp__(self, vals, target_vals):
        """Return one grouped op that Polyak-averages ``vals`` into ``target_vals``.

        Both lists are sorted by variable name and paired positionally; each
        target becomes ``0.99 * target + 0.01 * source``.
        """
        polyak = 1.0 - 1e-2

        def by_name(variables):
            return sorted(variables, key=lambda v: v.name)

        assignments = [
            target.assign(polyak * target + (1.0 - polyak) * source)
            for source, target in zip(by_name(vals), by_name(target_vals))
        ]
        return tf.group(*assignments)

    def _build_net(self):
        """Create placeholders, the CVR/policy/value networks, losses and train ops.

        Builds, in order: the placeholders, the CVR network, the current and
        old policy networks (both on ``self.s``), the value network, the
        hard-copy op that refreshes the old policy, the losses and one Adam
        optimiser per network.
        """
        self.s_cvr = tf.placeholder(tf.float32, [None, self.cvr_n_features], name='s_cvr')
        # Was named 'r', which collided with the reward placeholder below and
        # silently turned the latter into 'r_1'.
        self.cvr = tf.placeholder(tf.float32, [None, ], name='cvr')

        self.s = tf.placeholder(tf.float32, [None, self.ppo_n_features], name='s')
        self.s_ = tf.placeholder(tf.float32, [None, self.ppo_n_features], name='s_')
        self.r = tf.placeholder(tf.float32, [None, ], name='r')
        self.a = tf.placeholder(tf.int32, [None, ], name='a')
        self.adv = tf.placeholder(tf.float32, [None, ], name='advantage')
        self.gamma = 1.
        self.done = tf.placeholder(tf.float32, [None, ], name='done')
        self.return_value = tf.placeholder(tf.float32, [None, ], name='return')
        self.important_sampling_weight_ph = tf.placeholder(tf.float32, [None], name="important_sampling_weight")

        self.cvr_net = self._build_cvr_net(self.s_cvr, variable_scope="cvr_net")
        self.predicted_cvr = self.cvr_net[:, 0]
        self.a_eval = self._build_action_net(self.s, variable_scope="actor_eval_net")
        self.a_target = self._build_action_net(self.s, variable_scope="actor_target_net")
        self.critic = self._build_q_net(self.s, variable_scope="eval_q_net")

        ae_params = scope_vars(absolute_scope_name("actor_eval_net"))
        at_params = scope_vars(absolute_scope_name("actor_target_net"))

        e_gmv_params = scope_vars(absolute_scope_name("eval_q_net"))
        cvr_params = scope_vars(absolute_scope_name("cvr_net"))

        with tf.variable_scope('hard_replacement'):
            self.a_target_replace_op = tf.group([tf.assign(t, e) for t, e in zip(at_params, ae_params)])

        with tf.variable_scope('loss'):
            self.cvr_loss = tf.reduce_mean(tf.squared_difference(self.predicted_cvr, self.cvr))

            self._build_loss()

            self._pick_loss()

        with tf.variable_scope('train'):
            self._train_cvr_op = tf.train.AdamOptimizer(self.lr).minimize(self.cvr_loss, var_list=cvr_params)
            # Restrict each optimiser to its own network.  Without var_list,
            # minimize() targets every trainable variable, so the actor step
            # also pushed gradients into the (trainable) old-policy network
            # through the probability ratio.
            self._train_ppo_critic_op = tf.train.AdamOptimizer(self.lr).minimize(self.critic_loss,
                                                                                 var_list=e_gmv_params)
            self._train_ppo_actor_op = tf.train.AdamOptimizer(self.lr).minimize(self.actor_loss,
                                                                                var_list=ae_params)

    def _pick_loss(self):
        self.has_target_net = True
        self.critic_loss = self.closs

        self.actor_loss = self.aloss

    def _build_loss(self):
        """Define the value loss, the advantage tensor and the PPO actor loss.

        The value loss is the mean squared difference between
        ``return_value`` and the value network.  The actor loss is the negated
        surrogate objective: KL-penalised when ``self.method == 'kl_pen'``,
        otherwise clipped to ``1 +/- epsilon_clip``.
        """
        with tf.variable_scope('critic'):
            self.c_loss = self.return_value - self.critic
            self.closs = tf.reduce_mean(tf.square(self.c_loss))

            self.advantage = self.return_value - self.critic

        with tf.variable_scope('surrogate'):
            # (row, action) pairs that pick the probability of the taken action.
            a_indices = tf.stack([tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a], axis=1)
            pi_prob = tf.gather_nd(params=self.a_eval, indices=a_indices)
            # The old policy is a constant in the surrogate objective: block
            # gradients so optimising the actor loss cannot alter it.
            old_probs = tf.stop_gradient(self.a_target)
            oldpi_prob = tf.gather_nd(params=old_probs, indices=a_indices)
            # 1e-8 guards against division by zero.
            ratio = pi_prob / (oldpi_prob + 1e-8)
            surr = ratio * self.adv
            if self.method == 'kl_pen':
                # kl_divergence expects Distribution objects, not probability
                # tensors, so wrap both policies as categoricals.
                kl = tf.distributions.kl_divergence(
                    tf.distributions.Categorical(probs=old_probs),
                    tf.distributions.Categorical(probs=self.a_eval))
                self.kl_mean = tf.reduce_mean(kl)
                self.aloss = -(tf.reduce_mean(surr - self.lam * kl))
            else:
                self.aloss = -tf.reduce_mean(tf.minimum(
                    surr,
                    tf.clip_by_value(ratio, 1. - self.epsilon_clip, 1. + self.epsilon_clip) * self.adv))

    def build_model_saver(self, var_scope):
        """Create ``self.model_saver`` over the global variables under ``var_scope``.

        The saver keeps at most one checkpoint.
        """
        self.model_saver = tf.train.Saver(
            var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=var_scope),
            max_to_keep=1)

    def save(self, sess, path, step):
        if not os.path.exists(os.path.dirname(path)):
            os.makedirs(os.path.dirname(path))
        self.model_saver.save(sess, save_path=path, global_step=step)

    def restore(self, sess, path):
        self.model_saver.restore(sess, save_path=path)
        print('%s model reloaded from %s' % (self.scope_name, path))

    def experience(self, new_trajectory, other_info=None):
        cvr_trajectory = other_info["cvr"]
        for ele in cvr_trajectory:
            state, cvr = ele
            self.cvr_replay_buffer.add(state, 0, cvr, state, 0, 0, 0)

    def experience_cmdp(self, new_trajectory, other_info=None):
        """Add ``new_trajectory`` to whichever PPO replay buffer is in use.

        ``other_info`` is not used.
        """
        if self.use_prioritized_experience_replay:
            memory = self.prioritized_replay_buffer
        else:
            memory = self.replay_buffer
        add_episode(memory, new_trajectory, gamma=self.gamma)

    def get_agent_name(self):
        return self.scope_name

    def get_action(self, sess, obs, is_test=False, other_info=None):
        item_price = other_info["proxy_ad_price"]
        ground_truth_cvr = other_info["cvr"]
        user_alpha = other_info["user_alpha"]
        roi_thr = other_info["roi_thr"]

        observations = obs[np.newaxis, :]
        cvr = sess.run(self.predicted_cvr, feed_dict={
            self.s_cvr: observations
        })[0]
        if self.use_predict_cvr:
            bid = cvr * item_price / roi_thr
        else:
            bid = ground_truth_cvr * item_price / roi_thr
        return bid, {"cvr_over_estimate": [user_alpha, ground_truth_cvr, cvr]}

    def get_cmdp_action(self, sess, obs, is_test=False, other_info=None):
        if is_test:
            discrete_action = self.__greedy__(sess, obs)
        else:
            discrete_action = self.__epsilon_greedy__(sess, obs)

        return discrete_action

    def __greedy__(self, sess, observation):
        s = observation[np.newaxis, :]

        prob_weights = sess.run(self.a_eval, feed_dict={self.s: s})
        greedy_action = np.argmax(prob_weights, axis=1)[0]

        return greedy_action

    def __epsilon_greedy__(self, sess, observation):
        if np.random.uniform() < self.epsilon:

            action = np.random.randint(0, self.n_actions)
        else:
            action = self.__greedy__(sess, observation)
        return action

    def _is_exploration_enough(self, buffer, min_pool_size):
        return len(buffer) >= min_pool_size

    def train_cvr(self, sess):
        if not self._is_exploration_enough(self.cvr_replay_buffer, self.cvr_batch_size):
            return False, [0, 0, 0]

        cvr_loss, predicted_cvrs, cvr_targets = 0, 0, 0
        for idx in range(self.update_times_per_train):
            sample_indices = self.cvr_replay_buffer.make_index(self.cvr_batch_size)
            obs, act, cvr_targets, obs_next, done, dis_2_end, returns = self.cvr_replay_buffer.sample_index(
                sample_indices)

            _, cvr_loss, predicted_cvrs = sess.run(
                [self._train_cvr_op, self.cvr_loss, self.predicted_cvr],
                feed_dict={
                    self.s_cvr: obs,
                    self.cvr: cvr_targets
                }
            )
        return True, [cvr_loss, np.average(predicted_cvrs), np.average(cvr_targets)]

    def get_memory_returns(self):
        """Return ``current_mean_return`` of the replay buffer in use."""
        if not self.use_prioritized_experience_replay:
            return self.replay_buffer.current_mean_return
        return self.prioritized_replay_buffer.current_mean_return

    def update_target(self, sess):
        """Run ``a_target_replace_op`` when the epoch is a multiple of
        ``replace_target_iter``; do nothing on other epochs."""
        if self.epoch % self.replace_target_iter != 0:
            return
        sess.run(self.a_target_replace_op)

    def train(self, sess):
        """Run one training call and decay epsilon on schedule.

        The target network is refreshed first (when the agent has one) and
        the epoch counter is incremented before anything else is checked.

        Returns:
            ``(False, [0, 0, 0, 0], 0, 0)`` when the active replay buffer
            holds fewer than ``ppo_batch_size`` samples. Otherwise
            ``(True, [loss, montecarlo_loss, q_eval, returns],
            memory_returns, epsilon)``.
        """
        if self.has_target_net:
            self.update_target(sess)
        self.epoch += 1

        # Pick the buffer and the matching update routine together.
        if self.use_prioritized_experience_replay:
            active_buffer, run_updates = self.prioritized_replay_buffer, self.train_prioritized
        else:
            active_buffer, run_updates = self.replay_buffer, self.train_normal

        if not self._is_exploration_enough(active_buffer, self.ppo_batch_size):
            return False, [0, 0, 0, 0], 0, 0

        loss, montecarlo_loss, q_eval, returns = run_updates(sess)

        decay_due = self.epoch % self.epsilon_dec_iter == 0
        if decay_due:
            self.epsilon = max(self.epsilon - self.epsilon_dec, self.epsilon_min)
            print("update epsilon:", self.epsilon)

        stats = [loss, montecarlo_loss, q_eval, returns]
        return True, stats, self.get_memory_returns(), self.epsilon

    def train_prioritized(self, sess):
        """Run ``update_times_per_train`` updates from the prioritized buffer.

        After every update the sampled entries get new priorities equal to
        the fetched ``priority_values`` plus ``1e-6``.

        Returns:
            ``(loss, montecarlo_loss, mean q_eval, mean returns)`` from the
            last update, or zeros when no update was run.
        """
        loss = montecarlo_loss = q_eval = returns = 0
        for _ in range(self.update_times_per_train):
            replay = self.prioritized_replay_buffer
            picked = replay.make_index(self.ppo_batch_size)
            (obs, act, rew, obs_next, done, _dis_2_end,
             returns, weights, _ranges) = replay.sample_index(picked)

            feed = {
                self.s: obs,
                self.a: act,
                self.r: rew,
                self.s_: obs_next,
                self.done: done,
                self.return_value: returns,
                self.important_sampling_weight_ph: weights,
            }
            fetches = [self._train_ppo_op, self.loss, self.montecarlo_loss,
                       self.q_eval_wrt_a, self.priority_values]
            _, loss, montecarlo_loss, q_eval, fetched_priorities = sess.run(
                fetches, feed_dict=feed)

            # The small offset is added to every priority before write-back.
            replay.update_priorities(picked, fetched_priorities + 1e-6)

        return loss, montecarlo_loss, np.average(q_eval), np.average(returns)

    def train_normal(self, sess):
        """Run ``update_times_per_train`` critic/actor updates from the plain
        replay buffer.

        For every sampled batch the advantage is evaluated first, then the
        critic is updated once and the actor ``update_step`` times. With
        ``method == 'kl_pen'`` the actor loop stops early once the fetched
        KL exceeds four times ``kl_target``, and ``self.lam`` is halved or
        doubled depending on the last KL and clipped to ``[1e-4, 10]``.

        Returns:
            ``(actor loss, critic loss, mean critic output, mean returns)``
            from the last batch, or zeros when no update was run.
        """
        loss = montecarlo_loss = q_eval = returns = 0
        for _round in range(self.update_times_per_train):
            picked = self.replay_buffer.make_index(self.ppo_batch_size)
            batch = self.replay_buffer.sample_index(picked)
            obs, act, rew, obs_next, done, _dis_2_end, returns = batch

            adv = sess.run(self.advantage, {self.s: obs, self.return_value: returns})

            critic_feed = {
                self.s: obs,
                self.a: act,
                self.adv: adv,
                self.r: rew,
                self.s_: obs_next,
                self.done: done,
                self.return_value: returns,
            }
            _, montecarlo_loss, q_eval = sess.run(
                [self._train_ppo_critic_op, self.critic_loss, self.critic],
                feed_dict=critic_feed)

            if self.method != 'kl_pen':
                # This variant also feeds the returns to the actor step.
                actor_feed = {
                    self.adv: adv,
                    self.s: obs,
                    self.a: act,
                    self.r: rew,
                    self.done: done,
                    self.return_value: returns,
                }
                for _step in range(self.update_step):
                    _, loss = sess.run(
                        [self._train_ppo_actor_op, self.actor_loss],
                        feed_dict=actor_feed)
                continue

            actor_feed = {
                self.adv: adv,
                self.s: obs,
                self.a: act,
                self.r: rew,
                self.done: done,
            }
            for _step in range(self.update_step):
                _, kl, loss = sess.run(
                    [self._train_ppo_actor_op, self.kl_mean, self.actor_loss],
                    feed_dict=actor_feed)
                if kl > 4 * self.kl_target:
                    break

            # Adapt the penalty coefficient from the last observed KL.
            if kl < self.kl_target / 1.5:
                self.lam /= 2
            elif kl > self.kl_target * 1.5:
                self.lam *= 2
            self.lam = np.clip(self.lam, 1e-4, 10)

        return loss, montecarlo_loss, np.average(q_eval), np.average(returns)
コード例 #3
0
ファイル: Agent.py プロジェクト: karunraju/NFF
class Agent():
    """Q-learning agent (Duel or DoubleQ network) for the NEL gym environments.

    The constructor builds the environment, an optionally prioritized replay
    buffer and the network, and opens two reward log files inside a
    timestamped ``tmp_<method>_<time>/`` directory under the current working
    directory. Call ``close()`` when done to release the log files.
    """

    def __init__(self, render=False, method='Duel'):
        """Create the environment, replay memory, network and log files.

        Args:
            render: If true, use the 'NEL-render-v0' environment and render
                while training; otherwise use 'NEL-v0'.
            method: 'Duel' or 'DoubleQ'; selects the network class.

        Raises:
            NotImplementedError: If ``method`` is neither 'Duel' nor
                'DoubleQ'.
        """
        self.render = render
        if render:
            self.env = gym.make('NEL-render-v0')
        else:
            self.env = gym.make('NEL-v0')
        # The evaluation environment is created on first use by test().
        self.test_env = None
        self.an = self.env.action_space.n  # No. of actions in env
        self.epsilon = 0.5
        self.training_time = PARAM.TRAINING_TIME  # Training Time
        self.df = PARAM.DISCOUNT_FACTOR  # Discount Factor
        self.batch_size = PARAM.BATCH_SIZE
        self.method = method
        self.test_curr_state = None
        self.log_time = 100.0
        self.test_time = 1000.0
        self.prioritized_replay = PARAM.PRIORITIZED_REPLAY
        self.prioritized_replay_eps = 1e-6
        self.prioritized_replay_alpha = 0.8
        self.prioritized_replay_beta0 = 0.4
        self.burn_in = PARAM.BURN_IN

        # Create Replay Memory and initialize with burn_in transitions
        if self.prioritized_replay:
            self.replay_buffer = PrioritizedReplayBuffer(
                PARAM.REPLAY_MEMORY_SIZE, alpha=self.prioritized_replay_alpha)
            self.beta_schedule = LinearSchedule(
                float(self.training_time),
                initial_p=self.prioritized_replay_beta0,
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(PARAM.REPLAY_MEMORY_SIZE)
            self.beta_schedule = None

        # Create QNetwork instance
        if self.method == 'Duel':
            print('Using Duel Network.')
            self.net = DuelQNetwork(self.an)
        elif self.method == 'DoubleQ':
            print('Using DoubleQ Network.')
            self.net = DoubleQNetwork(self.an)
        else:
            raise NotImplementedError

        cur_dir = os.getcwd()
        self.dump_dir = cur_dir + '/tmp_' + self.method + '_' + time.strftime(
            "%Y%m%d-%H%M%S") + '/'
        # Create output directory
        if not os.path.exists(self.dump_dir):
            os.makedirs(self.dump_dir)
        self.train_file = open(self.dump_dir + 'train_rewards.txt', 'w')
        self.test_file = open(self.dump_dir + 'test_rewards.txt', 'w')

    def close(self):
        """Close the reward log files opened by ``__init__``."""
        for handle in (self.train_file, self.test_file):
            if not handle.closed:
                handle.close()

    def update_epsilon(self):
        """Decay epsilon by ``(0.5 - 0.1) / 200000`` per call, floored at 0.05.

        The floor is applied in the same call, so epsilon never drops below
        0.05 even for a single step.
        """
        self.epsilon = max(0.05, self.epsilon - (0.5 - 0.1) / 200000.0)

    def epsilon_greedy_policy(self, q_values, epsilon):
        """Return a random action with probability ``epsilon``, else arg-max.

        The random action is drawn from ``range(q_values.shape[1])``, so
        ``q_values`` must be at least two-dimensional.
        """
        val = np.random.rand(1)
        if val <= epsilon:
            return np.random.randint(q_values.shape[1])
        return np.argmax(q_values)

    def greedy_policy(self, q_values):
        """Return the (flattened) arg-max index of ``q_values``."""
        return np.argmax(q_values)

    def train(self):
        """Interact with the environment and train the network online.

        Runs ``self.training_time`` environment steps. Every step samples a
        mini-batch from the replay buffer and trains on it; every 100 steps
        the mean reward is logged to ``train_rewards.txt`` and the training
        curve is re-plotted.
        """
        train_rewards = []
        count = 0
        steps = 0

        cum_reward = 0.0

        curr_state = self.env.reset()
        curr_state = self.burn_in_memory(curr_state)
        prev_action = -1
        if self.render:
            self.env.render()
        for i in range(self.training_time):
            # Get q_values based on the current state
            Vt, St = self.get_input_tensor(curr_state)
            q_values = self.net.get_Q_output(Vt, St)

            # Selecting an action based on the policy
            action = self.epsilon_greedy_policy(q_values, self.epsilon)

            # Executing action in simulator
            nextstate, reward, _, _ = self.env.step(action)
            steps = steps + 1
            if self.render:
                self.env.render()

            # Store the transition unless the agent repeated an action
            # without moving.
            if nextstate['moved'] or prev_action != action:
                self.replay_buffer.add(curr_state, action, reward / 100.0,
                                       nextstate, 0)
            prev_action = action

            # Sample random minibatch from experience replay
            if self.prioritized_replay:
                batch, weights, batch_idxes = self.replay_buffer.sample(
                    self.batch_size, beta=self.beta_schedule.value(i))
            else:
                batch = self.replay_buffer.sample(self.batch_size)
                weights, batch_idxes = np.ones(self.batch_size), None

            # Train the Network with mini batches
            xVT, xST = self.get_input_tensors(batch)
            yT = self.get_output_tensors(batch)

            # Mask to select the actions from the Q network output
            mT = torch.zeros(self.batch_size, self.an, dtype=torch.uint8)
            for k, tran in enumerate(batch):
                mT[k, tran[1]] = 1
            td_errors = self.net.train(xVT, xST, yT, mT, weights)

            if self.prioritized_replay:
                # NOTE(review): the importance-sampling weights are written
                # back as the new priorities and `td_errors` is unused;
                # confirm this is intended rather than
                # abs(td_errors) + self.prioritized_replay_eps.
                self.replay_buffer.update_priorities(batch_idxes, weights)

            # Decay epsilon
            self.update_epsilon()

            cum_reward += reward
            curr_state = nextstate

            if steps == 100:
                cum_reward = cum_reward / float(self.log_time)
                train_rewards.append(cum_reward)
                self.train_file.write(str(cum_reward))
                self.train_file.write('\n')
                self.train_file.flush()
                cum_reward = 0.0
                print('Train Reward: %.4f' % (train_rewards[-1]))
                steps = 0

                x = list(range(len(train_rewards)))
                plt.plot(x, train_rewards, '-bo')
                plt.xlabel('Time')
                plt.ylabel('Average Reward')
                plt.title('Training Curve')
                plt.savefig(self.dump_dir + 'Training_Curve_' + self.method +
                            '.png')
                plt.close()

                plot(self.dump_dir + self.method, train_rewards)

            # NOTE(review): `count` is never incremented in this loop (the
            # periodic evaluation that used to bump it is disabled), so this
            # checkpoint branch currently never fires.
            if count > 0 and count % 30 == 0:
                self.net.save_model_weights(count, self.dump_dir)

    def test(self, testing_steps=100, model_file=None, capture=False):
        """Run the policy with epsilon 0.05 and return the mean step reward.

        Args:
            testing_steps: Number of environment steps to run.
            model_file: Optional weights file loaded into the network first.
            capture: If true, wrap the test environment in a gym Monitor
                writing to './'.

        Returns:
            The cumulative reward divided by ``testing_steps``.
        """
        if model_file is not None:
            self.net.load_model(model_file)

        # The test environment is not built by __init__; create it on first
        # use so this method does not fail with AttributeError.
        if getattr(self, 'test_env', None) is None:
            self.test_env = gym.make('NEL-v0')

        if capture:
            self.test_env = gym.wrappers.Monitor(self.test_env, './')

        epsilon = 0.05

        self.test_curr_state = self.test_env.reset()
        cum_reward = 0.0
        for i in range(testing_steps):
            Vt, St = self.get_input_tensor(self.test_curr_state)
            q_values = self.net.get_Q_output(Vt, St)
            action = self.epsilon_greedy_policy(q_values, epsilon)

            # Executing action in simulator
            nextstate, reward, _, _ = self.test_env.step(action)

            cum_reward += reward
            self.test_curr_state = nextstate
        avg_reward = cum_reward / float(testing_steps)

        return avg_reward

    def burn_in_memory(self, curr_state):
        """Fill the replay buffer with ``self.burn_in`` random transitions.

        Actions are sampled uniformly from the environment's action space.

        Returns:
            The environment state after the last burn-in step.
        """
        cnt = 0
        while self.burn_in > cnt:
            action = self.env.action_space.sample()
            next_state, reward, _, _ = self.env.step(action)

            self.replay_buffer.add(curr_state, action, reward / 100.0,
                                   next_state, 0)

            curr_state = next_state
            cnt = cnt + 1
        return curr_state

    def get_input_tensor(self, obs):
        """Return ``(vision, scent+moved)`` float tensors for one observation.

        The vision array has its last axis moved to the front and is stored
        in a (1, 3, 11, 11) buffer; scent is concatenated with the integer
        'moved' flag into a (1, 4) buffer.
        """
        iV = np.zeros((1, 3, 11, 11))
        iS = np.zeros((1, 4))

        iV[0] = np.moveaxis(obs['vision'], -1, 0)
        iS[0] = np.concatenate((obs['scent'], np.array([int(obs['moved'])])),
                               axis=0)
        iVt, iSt = torch.from_numpy(iV).float(), torch.from_numpy(iS).float()
        return iVt, iSt

    def get_input_tensors(self, batch, next_state=False):
        """Return batched input tensors built from sampled transitions.

        Args:
            batch: Iterable of transitions; element 0 is the current
                observation and element 3 the next observation.
            next_state: If true, encode the next observations instead of the
                current ones.
        """
        V = np.zeros((self.batch_size, 3, 11, 11))
        S = np.zeros((self.batch_size, 4))
        for i, tran in enumerate(batch):
            if next_state:
                obs = tran[3]  # next state
            else:
                obs = tran[0]  # current state

            V[i] = np.moveaxis(obs['vision'], -1, 0)
            S[i] = np.concatenate(
                (obs['scent'], np.array([int(obs['moved'])])), axis=0)
        Vt, St = torch.from_numpy(V).float(), torch.from_numpy(S).float()
        return Vt, St

    def get_output_tensors(self, batch):
        """Return the training targets for a sampled batch.

        For each transition the action is chosen greedily from
        ``net.get_Q_output`` on the next state and evaluated with
        ``net.get_target_output``; the target is the stored reward plus the
        discounted value of that action.
        """
        Y = np.zeros(self.batch_size)
        Vt, St = self.get_input_tensors(batch, next_state=True)
        q_values_a = self.net.get_Q_output(Vt, St)
        q_values_e = self.net.get_target_output(Vt, St)
        for i, tran in enumerate(batch):
            action = self.greedy_policy(q_values_a[i])
            Y[i] = tran[2] + self.df * q_values_e[i][action]

        Yt = torch.from_numpy(Y).float()
        return Yt
コード例 #4
0
class Agent_ppo():
    """PPO agent for the NEL gym environments.

    The constructor builds the environment, the replay and episode buffers
    and the PPO network, opens two reward log files inside a timestamped
    ``tmp_PPO_<time>/`` directory, and pre-fills the replay buffer with
    random transitions. Call ``close()`` to release the log files.
    """

    def __init__(self, render=False):
        """Create the environment, buffers, network and log files.

        Args:
            render: If true, use the 'NEL-render-v0' environment; otherwise
                use 'NEL-v0'.
        """
        self.render = render
        if render:
            self.env = gym.make('NEL-render-v0')
        else:
            self.env = gym.make('NEL-v0')
        # The evaluation environment is created on first use by test().
        self.test_env = None
        self.an = self.env.action_space.n  # No. of actions in env
        self.training_time = PARAM.TRAINING_TIME  # Training Time
        self.method = 'PPO'
        self.test_curr_state = None
        self.log_time = 100.0
        self.test_time = 1000.0

        self.burn_in = PARAM.BURN_IN
        self.tmax = PARAM.A2C_EPISODE_SIZE_MAX
        self.tmin = PARAM.A2C_EPISODE_SIZE_MIN
        self.seq_len = PARAM.A2C_SEQUENCE_LENGTH
        self.replay_buffer = ReplayBuffer(PARAM.REPLAY_MEMORY_SIZE)
        # Slots are overwritten by assignment in generate_episode().
        self.episode_buffer = [[]] * self.tmax
        self.net = PPO(self.episode_buffer, self.replay_buffer)

        cur_dir = os.getcwd()
        self.dump_dir = cur_dir + '/tmp_' + self.method + '_' + time.strftime(
            "%Y%m%d-%H%M%S") + '/'
        # Create output directory
        if not os.path.exists(self.dump_dir):
            os.makedirs(self.dump_dir)
        self.train_file = open(self.dump_dir + 'train_rewards.txt', 'w')
        self.test_file = open(self.dump_dir + 'test_rewards.txt', 'w')

        self.curr_state = self.env.reset()
        # burn_in_memory() updates tong_count, so it must exist first.
        self.tong_count = 0
        self.curr_state = self.burn_in_memory(self.curr_state)
        self.train_rewards = []
        self.test_rewards = []
        self.steps = 0
        self.cum_reward = 0.0
        self.save_count = 0

    def close(self):
        """Close the reward log files opened by ``__init__``."""
        for handle in (self.train_file, self.test_file):
            if not handle.closed:
                handle.close()

    def generate_episode(self, tmax, render=False):
        """Step the environment until ``tmax`` decision steps are recorded.

        On every ``PARAM.ACTION_REPEAT``-th environment step the network is
        queried and the transition is written to ``episode_buffer``; on the
        steps in between action 0 is taken. Every environment step is also
        added to the replay buffer, and training statistics are plotted
        every 100 steps.

        Args:
            tmax: Number of decision steps to record.
            render: If true, render the environment after every step.
        """
        ctr, i = (0, 0)
        # Raw reward accumulated between consecutive decision steps.
        self.her_reward_buffer = np.zeros(tmax)
        her_reward = 0
        while ctr < tmax:
            if i % PARAM.ACTION_REPEAT == 0:
                val, softmax, action = self.net.get_output(
                    [ctr - 1], seq_len=self.seq_len, batch_size=1)
            else:
                action = 0

            next_state, reward, _, _ = self.env.step(action)
            if render:
                self.env.render()

            if PARAM.REWARD_SHAPING:
                psuedo_reward = self.compute_psuedo_reward(
                    next_state['vision'])
            else:
                psuedo_reward = 0.0

            tong_reward = 0.0
            if reward == 0:
                if self.curr_state['vision'][5, 6, 0] == 1.0:
                    self.tong_count += 1
                    if PARAM.REWARD_SHAPING:
                        tong_reward = 10.0
            elif reward == 100.0:
                self.tong_count -= 1

            her_reward += reward
            if i % PARAM.ACTION_REPEAT == 0:
                self.episode_buffer[ctr] = (self.curr_state, action,
                                            ((reward + tong_reward) / 100.0 +
                                             psuedo_reward), next_state,
                                            softmax, self.tong_count, val)
                self.her_reward_buffer[ctr] = her_reward
                her_reward = 0
                ctr += 1
            self.replay_buffer.add(self.curr_state, action, reward / 100.0,
                                   next_state, 0, self.tong_count)
            self.curr_state = next_state

            i += 1
            self.steps += 1
            self.cum_reward += reward
            if self.steps % 100 == 0:
                self.plot_train_stats()

    def compute_psuedo_reward(self, vision):
        """Return a shaping reward from the central 5x5 patch of ``vision``.

        The patch is averaged over its last axis, cells whose average is
        exactly 0.5 are zeroed, and the result is the patch sum minus 1/3.
        Values below 0.001 are returned as 0.0.
        """
        avg = np.mean(vision[3:8, 3:8, :], axis=2)
        idxs = avg == 0.5
        avg[idxs] = 0.0
        reward = np.sum(avg) - 1.0 / 3.0
        if reward < 0.001:
            return 0.0

        return reward

    def hind_sight_experience_replay(self, episode_len):
        """Rewrite episode rewards as decayed sums of the later raw rewards.

        Walks the episode backwards; each stored reward becomes
        ``(her_reward_buffer[i] + decayed_future) / 100`` where the decayed
        future is accumulated with ``PARAM.HER_DECAY``.
        """
        her_reward = 0
        her_decay = PARAM.HER_DECAY
        for i in range(episode_len - 1, -1, -1):
            obs, action, reward, next_obs, softmax, tong_count, val = self.episode_buffer[
                i]
            self.episode_buffer[i] = (
                obs, action,
                (self.her_reward_buffer[i] + her_reward * her_decay) / 100.0,
                next_obs, softmax, tong_count, val)
            her_reward = her_reward * her_decay + self.her_reward_buffer[i]

    def train(self):
        """Alternate episode generation and network training.

        Runs ``self.training_time`` iterations; each one draws an episode
        length in ``[tmin, tmax]``, generates the episode, optionally
        applies hindsight reward rewriting, and trains the network.
        """
        for i in range(self.training_time):
            self.net.set_train()
            episode_len = np.random.randint(self.tmin, self.tmax + 1)
            self.generate_episode(episode_len, self.render)
            if PARAM.HER:
                self.hind_sight_experience_replay(episode_len)
            self.net.train(episode_len)
            self.save_count += 1

    def test(self, testing_steps=100, model_file=None):
        """Run the network in eval mode and log the cumulative reward.

        Appends the cumulative reward to ``self.test_rewards``, writes it to
        ``test_rewards.txt`` and re-plots the testing curve.

        Args:
            testing_steps: Number of environment steps to run.
            model_file: Optional weights file loaded into the network first.
        """
        if model_file is not None:
            self.net.load_model(model_file)

        # The test environment is not built by __init__; create and reset it
        # on first use so this method does not fail with AttributeError.
        if getattr(self, 'test_env', None) is None:
            self.test_env = gym.make('NEL-v0')
            self.test_env.reset()

        self.net.set_eval()
        cum_reward = 0.0
        for i in range(testing_steps):
            # NOTE(review): generate_episode() calls get_output() with a
            # different argument form and unpacks three values; confirm PPO
            # supports this two-value call.
            softmax, action = self.net.get_output(self.curr_state, i)
            _, reward, _, _ = self.test_env.step(action)
            cum_reward += reward

        self.test_rewards.append(cum_reward)
        self.test_file.write(str(self.test_rewards[-1]))
        self.test_file.write('\n')
        self.test_file.flush()
        print('\nTest Reward: %.4f\n' % (self.test_rewards[-1]))

        x = list(range(len(self.test_rewards)))
        plt.plot(x, self.test_rewards, '-bo')
        plt.xlabel('Time')
        plt.ylabel('Average Reward')
        plt.title('Testing Curve')
        plt.savefig(self.dump_dir + 'Testing_Curve_' + self.method + '.png')
        plt.close()

    def plot_train_stats(self):
        """Log the mean reward of the last window and redraw the curve.

        Resets ``cum_reward`` and ``steps``; when the logged mean reward is
        positive the network is saved to 'checkpoint.pth'.
        """
        self.cum_reward = self.cum_reward / float(self.log_time)
        self.train_rewards.append(self.cum_reward)
        self.train_file.write(str(self.cum_reward))
        self.train_file.write('\n')
        self.train_file.flush()
        self.cum_reward = 0.0
        if self.train_rewards[-1] > 0:
            self.net.A.save("checkpoint.pth")
            print('[%d] Train Reward: %.4f' %
                  (len(self.train_rewards), self.train_rewards[-1]))
        self.steps = 0

        x = list(range(len(self.train_rewards)))
        plt.plot(x, self.train_rewards, '-bo')
        plt.xlabel('Time')
        plt.ylabel('Average Reward')
        plt.title('Training Curve')
        plt.savefig(self.dump_dir + 'Training_Curve_' + self.method + '.png')
        plt.close()

        plot(self.dump_dir + self.method, self.train_rewards)

    def burn_in_memory(self, curr_state):
        """Fill the replay buffer with ``self.burn_in`` random transitions.

        ``tong_count`` is incremented on a reward of 20.0 and decremented on
        a reward of 100.0.

        Returns:
            The environment state after the last burn-in step.
        """
        cnt = 0
        while self.burn_in > cnt:
            action = self.env.action_space.sample()
            next_state, reward, _, _ = self.env.step(action)
            if reward == 20.0:
                self.tong_count += 1
            elif reward == 100.0:
                self.tong_count -= 1
            self.replay_buffer.add(curr_state, action, reward / 100.0,
                                   next_state, 0, self.tong_count)
            curr_state = next_state

            cnt = cnt + 1
        return curr_state
コード例 #5
0
class ContextualBandit(PIDAgent, CvrAgent):
    """Bidding agent that fits a conversion-rate (CVR) regressor.

    States and observed CVRs are stored in a replay buffer and a small
    network is trained on them with a squared-error loss. Bids are computed
    from the ground-truth CVR supplied by the caller; the network's
    prediction is only reported alongside the bid.
    """

    def __init__(
        self,
        user_num,
        n_features,
        init_roi,
        budget,
        use_budget_control,
        max_trajectory_length,
        update_times_per_train=1,
    ):
        """Store the hyper-parameters and build the graph and the saver.

        Only ``PIDAgent.__init__`` is invoked explicitly here.
        """
        PIDAgent.__init__(self,
                          init_roi=init_roi,
                          default_alpha=1,
                          budget=budget,
                          integration=1)

        # Problem description.
        self.user_num = user_num
        self.n_features = n_features
        self.n_actions = 1
        self.use_budget_control = use_budget_control

        # Optimisation settings.
        self.lr = 0.001
        self.update_times_per_train = update_times_per_train
        self.epoch = 0

        # Replay memory for (state, cvr) samples.
        self.batch_size = 512
        self.buffer_size = 1000 * max_trajectory_length
        self.replay_buffer = ReplayBuffer(self.buffer_size, save_return=False)

        self.scope_name = "MyopicGreedy-model"
        with tf.variable_scope(self.scope_name):
            self._build_net()

            self.build_model_saver(self.scope_name)

    def _build_cvr_net(self, state, variable_scope, reuse=False):
        """Build the CVR network and return its sigmoid output tensor.

        Column 0 of ``state`` is cast to an integer id and replaced by a
        10-wide trainable embedding; the remaining columns are concatenated
        to it and passed through one ReLU layer and a one-unit sigmoid head.
        """
        with tf.variable_scope(variable_scope, reuse=reuse):
            embedding_table = tf.get_variable(
                name="user_id",
                shape=[self.user_num, 10],
                initializer=initializers.xavier_initializer(),
                trainable=True,
                dtype=tf.float32)
            ids = tf.cast(state[:, 0], dtype=tf.int32)
            embedded_ids = tf.nn.embedding_lookup(
                embedding_table, ids=ids, name="user_id_embedding")

            features = tf.concat([embedded_ids, state[:, 1:]], axis=1)
            width = features.get_shape()[1]
            hidden = tf.layers.dense(features,
                                     units=width,
                                     activation=tf.nn.relu,
                                     name='fc1')

            logit = tf.layers.dense(hidden, units=1, name='cvr')
            return tf.sigmoid(logit)

    def _build_net(self):
        """Create placeholders, the CVR network, its loss and its train op."""
        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')
        self.cvr = tf.placeholder(tf.float32, [None], name='r')

        self.cvr_net = self._build_cvr_net(self.s, variable_scope="cvr_net")
        self.predicted_cvr = self.cvr_net[:, 0]

        trainable = scope_vars(absolute_scope_name("cvr_net"))

        with tf.variable_scope('loss'):
            squared_error = tf.squared_difference(self.predicted_cvr, self.cvr)
            self.cvr_loss = tf.reduce_mean(squared_error)

        with tf.variable_scope('train'):
            optimizer = tf.train.AdamOptimizer(self.lr)
            self._train_op = optimizer.minimize(self.cvr_loss,
                                                var_list=trainable)

    def build_model_saver(self, var_scope):
        """Create a saver over the global variables found under ``var_scope``."""
        scoped_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope=var_scope)

        self.model_saver = tf.train.Saver(var_list=scoped_variables,
                                          max_to_keep=1)

    def save(self, sess, path, step):
        """Save the model to ``path``, creating its directory if missing."""
        target_dir = os.path.dirname(path)
        if not os.path.exists(target_dir):
            os.makedirs(os.path.dirname(path))
        self.model_saver.save(sess, save_path=path, global_step=step)

    def restore(self, sess, path):
        """Restore the model from ``path`` and print a confirmation."""
        self.model_saver.restore(sess, save_path=path)
        print('%s model reloaded from %s' % (self.scope_name, path))

    def experience(self, new_trajectory, other_info=None):
        """Store every ``(state, cvr)`` pair of ``other_info["cvr"]``.

        ``new_trajectory`` is not read. The state is stored as both the
        current and the next observation, with the cvr in the reward slot.
        """
        for state, cvr in other_info["cvr"]:
            self.replay_buffer.add(state, 0, cvr, state, 0, 0, 0)

    def get_action(self, sess, obs, is_test=False, other_info=None):
        """Return the bid for ``obs`` plus a CVR diagnostics dict.

        The bid is ``ground_truth_cvr * item_price / roi_threshold``; the
        network prediction is evaluated but only reported in the second
        return value.
        """
        item_price = other_info["proxy_ad_price"]
        ground_truth_cvr = other_info["cvr"]
        user_alpha = other_info["user_alpha"]

        roi_thr = self.get_roi_threshold() if self.use_budget_control else self.init_roi

        batch_of_one = obs[np.newaxis, :]
        predictions = sess.run(self.predicted_cvr, feed_dict={self.s: batch_of_one})
        cvr = predictions[0]

        bid = ground_truth_cvr * item_price / roi_thr
        diagnostics = {"cvr_over_estimate": [user_alpha, ground_truth_cvr, cvr]}
        return bid, diagnostics

    def _is_exploration_enough(self, min_pool_size):
        """Report whether the replay buffer holds ``min_pool_size`` samples."""
        stored = len(self.replay_buffer)
        return stored >= min_pool_size

    def train(self, sess):
        """Run ``update_times_per_train`` optimisation steps on the CVR model.

        Returns:
            ``(False, [0, 0, 0])`` when fewer than ``batch_size`` samples
            are stored; otherwise ``(True, [loss, mean predicted cvr,
            mean target cvr])`` from the last update.
        """
        self.epoch += 1

        if not self._is_exploration_enough(self.batch_size):
            return False, [0, 0, 0]

        last_loss = last_predictions = last_targets = 0
        for _ in range(self.update_times_per_train):
            picked = self.replay_buffer.make_index(self.batch_size)
            sampled = self.replay_buffer.sample_index(picked)
            # Only the states and the stored cvr targets (third field) are used.
            states, _act, last_targets, _next, _done, _dist, _ret = sampled

            _, last_loss, last_predictions = sess.run(
                [self._train_op, self.cvr_loss, self.predicted_cvr],
                feed_dict={self.s: states, self.cvr: last_targets})

        return True, [
            last_loss,
            np.average(last_predictions),
            np.average(last_targets),
        ]