Example 4
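The snippets below are excerpts from a single TensorFlow 1.x project (note the tf.placeholder and tf.contrib usage) and are assumed to share the imports sketched here; the import path for SpatialTemporalHawkes is a placeholder, since its defining module is not shown on this page.

import sys

import arrow
import numpy as np
import tensorflow as tf

import utils                             # project-local helpers: l2_norm, PointProcessIntensityMeter
from stppg import SpatialTemporalHawkes  # placeholder import path (defining module not shown)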
class MLE_Hawkes_Generator(object):
    """
    Maximum Likelihood Estimation (MLE) Based Point Process Generator
    """
    def __init__(self,
                 T,
                 S,
                 layers,
                 n_comp,
                 batch_size,
                 C=1.,
                 data_dim=3,
                 keep_latest_k=None,
                 lr=1e-3,
                 reg_scale=0.):
        """
        Params:
        - T: the maximum time of the sequences
        - S: the space of location
        - C: the constant in diffusion kernel
        - batch_size:    batch size of the training data
        - maximum:       upper bound of the conditional intensity
        - data_dim:      data dimension (=3 by default)
        - keep_latest_k: only compute latest k points in log-likelihood calculation
        - lr:            learning rate for the SGD optimizer
        """
        self.batch_size = batch_size
        # Hawkes process
        self.hawkes = SpatialTemporalHawkes(T,
                                            S,
                                            layers=layers,
                                            n_comp=n_comp,
                                            C=C,
                                            maximum=1e+3,
                                            verbose=False)
        # regularization
        l1_regularizer = tf.contrib.layers.l1_regularizer(scale=reg_scale,
                                                          scope=None)
        penalty_term = tf.contrib.layers.apply_regularization(
            l1_regularizer, self.hawkes.Wss)
        # input tensors: expert sequences (time, location, marks)
        self.input_seqs = tf.placeholder(
            tf.float32,
            [batch_size, None, data_dim])  # [batch_size, seq_len, data_dim]
        self.cost = -1 * self.log_likelihood(
            S, keep_latest_k=keep_latest_k) / batch_size  # + penalty_term
        # Adam optimizer
        global_step = tf.Variable(0, trainable=False)
        learning_rate = tf.train.exponential_decay(lr,
                                                   global_step,
                                                   decay_steps=100,
                                                   decay_rate=0.99,
                                                   staircase=True)
        self.optimizer = tf.train.AdamOptimizer(learning_rate,
                                                beta1=0.6,
                                                beta2=0.9).minimize(
                                                    self.cost,
                                                    global_step=global_step)

    def log_likelihood(self, S, keep_latest_k):
        """
        Compute the log-likelihood of the input data given the Hawkes point process.
        """
        # log-likelihood
        loglikli = 0.
        for b in range(self.batch_size):
            seq = self.input_seqs[b, :, :]
            mask_t = tf.cast(seq[:, 0] > 0, tf.float32)
            trunc_seq = tf.boolean_mask(seq, mask_t)
            seq_len = tf.shape(trunc_seq)[0]
            # calculate the log conditional pdf for each data point in the sequence.
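            # note: the tf.scan accumulator is unused here; step i evaluates the
            # log conditional pdf of the i-th point given the preceding points.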
            loglikli += tf.reduce_sum(
                tf.scan(
                    lambda a, i: self.hawkes.log_conditional_pdf(
                        trunc_seq[:i, :], keep_latest_k=keep_latest_k),
                    tf.range(1, seq_len +
                             1),  # from the first point to the last point
                    initializer=np.array(0., dtype=np.float32)))
        return loglikli

    def train(
            self,
            sess,
            epoches,  # number of epochs (number of passes over the entire dataset)
            expert_seqs,  # [n, seq_len, data_dim=3]
            pretrained=False):
        """train the point process generator given expert sequences."""

        # initialization
        if not pretrained:
            # initialize network parameters
            init_op = tf.global_variables_initializer()
            sess.run(init_op)
            print("[%s] parameters are initialized." % arrow.now(),
                  file=sys.stderr)

        # data configurations
        # - number of expert sequences
        n_data = expert_seqs.shape[0]
        # - number of batches
        n_batches = int(n_data / self.batch_size)

        # training over epochs
        for epoch in range(epoches):
            # shuffle indices of the training samples
            shuffled_ids = np.arange(n_data)
            np.random.shuffle(shuffled_ids)

            # training over batches
            avg_train_cost = []
            for b in range(n_batches):
                idx = np.arange(self.batch_size * b, self.batch_size * (b + 1))
                # training indices selected in the current batch
                batch_train_ids = shuffled_ids[idx]
                # training batch data
                batch_train_seqs = expert_seqs[batch_train_ids, :, :]
                # optimization procedure
                sess.run(self.optimizer,
                         feed_dict={self.input_seqs: batch_train_seqs})
                # cost for the current training batch
                train_cost = sess.run(
                    self.cost, feed_dict={self.input_seqs: batch_train_seqs})
                print("[%s] batch training cost: %.2f." %
                      (arrow.now(), train_cost),
                      file=sys.stderr)
                # record cost for each batch
                avg_train_cost.append(train_cost)

            # training log output
            avg_train_cost = np.mean(avg_train_cost)
            print('[%s] Epoch %d (n_train_batches=%d, batch_size=%d)' %
                  (arrow.now(), epoch, n_batches, self.batch_size),
                  file=sys.stderr)
            print('[%s] Training cost:\t%f' % (arrow.now(), avg_train_cost),
                  file=sys.stderr)
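A minimal driver sketch for MLE_Hawkes_Generator, assuming zero-padded expert sequences of shape [n, seq_len, 3]; the data path, the T/S formats, and all hyper-parameter values below are illustrative placeholders rather than the project's actual settings.

# Hypothetical usage of MLE_Hawkes_Generator; every value here is illustrative.
expert_seqs = np.load("data/expert_seqs.npy")  # [n, seq_len, 3], zero-padded
tf.reset_default_graph()
mle_model = MLE_Hawkes_Generator(
    T=(0., 10.), S=[(-1., 1.), (-1., 1.)],     # placeholder time horizon and location space
    layers=[5], n_comp=5, batch_size=32,
    C=1., data_dim=3, keep_latest_k=None, lr=1e-3, reg_scale=0.)
with tf.Session() as sess:
    mle_model.train(sess, epoches=10, expert_seqs=expert_seqs)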
Example 5
class RL_Hawkes_Generator(object):
    """
    Reinforcement Learning Based Point Process Generator
    """
    def __init__(self, T, S, C=1., maximum=1e+4):
        """
        Params:
        - T: the maximum time of the sequences
        - S: the space of location
        - C: the constant in diffusion kernel
        """
        # model hyper-parameters
        self.T = T  # maximum time
        self.S = S  # location space
        # Hawkes process generator
        self.hawkes = SpatialTemporalHawkes(C=C, maximum=maximum)

    def _rebuild_policy_optimizer(self, sess, batch_size, lr=1e-2):
        """
        Rebuild the policy-gradient cost and optimizer on freshly sampled learner sequences.
        """
        # generated tensors: learner sequences (time, location, loglikelihood)
        learner_seq_t, learner_seq_l, learner_seq_loglik = self.hawkes.get_learner_seqs(
            sess, self.T, self.S, batch_size)

        # concatenate batches in the sequences
        expert_seq_t,  expert_seq_l = \
            self.__concatenate_batch(self.input_seq_t), \
            self.__concatenate_batch(self.input_seq_l)
        learner_seq_t, learner_seq_l, learner_seq_loglik = \
            self.__concatenate_batch(learner_seq_t), \
            self.__concatenate_batch(learner_seq_l), \
            self.__concatenate_batch(learner_seq_loglik)
        print("[%s] rebuiding reward." % arrow.now(), file=sys.stderr)
        # calculate average rewards
        reward = self._reward(batch_size, self.T[0], self.T[1],\
                              expert_seq_t,  expert_seq_l, learner_seq_t, learner_seq_l) # [batch_size*seq_len, 1]
        print("[%s] rebuiding optimizer." % arrow.now(), file=sys.stderr)
        # cost and optimizer
        self.cost = tf.reduce_sum(tf.multiply(reward, learner_seq_loglik),
                                  axis=0) / batch_size
        # global_step    = tf.Variable(0, trainable=False)
        # learning_rate  = tf.train.exponential_decay(starter_learning_rate, global_step, decay_step, decay_rate, staircase=True)
        # self.optimizer = tf.train.AdamOptimizer(learning_rate, beta1=0.6, beta2=0.9).minimize(self.cost, global_step=global_step)
        self.optimizer = tf.train.GradientDescentOptimizer(lr).minimize(
            self.cost)

    def _reward(
            self,
            batch_size,
            t0,
            T,
            expert_seq_t,
            expert_seq_l,  # expert sequences
            learner_seq_t,
            learner_seq_l,  # learner sequences
            kernel_bandwidth=0.5):
        """reward function"""
        # get mask for concatenated expert and learner sequences
        expert_seq_mask = self.__get_mask_truncate_by_T(
            expert_seq_t, T, t0)  # [batch_size*seq_len, 1]
        learner_seq_mask = self.__get_mask_truncate_by_T(
            learner_seq_t, T, t0)  # [batch_size*seq_len, 1]
        # calculate mask for kernel matrix
        learner_learner_kernel_mask = tf.matmul(learner_seq_mask,
                                                tf.transpose(learner_seq_mask))
        expert_learner_kernel_mask = tf.matmul(expert_seq_mask,
                                               tf.transpose(learner_seq_mask))
        # concatenate each data dimension for both expert sequence and learner sequence
        # TODO: Add mark to the sequences
        # expert_seq  = tf.concat([expert_seq_t, expert_seq_l], axis=1)   # [batch_size*seq_len, t_dim+l_dim+m_dim]
        # learner_seq = tf.concat([learner_seq_t, learner_seq_l], axis=1) # [batch_size*seq_len, t_dim+l_dim+m_dim]
        expert_seq = tf.concat([expert_seq_l],
                               axis=1)  # [batch_size*seq_len, l_dim]
        learner_seq = tf.concat([learner_seq_l],
                                axis=1)  # [batch_size*seq_len, l_dim]
        # calculate upper-half kernel matrix
        learner_learner_kernel, expert_learner_kernel = self.__kernel_matrix(
            learner_seq, expert_seq,
            kernel_bandwidth)  # 2 * [batch_size*seq_len, batch_size*seq_len]
        learner_learner_kernel = tf.multiply(learner_learner_kernel,
                                             learner_learner_kernel_mask)
        expert_learner_kernel = tf.multiply(expert_learner_kernel,
                                            expert_learner_kernel_mask)
        # calculate the reward for each data point in the learner sequence
        emp_ll_mean = tf.reduce_sum(learner_learner_kernel,
                                    axis=0) * 2  # batch_size*seq_len
        emp_el_mean = tf.reduce_sum(expert_learner_kernel,
                                    axis=0) * 2  # batch_size*seq_len
        return tf.expand_dims(emp_ll_mean - emp_el_mean,
                              -1)  # [batch_size*seq_len, 1]

    @staticmethod
    def __get_mask_truncate_by_T(seq_t, T, t_0=0):
        """Masking time, location and mark sequences for the entries before the maximum time T."""
        # get basic mask where 0 if t > T else 1
        mask_t = tf.multiply(tf.cast(seq_t < T, tf.float32),
                             tf.cast(seq_t > t_0, tf.float32))
        return mask_t  # [batch_size*seq_len, 1] or [batch_size, seq_len, 1]

    @staticmethod
    def __concatenate_batch(seqs):
        """Concatenate each batch of the sequences into a single sequence."""
        array_seq = tf.unstack(seqs, axis=0)  # [batch_size, seq_len, data_dim]
        seq = tf.concat(array_seq, axis=0)  # [batch_size*seq_len, data_dim]
        return seq

    @staticmethod
    def __kernel_matrix(learner_seq, expert_seq, kernel_bandwidth):
        """
        Construct kernel matrix based on learn sequence and expert sequence, each entry of the matrix 
        is the distance between two data points in learner_seq or expert_seq. return two matrix, left_mat 
        is the distances between learn sequence and learn sequence, right_mat is the distances between 
        learn sequence and expert sequence.
        """
        # calculate l2 distances
        learner_learner_mat = utils.l2_norm(
            learner_seq,
            learner_seq)  # [batch_size*seq_len, batch_size*seq_len]
        expert_learner_mat = utils.l2_norm(
            expert_seq,
            learner_seq)  # [batch_size*seq_len, batch_size*seq_len]
        # exponential kernel
        learner_learner_mat = tf.exp(-learner_learner_mat / kernel_bandwidth)
        expert_learner_mat = tf.exp(-expert_learner_mat / kernel_bandwidth)
        return learner_learner_mat, expert_learner_mat

    def train(
            self,
            sess,
            batch_size,
            epoches,  # number of epochs (number of passes over the entire dataset)
            expert_seq_t,  # [n, seq_len, 1]
            expert_seq_l,  # [n, seq_len, 2]
            trainplot=True,  # plot the change of intensity over epoches
            lr=1e-2,  # learning rate
            pretrained=False):
        """Train the point process generator given expert sequences."""
        # input tensors: expert sequences (time, location)
        self.input_seq_t = tf.placeholder(tf.float32, [batch_size, None, 1])
        self.input_seq_l = tf.placeholder(tf.float32, [batch_size, None, 2])

        # check the consistency of the shape of the expert sequences
        assert expert_seq_t.shape[:-1] == expert_seq_l.shape[:-1], \
            "inconsistant 'number of sequences' or 'sequence length' of input expert sequences"

        # initialization
        if not pretrained:
            print("[%s] parameters are initialized." % arrow.now(),
                  file=sys.stderr)
            # initialize network parameters
            init_op = tf.global_variables_initializer()
            sess.run(init_op)

        # data configurations
        # - number of expert sequences
        n_data = expert_seq_t.shape[0]
        # - number of batches
        n_batches = int(n_data / batch_size)

        if trainplot:
            ppim = utils.PointProcessIntensityMeter(self.T[1], batch_size)

        # training over epochs
        for epoch in range(epoches):
            # shuffle indices of the training samples
            shuffled_ids = np.arange(n_data)
            np.random.shuffle(shuffled_ids)
            # shuffled_train_ids = shuffled_ids[:n_train]
            # shuffled_test_ids  = shuffled_ids[-n_test:]

            # training over batches
            avg_train_cost = []
            for b in range(n_batches):
                idx = np.arange(batch_size * b, batch_size * (b + 1))
                # training indices selected in the current batch
                batch_train_ids = shuffled_ids[idx]
                # batch_test_ids  = shuffled_test_ids[:batch_size]
                # training batch data
                batch_train_expert_t = expert_seq_t[batch_train_ids, :, :]
                batch_train_expert_l = expert_seq_l[batch_train_ids, :, :]
                self._rebuild_policy_optimizer(sess, batch_size, lr)
                # optimization procedure
                sess.run(self.optimizer,
                         feed_dict={
                             self.input_seq_t: batch_train_expert_t,
                             self.input_seq_l: batch_train_expert_l
                         })
                # cost for the current training batch
                train_cost = sess.run(self.cost,
                                      feed_dict={
                                          self.input_seq_t:
                                          batch_train_expert_t,
                                          self.input_seq_l:
                                          batch_train_expert_l
                                      })
                print("[%s] batch training cost: %.2f." %
                      (arrow.now(), train_cost),
                      file=sys.stderr)
                # record cost for each batch
                avg_train_cost.append(train_cost)

            if trainplot:
                # update intensity plot
                learner_seq_t, learner_seq_l, _ = self.hawkes.get_learner_seqs(
                    sess, self.T, self.S, batch_size)
                ppim.update_time_intensity(batch_train_expert_t, learner_seq_t)
                ppim.update_location_intensity(batch_train_expert_l,
                                               learner_seq_l)

            # training log output
            avg_train_cost = np.mean(avg_train_cost)
            print('[%s] Epoch %d (n_train_batches=%d, batch_size=%d)' %
                  (arrow.now(), epoch, n_batches, batch_size),
                  file=sys.stderr)
            print('[%s] Training cost:\t%f' % (arrow.now(), avg_train_cost),
                  file=sys.stderr)
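For intuition, here is a minimal NumPy sketch of what the __kernel_matrix helper (used by both RL generator variants on this page) computes, under the assumption that the project's utils.l2_norm(A, B) returns pairwise squared L2 distances; kernel_matrix_np is a hypothetical name, not part of the project.

# NumPy sketch of the exponential kernel construction; assumes
# l2_norm(A, B)[i, j] == ||A[i] - B[j]||^2 (an assumption about utils.l2_norm).
def kernel_matrix_np(learner_seq, expert_seq, kernel_bandwidth):
    def l2_norm(A, B):
        # pairwise squared euclidean distances, shape [len(A), len(B)]
        return ((A[:, None, :] - B[None, :, :]) ** 2).sum(axis=-1)
    learner_learner_mat = np.exp(-l2_norm(learner_seq, learner_seq) / kernel_bandwidth)
    expert_learner_mat = np.exp(-l2_norm(expert_seq, learner_seq) / kernel_bandwidth)
    return learner_learner_mat, expert_learner_mat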
Example 6
class RL_Hawkes_Generator(object):
    """
    Reinforcement Learning Based Point Process Generator
    """
    def __init__(self,
                 T,
                 S,
                 layers,
                 n_comp,
                 batch_size,
                 C=1.,
                 maximum=1e+3,
                 keep_latest_k=None,
                 lr=1e-5,
                 eps=0.2):
        """
        Params:
        - T: the maximum time of the sequences
        - S: the space of location
        - C: the constant in diffusion kernel
        """
        # model hyper-parameters
        self.T = T  # time space
        self.S = S  # location space
        self.batch_size = batch_size  # batch size
        self.maximum = maximum  # upper bound of the conditional intensity
        # Hawkes process generator
        self.hawkes = SpatialTemporalHawkes(T,
                                            S,
                                            layers=layers,
                                            n_comp=n_comp,
                                            C=C,
                                            maximum=maximum,
                                            verbose=False)
        # input tensors: expert sequences (time, location)
        self.input_expert_seqs = tf.placeholder(tf.float32,
                                                [batch_size, None, 3])
        self.input_learner_seqs = tf.placeholder(tf.float32,
                                                 [batch_size, None, 3])
        # TODO: make eps decay exponentially
        # coaching
        # self.coached_learner_seqs = self._coaching(self.input_learner_seqs, self.input_expert_seqs, eps=eps)
        self.learner_seqs_loglik = self._log_likelihood(
            learner_seqs=self.input_learner_seqs, keep_latest_k=keep_latest_k)
        # build policy optimizer
        self._policy_optimizer(expert_seqs=self.input_expert_seqs,
                               learner_seqs=self.input_learner_seqs,
                               learner_seqs_loglik=self.learner_seqs_loglik,
                               lr=lr)

    def _log_likelihood(self, learner_seqs, keep_latest_k):
        """
        compute the log-likelihood of the input data given the hawkes point process. 
        """
        # max length of the sequence in learner_seqs
        max_len = tf.shape(learner_seqs)[1]
        # log-likelihoods
        logliklis = []
        for b in range(self.batch_size):
            seq = learner_seqs[b, :, :]
            mask_t = tf.cast(seq[:, 0] > 0, tf.float32)
            trunc_seq = tf.boolean_mask(seq, mask_t)
            seq_len = tf.shape(trunc_seq)[0]
            # calculate the log conditional pdf for each data point in the sequence.
            loglikli = tf.scan(
                lambda a, i: self.hawkes.log_conditional_pdf(
                    trunc_seq[:i, :], keep_latest_k=keep_latest_k),
                tf.range(1, seq_len +
                         1),  # from the first point to the last point
                initializer=np.array(0., dtype=np.float32))
            # padding zeros for loglikli
            paddings = tf.zeros(max_len - seq_len, dtype=tf.float32)
            loglikli = tf.concat([loglikli, paddings], axis=0)
            logliklis.append(loglikli)
        logliklis = tf.expand_dims(tf.stack(logliklis, axis=0), -1)
        return logliklis

    def _policy_optimizer(self, expert_seqs, learner_seqs, learner_seqs_loglik,
                          lr):
        """policy optimizer"""
        # concatenate batches in the sequences
        concat_expert_seq = self.__concatenate_batch(
            expert_seqs)  # [batch_size * expert_seq_len, data_dim]
        concat_learner_seq = self.__concatenate_batch(
            learner_seqs)  # [batch_size * learner_seq_len, data_dim]
        concat_learner_seq_loglik = self.__concatenate_batch(
            learner_seqs_loglik)  # [batch_size * learner_seq_len, 1]

        # calculate average rewards
        print("[%s] building reward." % arrow.now(), file=sys.stderr)
        reward = self._reward(concat_expert_seq, concat_learner_seq)
        # TODO: record the discrepancy

        # cost and optimizer
        print("[%s] building optimizer." % arrow.now(), file=sys.stderr)
        # self.cost = tf.reduce_sum(tf.multiply(reward, concat_learner_seq_loglik), axis=0) / self.batch_size
        learner_seq_len = tf.shape(learner_seqs)[1]
        reward_per_seq = tf.reduce_sum(
            tf.reshape(reward, [self.batch_size, learner_seq_len]), axis=1)
        loglik_per_seq = tf.reduce_sum(
            tf.reshape(concat_learner_seq_loglik, [self.batch_size, learner_seq_len]),
            axis=1)
        self.cost = tf.reduce_sum(reward_per_seq * loglik_per_seq) / self.batch_size
        # Adam optimizer
        global_step = tf.Variable(0, trainable=False)
        learning_rate = tf.train.exponential_decay(lr,
                                                   global_step,
                                                   decay_steps=100,
                                                   decay_rate=0.99,
                                                   staircase=True)
        self.optimizer = tf.train.AdamOptimizer(learning_rate,
                                                beta1=0.6,
                                                beta2=0.9).minimize(
                                                    self.cost,
                                                    global_step=global_step)

    def _reward(self, expert_seq, learner_seq, kb=5):
        """reward function"""
        # get mask for concatenated expert and learner sequences
        learner_mask_t = tf.expand_dims(
            tf.cast(learner_seq[:, 0] > 0, tf.float32), -1)
        expert_mask_t = tf.expand_dims(
            tf.cast(expert_seq[:, 0] > 0, tf.float32), -1)

        # calculate mask for kernel matrix
        learner_learner_kernel_mask = tf.matmul(learner_mask_t,
                                                tf.transpose(learner_mask_t))
        expert_learner_kernel_mask = tf.matmul(expert_mask_t,
                                               tf.transpose(learner_mask_t))

        # calculate upper-half kernel matrix
        # - [learner_seq_len, learner_seq_len], [expert_seq_len, learner_seq_len]
        learner_learner_kernel, expert_learner_kernel = self.__kernel_matrix(
            learner_seq, expert_seq, kb)

        learner_learner_kernel = tf.multiply(learner_learner_kernel,
                                             learner_learner_kernel_mask)
        expert_learner_kernel = tf.multiply(expert_learner_kernel,
                                            expert_learner_kernel_mask)

        # calculate the reward for each data point in the learner sequence
        emp_ll_mean = tf.reduce_sum(
            learner_learner_kernel,
            axis=0) / self.batch_size  # [batch_size * learner_seq_len]
        emp_el_mean = tf.reduce_sum(
            expert_learner_kernel,
            axis=0) / self.batch_size  # [batch_size * learner_seq_len]
        return tf.expand_dims(emp_ll_mean - emp_el_mean,
                              -1)  # [batch_size * learner_seq_len, 1]

    def _coaching(self, learner_seqs, expert_seqs, eps):
        """
        coach the learner by replacing part of generated learner sequences with the expert 
        sequence for the (greedy) exploration.
        """
        # align learner and expert sequences
        learner_seqs, expert_seqs, seq_len = self.__align_learner_expert_seqs(
            learner_seqs, expert_seqs)
        # coaching and retain mask
        p = tf.random_uniform([self.batch_size, 1, 1], 0, 1)  # [batch_size, 1, 1]
        coaching_mask = tf.tile(tf.cast(p <= eps, dtype=tf.float32),
                                [1, seq_len, 3])  # [batch_size, seq_len, 3]
        retain_mask = 1. - coaching_mask
        # replace part of learner sequences by expert sequences
        learner_seqs = tf.multiply(learner_seqs, retain_mask) + tf.multiply(
            expert_seqs, coaching_mask)
        return learner_seqs

    @staticmethod
    def __align_learner_expert_seqs(learner_seqs, expert_seqs):
        """
        Align learner sequences and expert sequences, i.e., make the two batches of sequences
        have the same sequence length by padding zeros at the tail.
        """
        batch_size = tf.shape(learner_seqs)[0]
        learner_seq_len = tf.shape(learner_seqs)[1]
        expert_seq_len = tf.shape(expert_seqs)[1]
        max_seq_len = tf.cond(tf.less(learner_seq_len, expert_seq_len),
                              lambda: expert_seq_len, lambda: learner_seq_len)
        learner_paddings = tf.zeros(
            [batch_size, max_seq_len - learner_seq_len, 3])
        expert_paddings = tf.zeros(
            [batch_size, max_seq_len - expert_seq_len, 3])
        learner_seqs = tf.concat([learner_seqs, learner_paddings], axis=1)
        expert_seqs = tf.concat([expert_seqs, expert_paddings], axis=1)
        return learner_seqs, expert_seqs, max_seq_len

    @staticmethod
    def __concatenate_batch(seqs):
        """Concatenate each batch of the sequences into a single sequence."""
        array_seq = tf.unstack(seqs, axis=0)  # [batch_size, seq_len, data_dim]
        seq = tf.concat(array_seq, axis=0)  # [batch_size*seq_len, data_dim]
        return seq

    @staticmethod
    def __kernel_matrix(learner_seq, expert_seq, kernel_bandwidth):
        """
        Construct kernel matrix based on learn sequence and expert sequence, each entry of the matrix 
        is the distance between two data points in learner_seq or expert_seq. return two matrix, left_mat 
        is the distances between learn sequence and learn sequence, right_mat is the distances between 
        learn sequence and expert sequence.
        """
        # calculate l2 distances
        learner_learner_mat = utils.l2_norm(
            learner_seq, learner_seq
        )  # [batch_size*learner_seq_len, batch_size*learner_seq_len]
        expert_learner_mat = utils.l2_norm(
            expert_seq, learner_seq
        )  # [batch_size*expert_seq_len, batch_size*learner_seq_len]
        # exponential kernel
        learner_learner_mat = tf.exp(-learner_learner_mat / kernel_bandwidth)
        expert_learner_mat = tf.exp(-expert_learner_mat / kernel_bandwidth)
        return learner_learner_mat, expert_learner_mat

    def mmd(self, sess, expert_seqs, learner_seqs):
        """
        Estimate the discrepancy (MMD-style reward) between expert and learner sequences.
        """
        batch_size = expert_seqs.shape[0]
        # convert to tensors
        expert_seqs = tf.constant(expert_seqs, dtype=tf.float32)
        learner_seqs = tf.constant(learner_seqs, dtype=tf.float32)
        # concatenate batches in the sequences
        concat_expert_seq = self.__concatenate_batch(
            expert_seqs)  # [batch_size * expert_seq_len, data_dim]
        concat_learner_seq = self.__concatenate_batch(
            learner_seqs)  # [batch_size * learner_seq_len, data_dim]
        # calculate the reward (mmd)
        reward = tf.reduce_sum(
            self._reward(concat_expert_seq, concat_learner_seq)) / batch_size
        return sess.run(reward)

    def train(
            self,
            sess,
            epoches,  # number of epochs (number of passes over the entire dataset)
            expert_seqs,  # [n, seq_len, 3]
            trainplot=True,  # plot the change of intensity over epoches
            pretrained=False):
        """Train the point process generator given expert sequences."""

        # initialization
        if not pretrained:
            print("[%s] parameters are initialized." % arrow.now(),
                  file=sys.stderr)
            # initialize network parameters
            init_op = tf.global_variables_initializer()
            sess.run(init_op)

        # data configurations
        # - number of expert sequences
        n_data = expert_seqs.shape[0]
        # - number of batches
        n_batches = int(n_data / self.batch_size)

        # training over epochs
        all_train_cost = []
        for epoch in range(epoches):
            # shuffle indices of the training samples
            shuffled_ids = np.arange(n_data)
            np.random.shuffle(shuffled_ids)

            # training over batches
            avg_train_cost = []
            for b in range(n_batches):
                idx = np.arange(self.batch_size * b, self.batch_size * (b + 1))
                # training indices selected in the current batch
                batch_train_ids = shuffled_ids[idx]
                # training batch data
                batch_train_expert = expert_seqs[batch_train_ids, :, :]
                batch_train_learner = self.hawkes.sampling(
                    sess, self.batch_size)
                # optimization procedure
                sess.run(self.optimizer,
                         feed_dict={
                             self.input_expert_seqs: batch_train_expert,
                             self.input_learner_seqs: batch_train_learner
                         })
                # cost for the current training batch
                train_cost = sess.run(self.cost,
                                      feed_dict={
                                          self.input_expert_seqs:
                                          batch_train_expert,
                                          self.input_learner_seqs:
                                          batch_train_learner
                                      })
                print("[%s] batch training cost: %.2f." %
                      (arrow.now(), train_cost),
                      file=sys.stderr)
                # record cost for each batch
                avg_train_cost.append(train_cost)
                all_train_cost.append(train_cost)

            # training log output
            avg_train_cost = np.mean(avg_train_cost)
            print('[%s] Epoch %d (n_train_batches=%d, batch_size=%d)' % \
                (arrow.now(), epoch, n_batches, self.batch_size), file=sys.stderr)
            print('[%s] Training cost:\t%f' % (arrow.now(), avg_train_cost),
                  file=sys.stderr)

        # save all training cost into numpy file.
        np.savetxt("results/robbery_rl_train_cost.txt",
                   all_train_cost,
                   delimiter=",")