Code Example #1
File: PointProcess.py  Project: nicktianboli/PoPPy
    def validation(self,
                   dataloader,
                   use_cuda,
                   verbose=True,
                   prob=1.0,
                   accuracy=False,
                   parameters=None):
        """
        Compute the avaraged loss per event of a generalized Hawkes process
        given observed sequences and current model
        :param dataloader: a pytorch batch-based data loader
        :param use_cuda: use cuda (true) or not (false)
        """
        device = torch.device('cuda:0' if use_cuda else 'cpu')
        self.lambda_model_validation.to(device)
        self.lambda_model_validation.eval()

        Cs = torch.LongTensor(
            list(range(len(dataloader.dataset.database['type2idx']))))
        Cs = Cs.view(-1, 1)
        Cs = Cs.to(device)

        if dataloader.dataset.database['event_features'] is not None:
            all_event_feature = torch.from_numpy(
                dataloader.dataset.database['event_features'])
            FCs = all_event_feature.type(torch.FloatTensor)
            FCs = torch.t(FCs)  # (num_type, dim_features)
            FCs = FCs.to(device)
        else:
            FCs = None
        if not accuracy:
            start = time.time()
            loss = 0
            prob = np.array([prob])
            prob_tensor = torch.from_numpy(prob).type(torch.FloatTensor)
            with torch.no_grad():
                for batch_idx, samples in enumerate(dataloader):
                    ci, batch_dict = samples2dict(samples, device, Cs, FCs)
                    lambda_t, Lambda_t = self.lambda_model_validation(
                        batch_dict)
                    lambda_t /= prob_tensor
                    Lambda_t /= prob_tensor
                    loss += self.loss_function(lambda_t, Lambda_t, ci)

                    # display training processes
                    if verbose:
                        if batch_idx % 100 == 0:
                            logger.info(
                                'Validation [{}/{} ({:.0f}%)]\t Time={:.2f}sec.'
                                .format(batch_idx * ci.size(0),
                                        len(dataloader.dataset),
                                        100. * batch_idx / len(dataloader),
                                        time.time() - start))
            return loss / len(dataloader.dataset)
        else:
            with torch.no_grad():
                loss = np.linalg.norm(
                    list(self.lambda_model_validation.parameters())
                    [1].data.numpy() - parameters) / self.num_type**2
            return loss
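A hedged usage sketch for the two modes of this validation method. Here `model` stands for the enclosing point-process object, `val_loader` for a PyTorch DataLoader over the same database format, and `true_alpha` for a hypothetical NumPy array of ground-truth parameters; none of these names appear in the snippet above.

    # mode 1: averaged loss per event, with intensities rescaled by a keep-probability
    nll = model.validation(val_loader, use_cuda=False, prob=0.5)

    # mode 2: parameter-recovery error against known ground-truth parameters
    err = model.validation(val_loader, use_cuda=False,
                           accuracy=True, parameters=true_alpha)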
Code Example #2
File: PointProcess.py  Project: nicktianboli/PoPPy
 def print_info(self):
     """
     Print basic information of the model.
     """
     logger.info(self.model_name)
     self.lambda_model.print_info()
     logger.info("The loss function is {}.".format(self.loss_function))
Code Example #3
File: DecayKernel.py  Project: harjotspahwa/PoPPy
 def print_info(self):
     """
     Print basic information of the kernel model.
     """
     logger.info('The type of decay kernel: {}.'.format(self.kernel_type))
     logger.info('The number of basis = {}.'.format(
         self.parameters.size(1)))
Code Example #4
 def print_info(self):
     """
     Print basic information of the exogenous intensity function.
     """
     logger.info('Exogenous intensity function: mu(t) = {}.'.format(
         self.exogenous_intensity_type))
     logger.info('The number of event types = {}.'.format(self.num_type))
Code Example #5
File: EndogenousImpact.py  Project: camroach87/PoPPy
 def print_info(self):
     """
      Print basic information of the endogenous impact function.
     """
     logger.info("Endogenous impact function: phi_(kk')(t) = {}.".format(
         self.endogenous_impact_type))
     logger.info('The number of event types = {}.'.format(self.num_type))
     self.decay_kernel.print_info()
Code Example #6
File: DataOperation.py  Project: xiaotinghe/PoPPy
def aggregating(database, dt):
    """
    Count the number of events in predefined time bins,
    and convert event sequences to aggregate time series
    :param database: the observed event sequences
    :param dt: a float number indicating the length of time bin.
    :return:
        the output's format is shown as follows:

          output = {'event_features': None or (De, C) float array of event's static features,
                              C is the number of event types.
                    'type2idx': a Dict = {'event_name': event_index}
                    'idx2type': a Dict = {event_index: 'event_name'}
                    'seq2idx': a Dict = {'seq_name': seq_index}
                    'idx2seq': a Dict = {seq_index: 'seq_name'}
                    'sequences': a List  = {seq_1, seq_2, ..., seq_N}.
                    }

        For the i-th sequence:
        seq_i = {'times': (N,) float array of discrete timestamps,
                          N = [(t_stop - t_start)/dt] is the number of bins.
                 'events': (N, C) int array of event types,
                           events[n, c] counts the number of type-c events in the n-th bin
                 'seq_feature': None or (Ds,) float array of sequence's static feature.
                 't_start': a float number indicating the start timestamp of the sequence.
                 't_stop': a float number indicating the stop timestamp of the sequence.
                 'label': None or int/float number indicating the labels of the sequence}
    """
    start = time.time()
    output = copy.deepcopy(database)
    num_types = len(database['type2idx'])
    logger.info('aggregation of event sequences is applied...')

    for i in range(len(database['sequences'])):
        seq_i = database['sequences'][i]
        num_bins = round((seq_i['t_stop'] - seq_i['t_start']) / dt) + 1
        times = np.zeros((num_bins, ))
        events = np.zeros((num_bins, num_types))

        for n in range(num_bins):
            times[n] = seq_i['t_start'] + (n + 1) * dt

        for k in range(seq_i['times'].shape[0]):
            n = int(round((seq_i['times'][k] - seq_i['t_start']) / dt))
            c = seq_i['events'][k]
            events[n, c] += 1

        output['sequences'][i]['times'] = times
        output['sequences'][i]['events'] = events

        if i % 1000 == 0:
            logger.info(
                '{} sequences have been aggregated... Time={}ms.'.format(
                    i, round(1000 * (time.time() - start))))

    return output
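A minimal usage sketch for aggregating(), assuming the function's module-level imports (numpy, copy, time, logger) are available; the toy database below follows the documented format and its values are made up for illustration.

    import numpy as np

    toy_db = {
        'event_features': None,
        'type2idx': {'A': 0, 'B': 1},
        'idx2type': {0: 'A', 1: 'B'},
        'seq2idx': {'seq_0': 0},
        'idx2seq': {0: 'seq_0'},
        'sequences': [{'times': np.array([0.4, 1.2, 1.3, 2.8]),
                       'events': np.array([0, 1, 0, 1]),
                       'seq_feature': None,
                       't_start': 0.0,
                       't_stop': 3.0,
                       'label': None}],
    }

    binned = aggregating(toy_db, dt=1.0)
    print(binned['sequences'][0]['times'])   # bin timestamps, shape (num_bins,)
    print(binned['sequences'][0]['events'])  # per-bin event counts, shape (num_bins, num_types)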
Code Example #7
File: DataOperation.py  Project: zehsilva/PoPPy
    def __init__(self, database, memorysize):
        """
        :param database: the observed event sequences
            database = {'event_features': None or (C, De) float array of event's static features,
                                      C is the number of event types.
                        'type2idx': a Dict = {'event_name': event_index}
                        'idx2type': a Dict = {event_index: 'event_name'}
                        'seq2idx': a Dict = {'seq_name': seq_index}
                        'idx2seq': a Dict = {seq_index: 'seq_name'}
                        'sequences': a List  = {seq_1, seq_2, ..., seq_N}.
                        }

            For the i-th sequence:
            seq_i = {'times': (N,) float array of timestamps, N is the number of events.
                     'events': (N,) int array of event types.
                     'seq_feature': None or (Ds,) float array of sequence's static feature.
                     't_start': a float number indicating the start timestamp of the sequence.
                     't_stop': a float number indicating the stop timestamp of the sequence.
                     'label': None or int/float number indicating the labels of the sequence}
        :param memorysize: how many historical events are remembered for each event
        """
        self.event_cell = []
        self.time_cell = []
        self.database = database
        self.memory_size = memorysize
        for i in range(len(database['sequences'])):
            seq_i = database['sequences'][i]
            times = seq_i['times']
            events = seq_i['events']
            t_start = seq_i['t_start']
            print(events.shape)
            for j in range(len(events)):
                target = events[j]
                # former = np.zeros((memorysize,), dtype=np.int)
                # former = np.random.permutation(len(self.database['type2idx']))
                # former = former[:memorysize]
                former = np.random.choice(len(self.database['type2idx']),
                                          memorysize)
                target_t = times[j]
                former_t = t_start * np.ones((memorysize, ))

                if 0 < j < memorysize:
                    former[-j:] = events[:j]
                    former_t[-j:] = times[:j]
                elif j >= memorysize:
                    former = events[j - memorysize:j]
                    former_t = times[j - memorysize:j]

                self.event_cell.append((target, former, i))
                self.time_cell.append((target_t, former_t))
        logger.info('In this dataset, the number of events = {}.'.format(
            len(self.event_cell)))
        logger.info(
            'Each event is influenced by its last {} historical events.'.
            format(self.memory_size))
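The sketch below isolates the history-window logic from the loop above: for the j-th event, the trailing slots of former/former_t hold the true history, and any remaining slots keep the random-type and t_start padding. The arrays and memorysize value are toy data for illustration only.

    import numpy as np

    events = np.array([2, 0, 1, 2, 0])
    times = np.array([0.5, 1.0, 1.7, 2.2, 3.0])
    t_start, memorysize, num_types = 0.0, 3, 3

    for j in range(len(events)):
        former = np.random.choice(num_types, memorysize)  # random padding
        former_t = t_start * np.ones((memorysize, ))
        if 0 < j < memorysize:                            # partial history
            former[-j:] = events[:j]
            former_t[-j:] = times[:j]
        elif j >= memorysize:                             # full window of the last memorysize events
            former = events[j - memorysize:j]
            former_t = times[j - memorysize:j]
        print(j, former, former_t)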
Code Example #8
File: EndogenousImpact.py  Project: camroach87/PoPPy
 def plot_and_save(self, infect: torch.Tensor, output_name: str = None):
     """
     Plot endogenous impact function for all event types
     Args:
     :param infect: a (num_type, num_type+1, M) FloatTensor containing all endogenous impact
     :param output_name: the name of the output png file
     """
     impact = infect.sum(2).data.cpu().numpy()
     plt.figure(figsize=(5, 5))
     plt.imshow(impact)
     plt.colorbar()
     if output_name is None:
         plt.savefig('endogenous_impact.png')
     else:
         plt.savefig(output_name)
     plt.close("all")
     logger.info("Done!")
Code Example #9
    def validation(self, dataloader, use_cuda):
        """
        Compute the averaged loss per event of a generalized Hawkes process given observed sequences and the current model
        :param dataloader: a pytorch batch-based data loader
        :param use_cuda: use cuda (true) or not (false)
        """
        device = torch.device('cuda:0' if use_cuda else 'cpu')
        self.lambda_model.to(device)
        self.lambda_model.eval()

        Cs = torch.LongTensor(
            list(range(len(dataloader.dataset.database['type2idx']))))
        Cs = Cs.view(-1, 1)
        Cs = Cs.to(device)

        if dataloader.dataset.database['event_features'] is not None:
            all_event_feature = torch.from_numpy(
                dataloader.dataset.database['event_features'])
            FCs = all_event_feature.type(torch.FloatTensor)
            FCs = torch.t(FCs)  # (num_type, dim_features)
            FCs = FCs.to(device)
        else:
            FCs = None

        start = time.time()
        loss = 0
        for batch_idx, samples in enumerate(dataloader):
            ci, batch_dict = samples2dict(samples, device, Cs, FCs)
            batch_loss = 0
            for m in range(self.num_cluster):
                weight = self.responsibility[batch_dict['sn'][:, 0],
                                             m]  # (batch_size, )
                lambda_t, Lambda_t = self.lambda_model[m](batch_dict)
                loss_m = self.loss_function(lambda_t, Lambda_t,
                                            ci)  # (batch_size, )
                batch_loss += (weight * loss_m).sum() / loss_m.size(0)
            loss += batch_loss  # accumulate the weighted loss over all batches

            # display training processes
            if batch_idx % 100 == 0:
                logger.info(
                    'Validation [{}/{} ({:.0f}%)]\t Time={:.2f}sec.'.format(
                        batch_idx * ci.size(0), len(dataloader.dataset),
                        100. * batch_idx / len(dataloader),
                        time.time() - start))
        return loss / len(dataloader.dataset)
Code Example #10
    def plot_and_save(self, mu_all: torch.Tensor, output_name: str = None):
        """
        Plot the stem plot of exogenous intensity functions for all event types
        Args:
        :param mu_all: a (num_type, 1) FloatTensor containing all exogenous intensity functions
        :param output_name: the name of the output png file
        """
        mu_all = mu_all.squeeze(1)  # (C,)
        mu_all = mu_all.data.cpu().numpy()

        plt.figure(figsize=(5, 5))
        plt.stem(range(mu_all.shape[0]), mu_all, '-')
        plt.ylabel('Exogenous intensity')
        plt.xlabel('Index of event type')
        if output_name is None:
            plt.savefig('exogenous_intensity.png')
        else:
            plt.savefig(output_name)
        plt.close("all")
        logger.info("Done!")
Code Example #11
File: DecayKernel.py  Project: harjotspahwa/PoPPy
    def plot_and_save(self, t_stop: float = 5.0, output_name: str = None):
        """
        Plot decay function and its integration and save the figure as a png file
        Args:
            t_stop (float): the end timestamp of the plotted time range
            output_name (str): the name of the output png file
        """

        dt = np.arange(0.0, t_stop, 0.01)
        dt = np.tile(dt, (1, 1))
        dt = torch.from_numpy(dt)
        dt = dt.type(torch.FloatTensor)
        gt = self.values(dt)
        # t_start = torch.zeros(dt.size())
        igt = self.integrations(dt)
        # print(gt.shape)

        plt.figure(figsize=(5, 5))
        for k in range(gt.shape[2]):
            plt.plot(dt[0, :].cpu().numpy(),
                     gt[0, :, k].cpu().numpy(),
                     label='g_{}(t)'.format(k),
                     c='r')
            plt.plot(dt[0, :].cpu().numpy(),
                     igt[0, :, k].cpu().numpy(),
                     label='G_{}(t)'.format(k),
                     c='b')
        leg = plt.legend(loc='upper left', ncol=1, shadow=True, fancybox=True)
        leg.get_frame().set_alpha(0.5)
        plt.title('{} decay kernel and its integration'.format(
            self.kernel_type))
        if output_name is None:
            plt.savefig('{}_decay_kernel.png'.format(self.kernel_type))
        else:
            plt.savefig(output_name)
        plt.close("all")
        logger.info("Done!")
Code Example #12
 def print_info(self):
     """
     Print basic information of the model.
     """
     logger.info(self.model_name)
     for m in range(self.num_cluster):
         logger.info('Component {}, probability = {:.6f}'.format(
             m, self.prob_cluster[m]))
         self.lambda_model[m].print_info()
     logger.info("The loss function is {}.".format(self.loss_function))
Code Example #13
File: PointProcess.py  Project: xiaotinghe/PoPPy
 def save_model(self, full_path, mode: str='entire'):
     """
     Save trained model
     :param full_path: the path of directory
     :param mode: 'parameter' for saving only parameters of the model,
                  'entire' for saving entire model
     """
     if mode == 'entire':
         torch.save(self.lambda_model, full_path)
         logger.info('The entire model is saved in {}.'.format(full_path))
     elif mode == 'parameter':
         torch.save(self.lambda_model.state_dict(), full_path)
          logger.info('The parameters of the model are saved in {}.'.format(full_path))
      else:
          logger.warning("'{}' is an undefined mode, we use 'entire' mode instead.".format(mode))
         torch.save(self.lambda_model, full_path)
         logger.info('The entire model is saved in {}.'.format(full_path))
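A hedged sketch of the loading counterpart for the two save modes above. The file names are placeholders, and for 'parameter' mode an architecture-compatible module (called lambda_model here) is assumed to have been constructed beforehand.

    import torch

    # 'entire' mode: the whole module object was pickled, so torch.load returns it directly
    lambda_model = torch.load('hawkes_model.pt')

    # 'parameter' mode: only the state_dict was saved, so load it into an existing module
    lambda_model.load_state_dict(torch.load('hawkes_params.pt'))
    lambda_model.eval()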
Code Example #14
    def fit(self,
            dataloader,
            optimizer,
            epochs: int,
            scheduler=None,
            sparsity: float = None,
            nonnegative=None,
            use_cuda: bool = False,
            validation_set=None,
            track_diagnostics=False):
        """
        Learn parameters of a generalized Hawkes process given observed sequences
        :param dataloader: a pytorch batch-based data loader
        :param optimizer: the sgd optimization method defined by PyTorch
        :param epochs: the number of training epochs
        :param scheduler: the method adjusting the learning rate of SGD defined by PyTorch
        :param sparsity: None or a float weight of L1 regularizer
        :param nonnegative: None or a float lower bound, typically the lower bound = 0
        :param use_cuda: use cuda (true) or not (false)
        :param validation_set: None or a validation dataloader
        :param track_diagnostics: set to True to record historical loss values and weights in self.diagnostics
        """
        device = torch.device('cuda:0' if use_cuda else 'cpu')
        self.lambda_model.to(device)
        best_model = None
        self.lambda_model.train()

        if nonnegative is not None:
            clipper = LowerBoundClipper(nonnegative)

        Cs = torch.LongTensor(
            list(range(len(dataloader.dataset.database['type2idx']))))
        Cs = Cs.view(-1, 1)
        Cs = Cs.to(device)

        if dataloader.dataset.database['event_features'] is not None:
            all_event_feature = torch.from_numpy(
                dataloader.dataset.database['event_features'])
            FCs = all_event_feature.type(torch.FloatTensor)
            FCs = torch.t(FCs)  # (num_type, dim_features)
            FCs = FCs.to(device)
        else:
            FCs = None

        if validation_set is not None:
            validation_loss = self.validation(validation_set, use_cuda)
            logger.info(
                'In the beginning, validation loss per event: {:.6f}.\n'.
                format(validation_loss))
            best_loss = validation_loss
        else:
            best_loss = np.inf

        if track_diagnostics:
            self.diagnostics = Diagnostics()

        for epoch in range(epochs):
            if scheduler is not None:
                scheduler.step()
            start = time.time()
            for batch_idx, samples in enumerate(dataloader):
                ci, batch_dict = samples2dict(samples, device, Cs, FCs)
                optimizer.zero_grad()
                lambda_t, Lambda_t = self.lambda_model(batch_dict)
                loss = self.loss_function(lambda_t, Lambda_t,
                                          ci) / lambda_t.size(0)
                reg = 0
                if sparsity is not None:
                    for parameter in self.lambda_model.parameters():
                        reg += sparsity * torch.sum(torch.abs(parameter))
                loss_total = loss + reg
                loss_total.backward()
                optimizer.step()
                if nonnegative is not None:
                    self.lambda_model.apply(clipper)

                if track_diagnostics:
                    self.diagnostics.loss.append(loss.data.item())
                    self.diagnostics.mu.append(
                        self.lambda_model.exogenous_intensity.emb.weight.
                        squeeze().tolist())
                    self.diagnostics.alpha.append(
                        self.lambda_model.endogenous_intensity.basis[0].weight.
                        squeeze().tolist())

                # display training processes
                if batch_idx % 100 == 0:
                    logger.info('Train Epoch: {} [{}/{} ({:.0f}%)]'.format(
                        epoch, batch_idx * ci.size(0), len(dataloader.dataset),
                        100. * batch_idx / len(dataloader)))
                    if sparsity is not None:
                        logger.info(
                            'Loss per event: {:.6f}, Regularizer: {:.6f} Time={:.2f}sec'
                            .format(loss.data, reg.data,
                                    time.time() - start))
                    else:
                        logger.info(
                            'Loss per event: {:.6f}, Regularizer: {:.6f} Time={:.2f}sec'
                            .format(loss.data, 0,
                                    time.time() - start))

            if validation_set is not None:
                validation_loss = self.validation(validation_set, use_cuda)
                logger.info(
                    'After Epoch: {}, validation loss per event: {:.6f}.\n'.
                    format(epoch, validation_loss))
                if validation_loss < best_loss:
                    best_model = copy.deepcopy(self.lambda_model)
                    best_loss = validation_loss

        if best_model is not None:
            self.lambda_model = copy.deepcopy(best_model)
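A hedged usage sketch for this fit() method. Here `model` is assumed to be the enclosing point-process object, and `train_loader`/`val_loader` PyTorch DataLoaders built from the dataset classes shown elsewhere on this page; the hyperparameter values are illustrative only.

    import torch.optim as optim

    optimizer = optim.Adam(model.lambda_model.parameters(), lr=0.01)
    scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.8)

    model.fit(train_loader,
              optimizer,
              epochs=5,
              scheduler=scheduler,
              sparsity=1e-2,      # weight of the L1 regularizer
              nonnegative=0.0,    # clip parameters to stay >= 0
              use_cuda=False,
              validation_set=val_loader)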
Code Example #15
File: PointProcess.py  Project: nicktianboli/PoPPy
    def fit(self,
            dataloader,
            optimizer,
            epochs: int,
            scheduler=None,
            sparsity: float = None,
            nonnegative=None,
            use_cuda: bool = False,
            validation_set=None,
            verbose=True,
            prob: float = 1.0,
            accuracy=False,
            parameters=None):
        """
        Learn parameters of a generalized Hawkes process given observed sequences
        :param dataloader: a pytorch batch-based data loader
        :param optimizer: the sgd optimization method defined by PyTorch
        :param epochs: the number of training epochs
        :param scheduler: the method adjusting the learning rate of SGD defined by PyTorch
        :param sparsity: None or a float weight of L1 regularizer
        :param nonnegative: None or a float lower bound, typically the lower bound = 0
        :param use_cuda: use cuda (true) or not (false)
        :param validation_set: None or a validation dataloader
        :param verbose: whether to log progress messages
        :param prob: a scaling factor passed to the validation method
        :param accuracy: if True, validation returns a parameter-recovery error instead of a loss
        :param parameters: ground-truth parameters used when accuracy is True
        """
        device = torch.device('cuda:0' if use_cuda else 'cpu')
        self.lambda_model.to(device)
        best_model = None
        self.lambda_model.train()
        self.mu_path.append(
            copy.deepcopy(list(self.lambda_model.parameters())[0].data))
        self.alpha_path.append(
            copy.deepcopy(list(self.lambda_model.parameters())[1].data))

        if nonnegative is not None:
            clipper = LowerBoundClipper(nonnegative)

        Cs = torch.LongTensor(
            list(range(len(dataloader.dataset.database['type2idx']))))
        Cs = Cs.view(-1, 1)
        Cs = Cs.to(device)

        if dataloader.dataset.database['event_features'] is not None:
            all_event_feature = torch.from_numpy(
                dataloader.dataset.database['event_features'])
            FCs = all_event_feature.type(torch.FloatTensor)
            FCs = torch.t(FCs)  # (num_type, dim_features)
            FCs = FCs.to(device)
        else:
            FCs = None

        if validation_set is not None:
            validation_loss = self.validation(validation_set, use_cuda,
                                              verbose, prob, accuracy,
                                              parameters)
            logger.info(
                'In the beginning, validation loss per event: {:.6f}.\n'.
                format(validation_loss))
            best_loss = validation_loss
            self.learning_path.append(validation_loss)
        else:
            validation_loss = np.inf  # keeps the logging below well-defined when no validation set is given
            best_loss = np.inf

        start0 = time.time()

        self.training_time.append(time.time() - start0)
        for epoch in range(epochs):
            if scheduler is not None:
                scheduler.step()
            start = time.time()

            for batch_idx, samples in enumerate(dataloader):
                ci, batch_dict = samples2dict(samples, device, Cs, FCs)
                optimizer.zero_grad()
                lambda_t, Lambda_t = self.lambda_model(batch_dict)
                loss = self.loss_function(lambda_t, Lambda_t,
                                          ci) / lambda_t.size(0)
                reg = 0
                if sparsity is not None:
                    for parameter in self.lambda_model.parameters():
                        reg += sparsity * torch.sum(torch.abs(parameter))
                loss_total = loss + reg
                loss_total.backward()
                optimizer.step()
                if nonnegative is not None:
                    self.lambda_model.apply(clipper)

                if validation_set is not None:
                    validation_loss = self.validation(validation_set, use_cuda,
                                                      verbose, prob, accuracy,
                                                      parameters)
                    if verbose:
                        logger.info(
                            'After Epoch: {}, validation loss per event: {:.6f}.\n'
                            .format(epoch, validation_loss))
                    if validation_loss < best_loss:
                        best_model = copy.deepcopy(self.lambda_model)
                        best_loss = validation_loss

                esti_loss = loss_total.data

                # display training processes
                if verbose:
                    if batch_idx % 100 == 0:
                        logger.info('Train Epoch: {} [{}/{} ({:.0f}%)]'.format(
                            epoch, batch_idx * ci.size(0),
                            len(dataloader.dataset),
                            100. * batch_idx / len(dataloader)))
                        if sparsity is not None:
                            logger.info(
                                'Loss per event: {:.3f}, Regularizer: {:.3f}, Validate Loss: {:.3f}, Time={:.2f}sec'
                                .format(esti_loss.data, reg.data,
                                        validation_loss,
                                        time.time() - start))
                        else:
                            logger.info(
                                'Loss per event: {:.3f}, Regularizer: {:.3f}, Loss: {:.6f}, Time={:.2f}sec'
                                .format(esti_loss.data, 0, validation_loss,
                                        time.time() - start))

                self.learning_path.append(loss_total)
                self.validation_path.append(validation_loss)
                self.training_time.append(time.time() - start0)
                self.mu_path.append(
                    copy.deepcopy(
                        list(self.lambda_model.parameters())[0].data))
                self.alpha_path.append(
                    copy.deepcopy(
                        list(self.lambda_model.parameters())[1].data))
                self.lambda_path.append(lambda_t)
                self.Lambda_path.append(Lambda_t)

            logger.info(
                'Epoch : {}/{}, Used time: {: .2f} min, Estimated Time to finish: {: .2f} min, train loss: {: .3f}, validation loss: {: .3f}'
                .format((epoch + 1), epochs, self.training_time[-1] / 60,
                        self.training_time[-1] / 60 / (epoch + 1) *
                        (epochs - epoch - 1), loss_total, validation_loss))

        if best_model is not None:
            self.lambda_model = copy.deepcopy(best_model)
Code Example #16
File: PointProcess.py  Project: nicktianboli/PoPPy
    def simulate(self,
                 history,
                 memory_size: int = 10,
                 time_window: float = 1.0,
                 interval: float = 1.0,
                 max_number: int = 100,
                 use_cuda: bool = False):
        """
        Simulate one or more event sequences from given model.
        :param history: historical observations
            history = {'event_features': None or (C, De) float array of event's static features,
                                  C is the number of event types.
                       'type2idx': a Dict = {'event_name': event_index}
                       'idx2type': a Dict = {event_index: 'event_name'}
                       'seq2idx': a Dict = {'seq_name': seq_index}
                       'idx2seq': a Dict = {seq_index: 'seq_name'}
                       'sequences': a List  = {seq_1, seq_2, ..., seq_N}.
                      }

            For the i-th sequence:
            seq_i = {'times': (N,) float array of timestamps, N is the number of events.
                     'events': (N,) int array of event types.
                     N can be "0" (i.e., no observations)
                     'seq_feature': None or (Ds,) float array of sequence's static feature.
                     't_start': a float number indicating the start timestamp of the sequence.
                     't_stop': a float number indicating the stop timestamp of the sequence.
                     'label': None or int/float number indicating the labels of the sequence}
        :param memory_size: the number of historical events used for simulation
        :param time_window: duration of simulation process.
        :param interval: the interval size calculating the supremum of intensity
        :param max_number: the maximum number of simulated events
        :param use_cuda: use cuda (true) or not (false)
        :return:
            new_data: having the same format as history
            counts: a list of (C,) ndarray, which counts the number of simulated events for each type
        """
        device = torch.device('cuda:0' if use_cuda else 'cpu')
        self.lambda_model.to(device)
        self.lambda_model.eval()

        Cs = torch.LongTensor(list(range(len(history['type2idx']))))
        Cs = Cs.view(-1, 1)
        Cs = Cs.to(device)
        if history['event_features'] is not None:
            all_event_feature = torch.from_numpy(history['event_features'])
            FCs = all_event_feature.type(torch.FloatTensor)
            FCs = torch.t(FCs)  # (num_type, dim_features)
            FCs = FCs.to(device)
        else:
            FCs = None

        t_start = time.time()
        new_data = copy.deepcopy(history)
        # the number of new synthetic events for each type
        counts = np.zeros((self.num_type, len(new_data['sequences'])))
        for i in range(len(new_data['sequences'])):
            times_tmp = []
            events_tmp = []

            # initial point
            new_data['sequences'][i]['t_start'] = history['sequences'][i][
                't_stop']
            new_data['sequences'][i][
                't_stop'] = history['sequences'][i]['t_stop'] + time_window
            t_now = new_data['sequences'][i]['t_start'] + 0.01

            # initialize the input of intensity function
            ci = Cs
            # print(ci)
            ci = ci.to(device)
            ti = torch.FloatTensor([t_now])
            ti = ti.to(device)
            ti = ti.view(1, 1)
            ti = ti.repeat(ci.size(0), 1)

            events = history['sequences'][i]['events']
            times = history['sequences'][i]['times']
            if times is None:
                tjs = torch.FloatTensor([new_data['sequences'][i]['t_start']])
                tjs = tjs.to(device)
                cjs = torch.LongTensor(
                    [np.random.permutation(self.num_type)[0]])
                cjs = cjs.to(device)
            else:
                if memory_size > times.shape[0]:
                    tjs = torch.from_numpy(times)
                    tjs = tjs.type(torch.FloatTensor)
                    tjs = tjs.to(device)
                    cjs = torch.from_numpy(events)
                    cjs = cjs.type(torch.LongTensor)
                    cjs = cjs.to(device)
                else:
                    tjs = torch.from_numpy(times[-memory_size:])
                    tjs = tjs.type(torch.FloatTensor)
                    tjs = tjs.to(device)
                    cjs = torch.from_numpy(events[-memory_size:])
                    cjs = cjs.type(torch.LongTensor)
                    cjs = cjs.to(device)

            tjs = tjs.to(device)
            tjs = tjs.view(1, -1)
            tjs = tjs.repeat(ci.size(0), 1)
            cjs = cjs.to(device)
            cjs = cjs.view(1, -1)
            cjs = cjs.repeat(ci.size(0), 1)

            sn = torch.LongTensor([i])
            sn = sn.to(device)
            sn = sn.view(1, 1)
            sn = sn.repeat(ci.size(0), 1)

            if history['sequences'][i]['seq_feature'] is not None:
                fsn = history['sequences'][i]['seq_feature']
                fsn = torch.from_numpy(fsn)
                fsn = fsn.type(torch.FloatTensor)
                fsn = fsn.view(1, -1).repeat(ci.size(0), 1)
                fsn = fsn.to(device)
            else:
                fsn = None

            if FCs is None:
                fci = None
                fcjs = None
            else:
                fci = FCs[ci[:, 0], :]
                fcjs = FCs[cjs, :]
                fcjs = torch.transpose(fcjs, 1, 2)
                fcjs = fcjs.to(device)

            sample_dict = {
                'ti': ti,
                'tjs': tjs,
                'ci': ci,
                'cjs': cjs,
                'sn': sn,
                'fsn': fsn,
                'fci': fci,
                'fcjs': fcjs,
                'Cs': Cs,
                'FCs': FCs
            }

            while t_now < new_data['sequences'][i]['t_stop'] and len(
                    times_tmp) < max_number:
                lambda_t = self.lambda_model.intensity(sample_dict)
                sample_dict['ti'] = sample_dict['ti'] + interval
                lambda_t2 = self.lambda_model.intensity(sample_dict)
                mt = max([float(lambda_t.sum()), float(lambda_t2.sum())])

                s = np.random.exponential(1 / mt)
                if s < interval:
                    sample_dict['ti'] = sample_dict['ti'] + s - interval
                    ti = sample_dict['ti'].cpu().numpy()
                    t_now = ti[0, 0]  # float
                    lambda_s = self.lambda_model.intensity(sample_dict)
                    ms = float(lambda_s.sum())

                    u = np.random.rand()
                    ratio = ms / mt
                    if ratio > u:  # generate a new event
                        prob = lambda_s.data.cpu().numpy() / ms
                        prob = prob[:, 0]
                        # print(prob.shape)
                        # print(self.num_type)
                        ci = np.random.choice(self.num_type, p=prob)  # int

                        # add to new sequence
                        times_tmp.append(t_now)
                        events_tmp.append(ci)
                        counts[ci, i] += 1

                        # update batch_dict
                        ti = torch.FloatTensor([t_now])
                        ti = ti.to(device)
                        ti = ti.view(1, 1).repeat(self.num_type, 1)
                        ci = torch.LongTensor([ci])
                        ci = ci.to(device)
                        ci = ci.view(1, 1).repeat(self.num_type, 1)
                        if memory_size > sample_dict['cjs'].size(1):
                            # print(sample_dict['cjs'].size())
                            # print(ci.size())
                            sample_dict['cjs'] = torch.cat(
                                [sample_dict['cjs'], ci], dim=1)
                            sample_dict['tjs'] = torch.cat(
                                [sample_dict['tjs'], ti], dim=1)
                        else:
                            sample_dict['cjs'] = torch.cat(
                                [sample_dict['cjs'][:, -memory_size + 1:], ci],
                                dim=1)
                            sample_dict['tjs'] = torch.cat(
                                [sample_dict['tjs'][:, -memory_size + 1:], ti],
                                dim=1)
                        if FCs is not None:
                            sample_dict['fcjs'] = FCs[sample_dict['cjs'], :]
                            sample_dict['fcjs'] = torch.transpose(
                                sample_dict['fcjs'], 1, 2)
                else:
                    ti = sample_dict['ti'].cpu().numpy()
                    t_now = ti[0, 0]  # float

            if i % 500 == 0:
                logger.info(
                    'Sequence {}/{} has been generated... Time={:.2f}sec.'.
                    format(i, len(new_data['sequences']),
                           time.time() - t_start))
            times_tmp = np.asarray(times_tmp)
            events_tmp = np.asarray(events_tmp)
            new_data['sequences'][i]['times'] = times_tmp
            new_data['sequences'][i]['events'] = events_tmp
        return new_data, counts
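The while-loop above is essentially Ogata-style thinning: candidate points are drawn from an exponential with the current intensity bound, and each candidate is accepted with probability lambda(t)/bound. The standalone sketch below shows that accept/reject step for a single hand-written intensity; it is illustrative only and not part of PoPPy.

    import numpy as np

    def thinning(intensity, t_start, t_stop, lambda_bar):
        """Simulate one sequence from intensity(t), assuming intensity(t) <= lambda_bar."""
        t, events = t_start, []
        while t < t_stop:
            t += np.random.exponential(1.0 / lambda_bar)  # candidate point
            if t >= t_stop:
                break
            if np.random.rand() < intensity(t) / lambda_bar:
                events.append(t)                          # accept with prob intensity(t)/lambda_bar
        return np.asarray(events)

    # toy example: an exponentially decaying intensity bounded above by 2.0
    seq = thinning(lambda t: 2.0 * np.exp(-0.5 * t), 0.0, 10.0, 2.0)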
Code Example #17
def build_dict_mimic3(diagnose_dict_path: str, diagnose_adm_path: str,
                      procedure_dict_path: str, procedure_adm_path: str,
                      min_count: int):
    """
    This function builds the icd code database

    Args:
        diagnose_dict_path: the path of the diagnosis icd code list (csv)
        diagnose_adm_path: the full path of the admission diagnosis csv file
        procedure_dict_path: the path of the procedure icd code list (csv)
        procedure_adm_path: the full path of the admission procedure csv file
        min_count: the minimum counts of ICD code

    Returns:
        database = {src_index: the dictionary mapping diagnose ICD code to index
                    src_title: the dictionary mapping diagnose ICD code to its description
                    tar_index: the dictionary mapping procedure ICD code to index
                    tar_title: the dictionary mapping procedure ICD code to its description
                    src_interactions: the diagnose pairs
                    tar_interactions: the procedure pairs
                    mutual_interactions: the list containing the admission with diseases and procedures
                    }

    """
    df_diagnose = pd.read_csv(
        diagnose_adm_path)  # , encoding="ISO-8859-1")#"utf8")
    diag_counts = df_diagnose['ICD9_CODE'].value_counts()
    diag2idx = {}
    idx = 0
    for icd in diag_counts.keys():
        if diag_counts[icd] > min_count:
            diag2idx[str(icd)] = idx
            idx += 1

    df_procedure = pd.read_csv(
        procedure_adm_path)  # , encoding="ISO-8859-1")#"utf8")
    proc_counts = df_procedure['ICD9_CODE'].value_counts()
    proc2idx = {}
    idx = 0
    for icd in proc_counts.keys():
        if proc_counts[icd] > min_count:
            proc2idx[str(icd)] = idx
            idx += 1

    diag2title = {}
    df_diagnose = pd.read_csv(
        diagnose_dict_path)  # , encoding="ISO-8859-1")#"utf8")
    idx = 0
    for i, row in df_diagnose.iterrows():
        icd = str(row['ICD9_CODE'])
        des = str(row['LONG_TITLE'])
        if icd in diag2idx.keys():
            diag2title[icd] = des
            idx += 1
    logger.info('{} kinds of diagnoses are found.'.format(len(diag2idx)))

    proc2title = {}
    df_procedure = pd.read_csv(
        procedure_dict_path)  # , encoding="ISO-8859-1")#"utf8")
    idx = 0
    for i, row in df_procedure.iterrows():
        icd = str(row['ICD9_CODE'])
        des = str(row['LONG_TITLE'])
        if icd in proc2idx.keys():
            proc2title[icd] = des
            idx += 1
    logger.info('{} kinds of procedures are found.'.format(len(proc2idx)))

    diag_adm = {}
    df_diagnose = pd.read_csv(
        diagnose_adm_path)  # , encoding="ISO-8859-1")#"utf8")
    for i, row in df_diagnose.iterrows():
        adm = str(row['HADM_ID'])
        icd = str(row['ICD9_CODE'])
        if icd in diag2idx.keys():
            if adm not in diag_adm.keys():
                diag_adm[adm] = [diag2idx[icd]]
            else:
                diag_adm[adm].append(diag2idx[icd])
        if i % 10000 == 0:
            logger.info('{}/{} rows are processed.'.format(
                i, len(df_diagnose)))
    logger.info('{} diagnose admissions are found.'.format(len(diag_adm)))

    proc_adm = {}
    df_procedure = pd.read_csv(
        procedure_adm_path)  # , encoding="ISO-8859-1")#"utf8")
    for i, row in df_procedure.iterrows():
        adm = str(row['HADM_ID'])
        icd = str(row['ICD9_CODE'])
        if icd in proc2idx.keys():
            if adm not in proc_adm.keys():
                proc_adm[adm] = [proc2idx[icd]]
            else:
                proc_adm[adm].append(proc2idx[icd])
        if i % 10000 == 0:
            logger.info('{}/{} rows are processed.'.format(
                i, len(df_procedure)))
    logger.info('{} procedure admissions are found.'.format(len(proc_adm)))

    diag_w_proc = []
    for adm in diag_adm.keys():
        if adm in proc_adm.keys():
            diag_w_proc.append([diag_adm[adm], proc_adm[adm]])

    database = {
        'src_index': diag2idx,
        'src_title': diag2title,
        'tar_index': proc2idx,
        'tar_title': proc2title,
        'src_interactions': diag_adm,
        'tar_interactions': proc_adm,
        'mutual_interactions': diag_w_proc
    }
    return database
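A hedged call sketch for build_dict_mimic3(). The csv file names below are placeholders for the usual MIMIC-III dictionary and admission tables, and min_count is an illustrative threshold.

    database = build_dict_mimic3(
        diagnose_dict_path='D_ICD_DIAGNOSES.csv',
        diagnose_adm_path='DIAGNOSES_ICD.csv',
        procedure_dict_path='D_ICD_PROCEDURES.csv',
        procedure_adm_path='PROCEDURES_ICD.csv',
        min_count=50)
    print(len(database['src_index']), 'diagnosis codes;',
          len(database['tar_index']), 'procedure codes')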
Code Example #18
File: DataIO.py  Project: nicktianboli/PoPPy
def load_seq_labels_csv(file_name: str, seq_domain: str, domain_dict: Dict,
                        database: Dict):
    """
    load sequences' features from a csv file
    :param file_name: the path and the name of the csv file
    :param seq_domain: the name of the key column corresponding to sequence index.
    :param domain_dict: a dictionary containing the name of the key column corresponding to the labels.
        The format should be
            domain_dict = {'domain_name': domain's feature type}
        The dictionary should only contain one key.
        If multiple keys are provided, only the first one is considered.

        Two types are considered:
        1) 'numerical': each element (row) in the corresponding domain should be a string containing D numbers
            separated by spaces, and D should be the same for various elements.
            D-dimensional real-value labels will be generated for this domain.
            If each sequence has multiple rows, the average of the labels will be recorded.

        2) 'categorical': each element (row) in the corresponding domain should be a string containing N keywords.
            N-dimensional categorical label will be generated for this domain.
            If each sequence has multiple rows, the aggregation of the categories will be recorded.

    :param database: a dictionary of data generated by the function "load_sequences_csv()"
    :return: a database having sequences' labels
    """
    df = pd.read_csv(file_name)
    num_seq = len(database['seq2idx'])
    # initialize features
    keys = list(domain_dict.keys())
    label_domain = keys[0]
    if len(keys) > 1:
        logger.warning(
            "{} label domains are found. Only the first domain '{}' is used to generate labels."
            .format(len(keys), label_domain))

    features = {}
    counts = {}
    features[label_domain] = None

    logger.info('Start to generate sequence labels...')
    start = time.time()
    for i, row in df.iterrows():
        seq_name = str(row[seq_domain])
        if seq_name not in database['seq2idx'].keys():
            logger.warning(
                "'{}' is a new sequence not appearing in current database.".
                format(seq_name))
            logger.warning("It will be ignored in the process.")
        else:
            seq_idx = database['seq2idx'][seq_name]
            elements = str(row[label_domain])
            if domain_dict[label_domain] == 'numerical':
                elements = np.asarray(list(map(float, elements.split())))
                dim = elements.shape[0]
                if features[label_domain] is None:
                    features[label_domain] = np.zeros((dim, num_seq))
                    features[label_domain][:, seq_idx] = elements
                    counts[label_domain] = np.zeros((1, num_seq))
                    counts[label_domain][0, seq_idx] = 1
                else:
                    features[label_domain][:, seq_idx] += elements
                    counts[label_domain][0, seq_idx] += 1

            elif domain_dict[label_domain] == 'categorical':
                elements = elements.split()
                if features[label_domain] is None:
                    features[label_domain] = {}
                    features[label_domain][seq_idx] = elements
                    counts[label_domain] = {}
                    element_idx = 0
                else:
                    if seq_idx not in features[label_domain].keys():
                        features[label_domain][seq_idx] = elements
                    else:
                        features[label_domain][seq_idx].extend(elements)
                for element in elements:
                    if element not in counts[label_domain].keys():
                        counts[label_domain][element] = element_idx
                        element_idx += 1
            else:
                logger.warning(
                    'Undefined feature type for the domain {}.'.format(
                        label_domain))
                logger.warning("It will be ignored in the process.")
        if i % 1000 == 0:
            logger.info('{} rows have been processed... Time={}ms.'.format(
                i, round(1000 * (time.time() - start))))

    # post-process of features
    start = time.time()
    if domain_dict[label_domain] == 'numerical':
        features_tmp = features[label_domain]
        features_tmp = features_tmp / np.tile(
            counts[label_domain], (features[label_domain].shape[0], 1))
        for seq_idx in range(features_tmp.shape[1]):
            database['sequences'][seq_idx]['label'] = features_tmp[:, seq_idx]

    elif domain_dict[label_domain] == 'categorical':
        for seq_idx in features[label_domain].keys():
            elements = list(set(features[label_domain][seq_idx]))
            feature_tmp = []
            for element in elements:
                element_idx = counts[label_domain][element]
                feature_tmp.append(element_idx)
            feature_tmp = np.asarray(feature_tmp, dtype=int)
            database['sequences'][seq_idx]['label'] = feature_tmp
    else:
        logger.warning('Undefined label type for the domain {}.'.format(
            domain_dict[label_domain]))
        logger.warning("It will be ignored in the process.")
    logger.info("Labels of domain '{}' is generated... Time={}ms.".format(
        domain_dict[label_domain], round(1000 * (time.time() - start))))

    return database
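A hedged usage sketch for load_seq_labels_csv(). The csv file and column names are illustrative, and `database` is assumed to come from load_sequences_csv() as the docstring states.

    # labels.csv is assumed to have a sequence-id column 'seq_id' and a label column 'outcome'
    database = load_seq_labels_csv(file_name='labels.csv',
                                   seq_domain='seq_id',
                                   domain_dict={'outcome': 'categorical'},
                                   database=database)
    print(database['sequences'][0]['label'])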
Code Example #19
File: HawkesProcess.py  Project: zehsilva/PoPPy
    def fit_ot(self, dataloader, optimizer, epochs: int,
               trans: torch.Tensor, mu_t: torch.Tensor, A_t: torch.Tensor, p_s: torch.Tensor, p_t: torch.Tensor,
               sample_dict1, sample_dict2, gamma, alpha,
               scheduler=None, sparsity: float=None, nonnegative=None,
               use_cuda: bool=False, validation_set=None):
        """
        Learn parameters of a generalized Hawkes process given observed sequences
        :param dataloader: a pytorch batch-based data loader
        :param optimizer: the sgd optimization method defined by PyTorch
        :param epochs: the number of training epochs
        :param trans: fixed optimal transport
        :param mu_t: base intensity of target Hawkes process
        :param A_t: infectivity of target Hawkes process
        :param p_s: the distribution of event types in source Hawkes process
        :param p_t: the distribution of event types in target Hawkes process
        :param sample_dict1: the input dictionary used to evaluate the exogenous (base) intensity
        :param sample_dict2: the input dictionary used to evaluate the Granger-causality (infectivity) matrix
        :param gamma: the weight of the optimal-transport regularizer
        :param alpha: the trade-off between the Wasserstein term (d_w) and the Gromov-Wasserstein term (d_gw)
        :param scheduler: the method adjusting the learning rate of SGD defined by PyTorch
        :param sparsity: None or a float weight of L1 regularizer
        :param nonnegative: None or a float lower bound, typically the lower bound = 0
        :param use_cuda: use cuda (true) or not (false)
        :param validation_set: None or a validation dataloader
        """
        device = torch.device('cuda:0' if use_cuda else 'cpu')
        self.lambda_model.to(device)
        best_model = None
        self.lambda_model.train()

        if nonnegative is not None:
            clipper = LowerBoundClipper(nonnegative)

        Cs = torch.LongTensor(list(range(len(dataloader.dataset.database['type2idx']))))
        Cs = Cs.view(-1, 1)
        Cs = Cs.to(device)

        if dataloader.dataset.database['event_features'] is not None:
            all_event_feature = torch.from_numpy(dataloader.dataset.database['event_features'])
            FCs = all_event_feature.type(torch.FloatTensor)
            FCs = torch.t(FCs)    # (num_type, dim_features)
            FCs = FCs.to(device)
        else:
            FCs = None

        if validation_set is not None:
            validation_loss = self.validation(validation_set, use_cuda)
            logger.info('In the beginning, validation loss per event: {:.6f}.\n'.format(validation_loss))
            best_loss = validation_loss
        else:
            best_loss = np.inf

        for epoch in range(epochs):
            if scheduler is not None:
                scheduler.step()
            start = time.time()
            for batch_idx, samples in enumerate(dataloader):
                ci, batch_dict = samples2dict(samples, device, Cs, FCs)
                optimizer.zero_grad()
                lambda_t, Lambda_t = self.lambda_model(batch_dict)
                loss = self.loss_function(lambda_t, Lambda_t, ci) / lambda_t.size(0)
                reg = 0
                if sparsity is not None:
                    for parameter in self.lambda_model.parameters():
                        reg += sparsity * torch.sum(torch.abs(parameter))

                base_intensity = self.lambda_model.exogenous_intensity.intensity(sample_dict1)
                infectivity = self.lambda_model.endogenous_intensity.granger_causality(sample_dict2).squeeze(2)
                d_gw = self.dgw(infectivity, A_t, trans, p_s, p_t)
                d_w = self.dw(base_intensity, mu_t, trans, p_s, p_t)
                loss_total = loss + reg + gamma * (alpha*d_w + (1-alpha)*d_gw)
                loss_total.backward()
                optimizer.step()
                if nonnegative is not None:
                    self.lambda_model.apply(clipper)

                # display training processes
                if batch_idx % 100 == 0:
                    logger.info('Train Epoch: {} [{}/{} ({:.0f}%)]'.format(
                        epoch, batch_idx * ci.size(0), len(dataloader.dataset), 100. * batch_idx / len(dataloader)))
                    if sparsity is not None:
                        logger.info('Loss per event: {:.6f}, Regularizer: {:.6f} Time={:.2f}sec'.format(
                            loss.data, reg.data, time.time() - start))
                    else:
                        logger.info('Loss per event: {:.6f}, Regularizer: {:.6f} Time={:.2f}sec'.format(
                            loss.data, 0, time.time() - start))

            if validation_set is not None:
                validation_loss = self.validation(validation_set, use_cuda)
                logger.info('After Epoch: {}, validation loss per event: {:.6f}.\n'.format(epoch, validation_loss))
                if validation_loss < best_loss:
                    best_model = copy.deepcopy(self.lambda_model)
                    best_loss = validation_loss

        if best_model is not None:
            self.lambda_model = copy.deepcopy(best_model)
Code Example #20
    def fit(self,
            dataloader,
            optimizer,
            epochs: int,
            scheduler=None,
            sparsity: float = None,
            nonnegative=None,
            use_cuda: bool = False,
            validation_set=None):
        """
        Learn parameters of a generalized Hawkes process given observed sequences
        :param dataloader: a pytorch batch-based data loader
        :param optimizer: the sgd optimization method
        :param epochs: the number of training epochs
        :param scheduler: the method adjusting the learning rate of SGD
        :param sparsity: None or a float weight of L1 regularizer
        :param nonnegative: None or a float lower bound
        :param use_cuda: use cuda (true) or not (false)
        :param validation_set: None or a validation dataloader
        """
        device = torch.device('cuda:0' if use_cuda else 'cpu')
        self.lambda_model.to(device)
        self.responsibility = self.responsibility.to(device)
        self.prob_cluster = self.prob_cluster.to(device)
        best_model = None
        self.lambda_model.train()

        if nonnegative is not None:
            clipper = LowerBoundClipper(nonnegative)

        Cs = torch.LongTensor(
            list(range(len(dataloader.dataset.database['type2idx']))))
        Cs = Cs.view(-1, 1)
        Cs = Cs.to(device)

        if dataloader.dataset.database['event_features'] is not None:
            all_event_feature = torch.from_numpy(
                dataloader.dataset.database['event_features'])
            FCs = all_event_feature.type(torch.FloatTensor)
            FCs = torch.t(FCs)  # (num_type, dim_features)
            FCs = FCs.to(device)
        else:
            FCs = None

        if validation_set is not None:
            validation_loss = self.validation(validation_set, use_cuda)
            logger.info(
                'In the beginning, validation loss per event: {:.6f}.\n'.
                format(validation_loss))
            best_loss = validation_loss
        else:
            best_loss = np.inf

        # EM algorithm
        for epoch in range(epochs):
            if scheduler is not None:
                scheduler.step()
            start = time.time()

            log_weight = self.prob_cluster.log().view(1,
                                                      self.num_cluster).repeat(
                                                          self.num_sequence, 1)
            log_responsibility = 0 * self.responsibility
            num_responsibility = 0 * self.responsibility
            log_responsibility = log_responsibility.to(device)
            num_responsibility = num_responsibility.to(device)
            for batch_idx, samples in enumerate(dataloader):
                ci, batch_dict = samples2dict(samples, device, Cs, FCs)
                optimizer.zero_grad()
                loss = 0
                for m in range(self.num_cluster):
                    weight = self.responsibility[batch_dict['sn'][:, 0],
                                                 m]  # (batch_size, )
                    lambda_t, Lambda_t = self.lambda_model[m](batch_dict)
                    loss_m = self.loss_function(lambda_t, Lambda_t,
                                                ci)  # (batch_size, )
                    loss += (weight * loss_m).sum() / loss_m.size(0)
                    for i in range(loss_m.size(0)):
                        sn = batch_dict['sn'][i, 0]
                        log_responsibility[sn, m] += loss_m.data[i]
                        num_responsibility[sn, m] += 1

                reg = 0
                if sparsity is not None:
                    for parameter in self.lambda_model.parameters():
                        reg += sparsity * torch.sum(torch.abs(parameter))
                loss_total = loss + reg
                loss_total.backward()
                optimizer.step()
                if nonnegative is not None:
                    self.lambda_model.apply(clipper)

                # display training processes
                if batch_idx % 100 == 0:
                    logger.info('Train Epoch: {} [{}/{} ({:.0f}%)]'.format(
                        epoch, batch_idx * ci.size(0), len(dataloader.dataset),
                        100. * batch_idx / len(dataloader)))
                    if sparsity is not None:
                        logger.info(
                            'Loss per event: {:.6f}, Regularizer: {:.6f} Time={:.2f}sec'
                            .format(loss.data, reg.data,
                                    time.time() - start))
                    else:
                        logger.info(
                            'Loss per event: {:.6f}, Regularizer: {:.6f} Time={:.2f}sec'
                            .format(loss.data, 0,
                                    time.time() - start))
                    logger.info('Distribution of clusters')
                    for m in range(self.num_cluster):
                        logger.info('Cluster {}, prob={:.6f}'.format(
                            m, self.prob_cluster[m]))

            # update responsibility
            log_responsibility /= (num_responsibility + 1e-7)
            self.responsibility = F.softmax(log_responsibility + log_weight,
                                            dim=1)
            self.prob_cluster = self.responsibility.sum(0)
            self.prob_cluster = self.prob_cluster / self.prob_cluster.sum()

            if validation_set is not None:
                validation_loss = self.validation(validation_set, use_cuda)
                logger.info(
                    'After Epoch: {}, validation loss per event: {:.6f}.\n'.
                    format(epoch, validation_loss))
                if validation_loss < best_loss:
                    best_model = copy.deepcopy(self.lambda_model)
                    best_loss = validation_loss

        if best_model is not None:
            self.lambda_model = copy.deepcopy(best_model)
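
At the end of each epoch, the loop above averages the accumulated per-sequence, per-cluster terms and renormalizes them against the cluster priors. Below is a minimal standalone sketch of that E-step/M-step update with made-up numbers; every name is local to the sketch and not part of the class.

import torch
import torch.nn.functional as F

num_sequence, num_cluster = 3, 2
prob_cluster = torch.tensor([0.6, 0.4])                       # current cluster priors
log_responsibility = torch.randn(num_sequence, num_cluster)   # accumulated per-sequence, per-cluster terms
num_updates = torch.full((num_sequence, num_cluster), 5.0)    # how many terms were accumulated

log_weight = prob_cluster.log().view(1, num_cluster).repeat(num_sequence, 1)
log_responsibility = log_responsibility / (num_updates + 1e-7)         # average the accumulated terms
responsibility = F.softmax(log_responsibility + log_weight, dim=1)     # E-step: per-sequence cluster posterior
prob_cluster = responsibility.sum(0)
prob_cluster = prob_cluster / prob_cluster.sum()                        # M-step: update the cluster priors
print(responsibility, prob_cluster)
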
Code Example #21
File: DataOperation.py Project: xiaotinghe/PoPPy
def data_info(database):
    """
    Print basic information of the given database
    :param database: the database with the format described above
    """
    logger.info('** Statistics of Target Database **')
    logger.info('- The number of event types = {}.'.format(
        len(database['type2idx'])))
    logger.info('- The number of sequences = {}.'.format(
        len(database['seq2idx'])))
    if database['event_features'] is not None:
        logger.info(
            '- Each event has a feature vector with dimension {}.'.format(
                database['event_features'].shape[1]))
    else:
        logger.info('- Event feature is None.')

    if database['sequences'][0]['seq_feature'] is not None:
        logger.info(
            '- Each sequence has a feature vector with dimension {}.'.format(
                database['sequences'][0]['seq_feature'].shape[0]))
    else:
        logger.info('- Sequence feature is None.')

    N_max = 0
    N_min = np.inf
    N_mean = 0
    for i in range(len(database['sequences'])):
        num_event = database['sequences'][i]['events'].shape[0]
        N_mean += num_event
        if num_event < N_min:
            N_min = num_event
        if num_event > N_max:
            N_max = num_event
    N_mean /= len(database['sequences'])
    logger.info('- The longest sequence has {} events.'.format(N_max))
    logger.info('- The shortest sequence has {} events.'.format(N_min))
    logger.info(
        '- The average number of events per sequence is {:.2f}.'.format(
            N_mean))
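
A hypothetical call to data_info() on a two-type, one-sequence toy database; the dictionary layout follows the docstrings in this file, and the commented import path is only an assumption.

import numpy as np
# from preprocess.DataOperation import data_info  # assumed import path; adjust to your layout

toy_database = {
    'event_features': None,
    'type2idx': {'click': 0, 'buy': 1},
    'idx2type': {0: 'click', 1: 'buy'},
    'seq2idx': {'user_a': 0},
    'idx2seq': {0: 'user_a'},
    'sequences': [{
        'times': np.array([0.5, 1.2, 3.0]),
        'events': np.array([0, 0, 1]),
        'seq_feature': None,
        't_start': 0.0,
        't_stop': 3.1,
        'label': None,
    }],
}
data_info(toy_database)  # logs type/sequence counts and sequence-length statistics
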
Code Example #22
File: DataOperation.py Project: xiaotinghe/PoPPy
    def __init__(self, database, memorysize: int = None):
        """
        :param database: the observed event sequences
            database = {'event_features': None or (De, C) float array of event's static features,
                                      C is the number of event types.
                        'type2idx': a Dict = {'event_name': event_index}
                        'idx2type': a Dict = {event_index: 'event_name'}
                        'seq2idx': a Dict = {'seq_name': seq_index}
                        'idx2seq': a Dict = {seq_index: 'seq_name'}
                        'sequences': a List = [seq_1, seq_2, ..., seq_N].
                        }

            For the i-th sequence:
            seq_i = {'times': (N,) float array of timestamps, N is the number of events.
                     'events': (N,) int array of event types.
                     'seq_feature': None or (Ds,) float array of sequence's static feature.
                     't_start': a float number indicating the start timestamp of the sequence.
                     't_stop': a float number indicating the stop timestamp of the sequence.
                     'label': None or int number indicating the labels of the sequence}
        :param memorysize: how many historical events are remembered for each event
            When memorysize = None
                All events in a sequence will be considered.
                In that case, each batch can only contain one sequence, because different sequences may have
                different lengths.
            When memorysize = K
                Only the last K events of each sequence are kept.
                For sequences with fewer than K events, the beginning of the sequence is padded with placeholder
                events.
        """
        self.event_cell = []
        self.time_cell = []
        self.database = database
        self.memory_size = memorysize
        if self.memory_size is None:
            logger.warning(
                "Because memory size is not given, the sampler can only sample 1 sequence per batch."
            )
            logger.warning("Please set batch size = 1 in your code.")

        for i in range(len(database['sequences'])):
            seq_i = database['sequences'][i]
            times = seq_i['times']
            events = seq_i['events']
            t_start = seq_i['t_start']
            target = seq_i['label']
            target_t = seq_i['t_stop']
            if self.memory_size is None:
                former = events
                former_t = times
            else:
                # former = np.zeros((memorysize,), dtype=np.int)
                # former = np.random.permutation(len(self.database['type2idx']))
                # former = former[:memorysize]
                former = np.random.choice(len(self.database['type2idx']),
                                          memorysize)
                former_t = t_start * np.ones((memorysize, ))

                if 0 < times.shape[0] < memorysize:
                    # only the tail holds the real (shorter) history
                    former[-times.shape[0]:] = events
                    former_t[-times.shape[0]:] = times
                else:
                    former = events[-memorysize:]
                    former_t = times[-memorysize:]

            self.event_cell.append((target, former, i))
            self.time_cell.append((target_t, former_t))
        logger.info('In this dataset, the number of sequences = {}.'.format(
            len(self.event_cell)))
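
The memory-size handling in __init__() can be exercised in isolation. The values below are made up and the snippet is independent of the dataset class; it only illustrates how a short sequence is padded and a long one truncated.

import numpy as np

memorysize = 5                              # K
num_types = 4
times = np.array([0.3, 0.9, 1.7])           # a short sequence with N = 3 < K
events = np.array([2, 1, 3])
t_start = 0.0

former = np.random.choice(num_types, memorysize)   # placeholder history
former_t = t_start * np.ones((memorysize,))
if 0 < times.shape[0] < memorysize:
    former[-times.shape[0]:] = events               # real events occupy the tail
    former_t[-times.shape[0]:] = times
else:
    former = events[-memorysize:]                   # keep only the last K events
    former_t = times[-memorysize:]
print(former, former_t)
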
Code Example #23
File: DataOperation.py Project: xiaotinghe/PoPPy
def stitching(database1: Dict,
              database2: Dict,
              method: str = 'random') -> Dict:
    """
    Stitch each sequence in database2 to the end of one sequence of database1
    :param database1: the observed event sequences
    :param database2: another observed event sequences
        database = {'event_features': None or (De, C) float array of event's static features,
                                  C is the number of event types.
                    'type2idx': a Dict = {'event_name': event_index}
                    'idx2type': a Dict = {event_index: 'event_name'}
                    'seq2idx': a Dict = {'seq_name': seq_index}
                    'idx2seq': a Dict = {seq_index: 'seq_name'}
                    'sequences': a List = [seq_1, seq_2, ..., seq_N].
                    }

        For the i-th sequence:
        seq_i = {'times': (N,) float array of timestamps, N is the number of events.
                 'events': (N,) int array of event types.
                 'seq_feature': None or (Ds,) float array of sequence's static feature.
                 't_start': a float number indicating the start timestamp of the sequence.
                 't_stop': a float number indicating the stop timestamp of the sequence.
                 'label': None or int/float number indicating the labels of the sequence}

    :param method: a string indicating the stitching method:
        "random": stitch seq_j in sequences2 to seq_i in sequences1 for j ~ {1,...,N}, i = 1,...,N, with
                  time-shifting applied to sequences2.
                  This method is suitable for sequences generated by the same stationary point process.

        "feature": stitch seq_j in sequences2 to seq_i in sequences1 for j ~ {1,...,N}, i = 1,...,N, where
                   j is sampled according to the similarity between the two sequences.
                   The similarity is computed with Gaussian kernels of seq_features, labels and timestamps.
                   When seq_features/labels are not available, only timestamp information is taken into account.

    :return:
        the output sequences are with the same format as database1.
    """
    start = time.time()
    output = copy.deepcopy(database1)
    if database1['type2idx'] == database2['type2idx']:
        if method is None or method == 'random':
            logger.info('random stitching is applied...')
            index = np.random.permutation(
                len(database2['sequences']))  # random permutation of the index of sequences

            for i in range(len(database1['sequences'])):
                seq_i = database1['sequences'][i]
                j = i % len(database2['sequences'])
                seq_j = database2['sequences'][index[j]]

                # concatenate two timestamp arrays with time shifting
                times1 = seq_i['times']
                times2 = seq_j['times'] - seq_j['t_start'] + seq_i['t_stop']
                output['sequences'][i]['times'] = np.concatenate(
                    (times1, times2), axis=0)

                # concatenate two event arrays
                output['sequences'][i]['events'] = np.concatenate(
                    (seq_i['events'], seq_j['events']), axis=0)

                # update stop timestamp
                output['sequences'][i]['t_stop'] = seq_i['t_stop'] + seq_j[
                    't_stop'] - seq_j['t_start']

                # update features
                if seq_i['seq_feature'] is not None and seq_j[
                        'seq_feature'] is not None:
                    output['sequences'][i]['seq_feature'] = (
                        seq_i['seq_feature'] + seq_j['seq_feature']) / 2

                if i % 1000 == 0:
                    logger.info(
                        '{} sequences have been stitched... Time={}ms.'.format(
                            i, round(1000 * (time.time() - start))))

        elif method == 'feature':
            logger.info('feature-based stitching is applied...')

            for i in range(len(database1['sequences'])):
                prob = np.zeros((len(database2['sequences']), ))
                seq_i = database1['sequences'][i]

                for j in range(len(database2['sequences'])):
                    seq_j = database2['sequences'][j]

                    if seq_j['t_start'] > seq_i['t_stop']:
                        # consider temporal order
                        weight = np.exp(-(seq_j['t_start'] -
                                          seq_i['t_stop'])**2)
                        # consider feature similarity
                        if seq_i['seq_feature'] is not None and seq_j[
                                'seq_feature'] is not None:
                            weight *= np.exp(
                                -np.linalg.norm(seq_i['seq_feature'] -
                                                seq_j['seq_feature'])**2)
                        # consider label consistency
                        if seq_i['label'] is not None and seq_j[
                                'label'] is not None:
                            if seq_i['label'] != seq_j['label']:
                                weight = 0
                    else:
                        weight = 0

                    prob[j] = weight

                # sampling a sequence from database2
                if np.sum(prob) > 0:
                    prob = prob / np.sum(prob)
                else:
                    prob = np.ones((len(database2['sequences']), )) / len(
                        database2['sequences'])

                j = np.random.choice(len(database2['sequences']), p=prob)
                seq_j = database2['sequences'][j]

                # concatenate two timestamp arrays with time shifting
                times1 = seq_i['times']
                times2 = seq_j['times'] - seq_j['t_start'] + seq_i['t_stop']
                output['sequences'][i]['times'] = np.concatenate(
                    (times1, times2), axis=0)

                # concatenate two event arrays
                output['sequences'][i]['events'] = np.concatenate(
                    (seq_i['events'], seq_j['events']), axis=0)

                # update stop timestamp
                output['sequences'][i]['t_stop'] = seq_i['t_stop'] + seq_j[
                    't_stop'] - seq_j['t_start']

                # update features
                if seq_i['seq_feature'] is not None and seq_j[
                        'seq_feature'] is not None:
                    output['sequences'][i]['seq_feature'] = (
                        seq_i['seq_feature'] + seq_j['seq_feature']) / 2

                if i % 1000 == 0:
                    logger.info(
                        '{} sequences have been stitched... Time={}ms.'.format(
                            i, round(1000 * (time.time() - start))))
        else:
            logger.warning('You need to define your own stitching method... '
                           'The function returns the first database.')
    else:
        logger.warning('The two databases do not have the same event types... '
                       'The function returns the first database.')

    return output
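
For the 'feature' mode, the unnormalized sampling weight of a single candidate pair can be written out on its own. The numbers below are invented for illustration and mirror the kernels used in the function above.

import numpy as np

t_stop_i, t_start_j = 10.0, 10.5             # end of the target sequence, start of the candidate
feat_i = np.array([0.2, 0.8])                # made-up sequence features
feat_j = np.array([0.1, 0.7])
label_i, label_j = 1, 1

weight = 0.0
if t_start_j > t_stop_i:                                       # respect temporal order
    weight = np.exp(-(t_start_j - t_stop_i) ** 2)              # time-gap kernel
    weight *= np.exp(-np.linalg.norm(feat_i - feat_j) ** 2)    # feature-similarity kernel
    if label_i != label_j:                                     # label consistency
        weight = 0.0
print(weight)
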
Code Example #24
File: DataIO.py Project: nicktianboli/PoPPy
def load_sequences_csv(file_name: str, domain_names: Dict, upperlimit=None):
    """
    Load event sequences from a csv file
    :param file_name: the path and name of the target csv file
    :param domain_names: a dictionary contains the names of the key columns
                         corresponding to {'seq_id', 'time', 'event'}
        The format should be
        domain_names = {'seq_id': the column name of sequence name,
                        'time': the column name of timestamps,
                        'event': the column name of events}
    :param upperlimit: if not None, only the first upperlimit rows of the csv file are loaded
    :return: database: a dictionary containing observed event sequences
        database = {'event_features': None,
                    'type2idx': a Dict = {'event_name': event_index}
                    'idx2type': a Dict = {event_index: 'event_name'}
                    'seq2idx': a Dict = {'seq_name': seq_index}
                    'idx2seq': a Dict = {seq_index: 'seq_name'}
                    'sequences': a List  = [seq_1, seq_2, ..., seq_N].
                    }

        For the i-th sequence:
        seq_i = {'times': (N,) float array of timestamps, N is the number of events.
                 'events': (N,) int array of event types.
                 'seq_feature': None.
                 't_start': a float number, the start timestamp of the sequence.
                 't_stop': a float number, the stop timestamp of the sequence.
                 'label': None
                 }
    """
    database = {
        'event_features': None,
        'type2idx': None,
        'idx2type': None,
        'seq2idx': None,
        'idx2seq': None,
        'sequences': []
    }

    if upperlimit is not None:
        df = pd.read_csv(file_name).iloc[:upperlimit]
    else:
        df = pd.read_csv(file_name)
    type2idx = {}
    idx2type = {}
    seq2idx = {}
    idx2seq = {}

    logger.info('Count the number of sequences...')
    start = time.time()
    seq_idx = 0
    type_idx = 0
    for i, row in df.iterrows():
        seq_name = str(row[domain_names['seq_id']])
        event_type = str(row[domain_names['event']])
        if seq_name not in seq2idx.keys():
            seq2idx[seq_name] = seq_idx
            seq = {
                'times': [],
                'events': [],
                'seq_feature': None,
                't_start': 0.0,
                't_stop': 0.0,
                'label': None
            }
            database['sequences'].append(seq)
            seq_idx += 1

        if event_type not in type2idx.keys():
            type2idx[event_type] = type_idx
            type_idx += 1

        if i % 10000 == 0:
            logger.info('{} events have been processed... Time={}ms.'.format(
                i, round(1000 * (time.time() - start))))

    logger.info(
        'Done! {} sequences with {} event types are found in {}ms'.format(
            seq_idx, type_idx, round(1000 * (time.time() - start))))

    logger.info('Build proposed database for the sequences...')
    start2 = time.time()
    for seq_name in seq2idx.keys():
        seq_idx = seq2idx[seq_name]
        idx2seq[seq_idx] = seq_name

    for event_type in type2idx.keys():
        type_idx = type2idx[event_type]
        idx2type[type_idx] = event_type

    database['type2idx'] = type2idx
    database['idx2type'] = idx2type
    database['seq2idx'] = seq2idx
    database['idx2seq'] = idx2seq

    for i, row in df.iterrows():
        seq_name = str(row[domain_names['seq_id']])
        timestamp = float(row[domain_names['time']])
        event_type = str(row[domain_names['event']])

        seq_idx = database['seq2idx'][seq_name]
        type_idx = database['type2idx'][event_type]
        database['sequences'][seq_idx]['times'].append(timestamp)
        database['sequences'][seq_idx]['events'].append(type_idx)

        if i % 10000 == 0:
            logger.info('{} events have been processed... Time={}ms.'.format(
                i, round(1000 * (time.time() - start2))))
    logger.info('Done! {} sequences are built in {}ms'.format(
        len(database['seq2idx']), round(1000 * (time.time() - start2))))

    logger.info('Format transformation...')
    for n in range(len(database['sequences'])):
        database['sequences'][n]['t_start'] = database['sequences'][n][
            'times'][0]
        database['sequences'][n][
            't_stop'] = database['sequences'][n]['times'][-1] + 1e-2
        database['sequences'][n]['times'] = np.asarray(
            database['sequences'][n]['times'])
        database['sequences'][n]['events'] = np.asarray(
            database['sequences'][n]['events'])
        if n % 1000 == 0:
            logger.info(
                '{} sequences have been processed... Time={}ms.'.format(
                    n, round(1000 * (time.time() - start))))
    logger.info('Done! The database has been built in {}ms'.format(
        round(1000 * (time.time() - start))))

    return database
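
A hedged usage sketch: write a tiny CSV with hypothetical column names and load it. Only the mapping passed via domain_names matters to the loader, and the commented import path is an assumption based on the file name above.

import pandas as pd
# from preprocess.DataIO import load_sequences_csv  # assumed import path; adjust to your layout

pd.DataFrame({
    'id':    ['u1', 'u1', 'u2'],
    'time':  [0.5, 1.2, 0.1],
    'event': ['click', 'buy', 'click'],
}).to_csv('toy_events.csv', index=False)

database = load_sequences_csv(
    'toy_events.csv',
    domain_names={'seq_id': 'id', 'time': 'time', 'event': 'event'})
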
Code Example #25
File: HawkesProcess.py Project: zehsilva/PoPPy
 def print_info(self):
     logger.info('A generalized Hawkes process intensity:')
     logger.info('Intensity function lambda(t) = {}'.format(self.intensity_type))
     self.exogenous_intensity.print_info()
     self.endogenous_intensity.print_info()
Code Example #26
File: DataIO.py Project: nicktianboli/PoPPy
def load_event_features_csv(file_name: str,
                            event_domain: str,
                            domain_dict: Dict,
                            database: Dict,
                            normalize: int = 0):
    """
    load events' features from a csv file
    :param file_name: the path and the name of the csv file
    :param event_domain: the name of the key column corresponding to event index.
    :param domain_dict: a dictionary containing the names of the key columns corresponding to the features.
        The format should be
            domain_dict = {'domain_name': domain's feature type}
        Two types are considered:
        1) 'numerical': each element (row) in the corresponding domain should be a string containing D numbers
            separated by spaces, and D should be the same for various elements.
            D-dimensional real-value features will be generated for this domain.
            If each event type has multiple rows, the average of the features will be recorded.

        2) 'categorical': each element (row) in the corresponding domain should be a string containing N keywords
            separated by spaces, but N can be different for various elements.
            D-dimensional binary features will be generated for this domain. Here D is the number of distinguished
            keywords (vocabulary size).
            If each event type has multiple rows, the aggregation of the binary features will be recorded.

    :param database: a dictionary of data generated by the function "load_sequences_csv()"
    :param normalize: 0 = no normalization, 1 = normalization across features, 2 = normalization across event types
    :return: a database having events' features
    """
    df = pd.read_csv(file_name)
    num_event = len(database['type2idx'])

    # initialize features
    features = {}
    counts = {}
    for key in domain_dict.keys():
        features[key] = None
        counts[key] = None

    logger.info('Start to generate sequence features...')
    start = time.time()
    for i, row in df.iterrows():
        event_name = str(row[event_domain])
        if event_name not in database['type2idx'].keys():
            logger.warning(
                "'{}' is a new event type not appearing in current database.".
                format(event_name))
            logger.warning("It will be ignored in the process.")
        else:
            event_idx = database['type2idx'][event_name]
            for key in domain_dict.keys():
                elements = str(row[key])
                if domain_dict[key] == 'numerical':
                    elements = np.asarray(list(map(float, elements.split())))
                    dim = elements.shape[0]
                    if features[key] is None:
                        features[key] = np.zeros((dim, num_event))
                        features[key][:, event_idx] = elements
                        counts[key] = np.zeros((1, num_event))
                        counts[key][0, event_idx] = 1
                        counts[key][0, 0] = 1
                    else:
                        features[key][:, event_idx] += elements
                        counts[key][0, event_idx] += 1

                elif domain_dict[key] == 'categorical':
                    elements = elements.split()
                    if features[key] is None:
                        features[key] = {}
                        features[key][event_idx] = elements
                        counts[key] = {}
                        element_idx = 0
                    else:
                        if event_idx not in features[key].keys():
                            features[key][event_idx] = elements
                        else:
                            features[key][event_idx].extend(elements)
                    for element in elements:
                        if element not in counts[key].keys():
                            counts[key][element] = element_idx
                            element_idx += 1
                else:
                    logger.warning(
                        'Undefined feature type for the domain {}.'.format(
                            key))
                    logger.warning("It will be ignored in the process.")
        if i % 1000 == 0:
            logger.info('{} rows have been processed... Time={}ms'.format(
                i, round(1000 * (time.time() - start))))

    # post-process of features
    features_all = None
    start = time.time()
    for key in domain_dict.keys():
        if domain_dict[key] == 'numerical':
            features_tmp = features[key]
            features_tmp = features_tmp / np.tile(counts[key],
                                                  (features[key].shape[0], 1))
            if features_all is None:
                features_all = features_tmp
            else:
                features_all = np.concatenate((features_all, features_tmp),
                                              axis=0)

        elif domain_dict[key] == 'categorical':
            features_tmp = np.zeros((len(counts[key]), num_event))
            for event_idx in features[key].keys():
                for element in features[key][event_idx]:
                    element_idx = counts[key][element]
                    features_tmp[element_idx, event_idx] += 1
            if features_all is None:
                features_all = features_tmp
            else:
                features_all = np.concatenate((features_all, features_tmp),
                                              axis=0)
        else:
            logger.warning(
                'Undefined feature type for the domain {}.'.format(key))
            logger.warning("It will be ignored in the process.")
        logger.info(
            "Features of domain '{}' have been generated... Time={}ms.".format(
                key, round(1000 * (time.time() - start))))

    if normalize == 1:
        features_all = features_all / \
                       np.tile(np.sum(features_all, axis=0)+1e-8, (features_all.shape[0], 1))
    if normalize == 2:
        features_all = features_all / \
                       np.transpose(np.tile(np.sum(features_all, axis=1)+1e-8, (features_all.shape[1], 1)))
    database['event_features'] = features_all

    return database
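
Continuing the toy example used for load_sequences_csv(), a hedged sketch of attaching event features; the column names ('price' as a numerical domain, 'tags' as a categorical domain) and the commented import path are assumptions, not part of the library.

import pandas as pd
# from preprocess.DataIO import load_event_features_csv  # assumed import path

pd.DataFrame({
    'event': ['click', 'buy'],
    'price': ['0.0', '19.9'],        # one number per row -> a 1-dimensional numerical feature
    'tags':  ['web mobile', 'web'],  # space-separated keywords -> binary features
}).to_csv('toy_event_features.csv', index=False)

database = load_event_features_csv(
    'toy_event_features.csv',
    event_domain='event',
    domain_dict={'price': 'numerical', 'tags': 'categorical'},
    database=database,   # the database returned by load_sequences_csv()
    normalize=1)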