Example #1
    def __init__(self,
                 exogenous_intensity,
                 endogenous_intensity,
                 activation: str = None,
                 prob=1.0):
        super(HawkesProcessIntensity, self).__init__()
        self.exogenous_intensity = exogenous_intensity
        self.endogenous_intensity = endogenous_intensity
        if activation is None:
            self.intensity_type = "exogenous intensity + endogenous impacts"
            self.activation = 'identity'
        else:
            self.intensity_type = "{}(exogenous intensity + endogenous impacts)".format(
                activation)
            self.activation = activation

        if self.activation == 'relu':
            self.act = nn.ReLU()
        elif self.activation == 'softplus':
            # Note: assumes self.num_type has been set elsewhere; it is not assigned in this __init__.
            self.act = nn.Softplus(beta=self.num_type**0.5)
        elif self.activation == 'identity':
            self.act = Identity()
        else:
            logger.warning(
                'The activation layer is {}, which cannot be identified... '.
                format(self.activation))
            logger.warning('Identity activation is applied instead.')
            self.act = Identity()
        self.prob = prob
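
Taken together, the constructor composes the conditional intensity as an activation applied to the sum of the two parts. As a sketch consistent with the intensity_type strings above (f is the chosen activation, mu_k the exogenous intensity, and phi the endogenous impact functions):

    \lambda_k(t) = f\Big(\mu_k(t) + \sum_{t_i < t} \phi_{k,k_i}(t - t_i)\Big),
    \qquad f \in \{\mathrm{identity},\ \mathrm{ReLU},\ \mathrm{softplus}\}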
Example #2
    def integrations(self,
                     t_stop: torch.Tensor,
                     t_start: Optional[torch.Tensor] = None) -> torch.Tensor:
        """
        Calculate the integrations of decay kernel in the interval [t_start, t_stop]
        :param t_stop: a 2D Tensor containing stop timestamps
        :param t_start: a 2D Tensor containing start timestamps, if it is None, it means t_start = 0
        :return:
            gt: the ndarray containing decay kernel's integration values in the interval [t_start, t_stop].
        """
        if t_start is None:
            t_start = torch.zeros_like(t_stop)

        if t_start.size() != t_stop.size():
            logger.warning(
                "t_start does not have the same shape as t_stop; setting t_start to all zeros."
            )
            t_start = torch.zeros_like(t_stop)

        delay = self.parameters[0, 0]
        bandwidth = self.parameters[1, 0]
        w = torch.sqrt(1 / bandwidth)
        # clip values above 1, i.e., timestamps earlier than the delay
        gt_start = (-w * (t_start - delay)).exp()
        gt_start[gt_start > 1] = 1
        gt_stop = (-w * (t_stop - delay)).exp()
        gt_stop[gt_stop > 1] = 1

        gt_d = (gt_stop - gt_start) / w
        gt = -gt_d.view(gt_d.size(0), gt_d.size(1), 1)
        return gt
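
As a sanity check (a reading inferred from the code, not stated in the source): with w = sqrt(1/bandwidth) and delta = delay, the method evaluates the closed-form integral of an exponential decay kernel, with each exponential capped at 1 before the delay:

    \int_{t_{\mathrm{start}}}^{t_{\mathrm{stop}}} e^{-w(s-\delta)}\,ds
    = \frac{e^{-w(t_{\mathrm{start}}-\delta)} - e^{-w(t_{\mathrm{stop}}-\delta)}}{w}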
Example #3
    def __init__(self, num_type, mu_dict, loss_type, use_cuda):
        """
        Initialize generalized Hawkes process
        :param num_type: int, the number of event types.
        :param loss_type: str, the type of loss functions
            The length of the list is the number of modalities of the model
            Each element of the list is the number of event categories for each modality
        """
        self.device = torch.device('cuda:0' if use_cuda else 'cpu')
        self.model_name = 'A Poisson Process'
        self.num_type = num_type
        exogenousIntensity = getattr(model.ExogenousIntensityFamily, mu_dict['model_name'])
        self.lambda_model = exogenousIntensity(num_type, mu_dict['parameter_set'])

        self.loss_type = loss_type
        if self.loss_type == 'mle':
            self.loss_function = MaxLogLike()
        elif self.loss_type == 'ls':
            self.loss_function = LeastSquare()
        elif self.loss_type == 'entropy':
            self.loss_function = CrossEntropy()
        else:
            logger.warning('The loss layer is {}, which cannot be identified... '.format(self.loss_type))
            logger.warning('Maximum likelihood estimation is applied instead.')
            self.loss_function = MaxLogLike()
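
A minimal sketch of the expected mu_dict, assuming the NaiveExogenousIntensity class from Example #5 is registered in model.ExogenousIntensityFamily (names taken from the snippets on this page, not verified against the full library):

    mu_dict = {
        'model_name': 'NaiveExogenousIntensity',     # looked up via getattr on model.ExogenousIntensityFamily
        'parameter_set': {'activation': 'softplus'}  # forwarded to the intensity's __init__
    }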
Example #4
    def integrations(self,
                     t_stop: torch.Tensor,
                     t_start: Optional[torch.Tensor] = None) -> torch.Tensor:
        """
        Calculate the integrations of decay kernel in the interval [t_start, t_stop]
        :param t_stop: a 2D Tensor containing stop timestamps
        :param t_start: a 2D Tensor containing start timestamps, if it is None, it means t_start = 0
        :return:
            gt: a Tensor containing decay kernel's integration values in the interval [t_start, t_stop].
        """
        if t_start is None:
            t_start = torch.zeros_like(t_stop)

        if t_start.size() != t_stop.size():
            logger.warning(
                "t_start does not have the same shape as t_stop; setting t_start to all zeros."
            )
            t_start = torch.zeros_like(t_stop)

        sigma2 = self.parameters[0, 0]
        w = 1 / sigma2
        gt_start = torch.exp(-0.5 * w * (t_start**2))
        gt_stop = torch.exp(-0.5 * w * (t_stop**2))

        gt_d = gt_stop - gt_start
        gt = -gt_d.view(gt_d.size(0), gt_d.size(1), 1)
        return gt
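
This matches the closed-form integral of a Rayleigh-style kernel g(t) = w t e^{-w t^2/2} with w = 1/\sigma^2 (a reading inferred from the code):

    \int_{t_{\mathrm{start}}}^{t_{\mathrm{stop}}} w\,s\,e^{-w s^2/2}\,ds
    = e^{-w\,t_{\mathrm{start}}^2/2} - e^{-w\,t_{\mathrm{stop}}^2/2}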
Example #5
    def __init__(self, num_type: int, parameter_set: Dict = None):
        """
        Initialize exogenous intensity function: mu(t) = mu, mu in R^{C+1}, C is the number of event types
        :param num_type: for a point process with C types of events, num_type = C+1, in which the first type "0"
                         corresponds to an "empty" type never appearing in the sequence.
        :param parameter_set: a dictionary containing parameters
            parameter_set = {'activation': value = names of activation layers ('identity', 'relu', 'softplus')}
        """
        super(NaiveExogenousIntensity, self).__init__(num_type)
        activation = parameter_set['activation'] if parameter_set is not None else None
        if activation is None:
            self.exogenous_intensity_type = 'constant'
            self.activation = 'identity'
        else:
            self.exogenous_intensity_type = '{}(constant)'.format(activation)
            self.activation = activation

        self.num_type = num_type
        self.dim_embedding = 1
        self.emb = nn.Embedding(self.num_type, self.dim_embedding)
        self.emb.weight = nn.Parameter(
            torch.FloatTensor(self.num_type, self.dim_embedding).uniform_(
                0.01 / self.dim_embedding, 1 / self.dim_embedding))
        if self.activation == 'relu':
            self.act = nn.ReLU()
        elif self.activation == 'softplus':
            self.act = nn.Softplus(beta=self.num_type**0.5)
        elif self.activation == 'identity':
            self.act = Identity()
        else:
            logger.warning(
                'The activation layer is {}, which cannot be identified... '.
                format(self.activation))
            logger.warning('Identity activation is applied instead.')
            self.act = Identity()
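
A minimal usage sketch, assuming the class is importable from this module (the embedding stores one scalar per event type, so the exogenous intensity is a constant vector):

    # hypothetical import path; adjust to wherever NaiveExogenousIntensity lives
    intensity = NaiveExogenousIntensity(num_type=4, parameter_set={'activation': 'softplus'})
    print(intensity.exogenous_intensity_type)  # softplus(constant)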
Example #6
    def integrations(self,
                     t_stop: torch.Tensor,
                     t_start: Optional[torch.Tensor] = None) -> torch.Tensor:
        """
        Calculate the integrations of decay kernel in the interval [t_start, t_stop]
        :param t_stop: a 2D Tensor containing stop timestamps
        :param t_start: a 2D Tensor containing start timestamps, if it is None, it means t_start = 0
        :return:
            gt: a Tensor containing decay kernel's integration values in the interval [t_start, t_stop].
        """
        if t_start is None:
            t_start = torch.zeros_like(t_stop)

        if t_start.size() != t_stop.size():
            logger.warning(
                "t_start does not have the same shape as t_stop; setting t_start to all zeros."
            )
            t_start = torch.zeros_like(t_stop)

        landmarks = self.parameters[0, :]
        sigma2 = self.parameters[1, :]
        gt = 0 * t_stop.unsqueeze(2).repeat(1, 1, self.parameters.size(1))
        for i in range(self.parameters.shape[1]):
            gt_start = 0.5 * (1 + torch.erf(
                (t_start - landmarks[i]) / (torch.sqrt(2 * sigma2[i]))))
            gt_stop = 0.5 * (1 + torch.erf(
                (t_stop - landmarks[i]) / (torch.sqrt(2 * sigma2[i]))))
            gt[:, :, i] = gt_stop - gt_start
        return gt
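
Each basis function here is a Gaussian density centered at a landmark, and the erf expression is its CDF; per landmark the loop evaluates (a reading inferred from the code):

    \int_{t_{\mathrm{start}}}^{t_{\mathrm{stop}}} \mathcal{N}(s;\, l_i, \sigma_i^2)\,ds
    = \Phi\!\left(\tfrac{t_{\mathrm{stop}} - l_i}{\sigma_i}\right) - \Phi\!\left(\tfrac{t_{\mathrm{start}} - l_i}{\sigma_i}\right),
    \qquad \Phi(x) = \tfrac{1}{2}\big[1 + \mathrm{erf}(x/\sqrt{2})\big]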
Example #7
    def integrations(self,
                     t_stop: torch.Tensor,
                     t_start: Optional[torch.Tensor] = None) -> torch.Tensor:
        """
        Calculate the integrations of decay kernel in the interval [t_start, t_stop]
        :param t_stop: a 2D Tensor containing stop timestamps
        :param t_start: a 2D Tensor containing start timestamps, if it is None, it means t_start = 0
        :return:
            gt: a Tensor containing decay kernel's integration values in the interval [t_start, t_stop].
        """
        if t_start is None:
            t_start = torch.zeros_like(t_stop)

        if t_start.size() != t_stop.size():
            logger.warning(
                "t_start does not have the same shape as t_stop; setting t_start to all zeros."
            )
            t_start = torch.zeros_like(t_stop)

        delay = self.parameters[0, 0]
        bandwidth = self.parameters[1, 0]
        w = 1 / bandwidth
        gt_start = w * (t_start - delay)
        gt_start[gt_start < 0] = 0
        gt_start[gt_start > 1] = 1

        gt_stop = w * (t_stop - delay)
        gt_stop[gt_stop < 0] = 0
        gt_stop[gt_stop > 1] = 1

        gt_d = gt_stop - gt_start
        gt = gt_d.view(gt_d.size(0), gt_d.size(1), 1)
        return gt
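
This variant corresponds to a uniform (boxcar) kernel of height 1/b supported on [\delta, \delta + b] (b = bandwidth, \delta = delay); clamping (t - \delta)/b to [0, 1] is exactly its CDF (again inferred from the code):

    \int_{t_{\mathrm{start}}}^{t_{\mathrm{stop}}} \tfrac{1}{b}\,\mathbf{1}\{\delta \le s \le \delta + b\}\,ds
    = \mathrm{clip}\!\big(\tfrac{t_{\mathrm{stop}}-\delta}{b}, 0, 1\big) - \mathrm{clip}\!\big(\tfrac{t_{\mathrm{start}}-\delta}{b}, 0, 1\big)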
Example #8
    def __init__(self, num_type: int, kernel, parameter_set: Dict):
        """
        Initialize endogenous impact: phi_{kk'}(t) = sum_{m} a_{kk'm} kernel_m(t),
        for m = 1, ..., M, A_m = [a_{kk'm}] in R^{(C+1)*(C+1)}, C is the number of event types
        :param num_type: for a point process with C types of events, num_type = C+1, in which the first type "0"
                         corresponds to an "empty" type never appearing in the sequence.
        :param kernel: an instance of a decay kernel class in "DecayKernelFamily"
        :param parameter_set: a dictionary containing parameters
            parameter_set = {'activation': value = names of activation layers ('identity', 'relu', 'softplus')
                             'dim_embedding': value = the dimension of feature vector (embedding)}
        """
        super(FactorizedEndogenousImpact, self).__init__(num_type, kernel)
        activation = parameter_set['activation']
        dim_embedding = parameter_set['dim_embedding']
        if activation is None:
            self.endogenous_impact_type = "sum_m (u_{cm}^T * v_{c'm}) * kernel_m(t)"
            self.activation = 'identity'
        else:
            self.endogenous_impact_type = "sum_m {}(u_(cm)^T * v_(c'm)) * kernel_m(t))".format(activation)
            self.activation = activation

        self.decay_kernel = kernel
        self.num_base = self.decay_kernel.parameters.shape[1]
        self.num_type_u = num_type
        self.num_type_v = num_type
        self.dim_embedding = dim_embedding
        self.basis_u = nn.ModuleList()
        self.basis_v = nn.ModuleList()
        for m in range(self.num_base):
            emb_u = nn.Embedding(self.num_type_u, self.dim_embedding)
            emb_v = nn.Embedding(self.num_type_v, self.dim_embedding)
            emb_u.weight = nn.Parameter(
                torch.FloatTensor(self.num_type_u, self.dim_embedding).uniform_(
                    0.01 / self.dim_embedding, 1 / self.dim_embedding))
            emb_v.weight = nn.Parameter(
                torch.FloatTensor(self.num_type_v, self.dim_embedding).uniform_(
                    0.01 / self.dim_embedding, 1 / self.dim_embedding))
            self.basis_u.append(emb_u)
            self.basis_v.append(emb_v)

        if self.activation == 'relu':
            self.act = nn.ReLU()
        elif self.activation == 'softplus':
            self.act = nn.Softplus(beta=self.num_type**0.5)
        elif self.activation == 'identity':
            self.act = Identity()
        else:
            logger.warning('The activation layer is {}, which cannot be identified... '.format(self.activation))
            logger.warning('Identity activation is applied instead.')
            self.act = Identity()
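
The factorization replaces the full coefficient tensors A_m with per-type embeddings; with f the activation and u_{km}, v_{k'm} the rows of basis_u[m], basis_v[m], the impact function described by endogenous_impact_type is:

    \phi_{kk'}(t) = \sum_{m=1}^{M} f\!\left(u_{km}^{\top} v_{k'm}\right) \kappa_m(t)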
Example #9
    def load_model(self, full_path, mode: str = 'entire'):
        """
        Load a pre-trained model
        :param full_path: the path of the model file
        :param mode: 'parameter' for loading only the parameters of the model,
                     'entire' for loading the entire model
        """
        if mode == 'entire':
            self.lambda_model = torch.load(full_path)
        elif mode == 'parameter':
            self.lambda_model.load_state_dict(torch.load(full_path))
        else:
            logger.warning("'{}' is an undefined mode; the 'entire' mode is used instead.".format(mode))
            self.lambda_model = torch.load(full_path)
Example #10
    def integrations(self,
                     t_stop: torch.Tensor,
                     t_start: Optional[torch.Tensor] = None) -> torch.Tensor:
        """
        Calculate the integrations of decay kernel in the interval [t_start, t_stop]
        :param t_stop: a 2D Tensor containing stop timestamps
        :param t_start: a 2D Tensor containing start timestamps, if it is None, it means t_start = 0
        :return:
            gt: a Tensor containing decay kernel's integration values in the interval [t_start, t_stop].
        """
        if t_start is None:
            t_start = torch.zeros_like(t_stop)

        if t_start.size() != t_stop.size():
            logger.warning(
                "t_start does not have the same shape as t_stop; setting t_start to all zeros."
            )
            t_start = torch.zeros_like(t_stop)

        delay = self.parameters[0, 0]
        bandwidth = self.parameters[1, 0]

        # condition 1: before the delay, the cumulative integral grows linearly
        dt_start1 = t_start.clone()
        dt_start1[t_start > delay] = 0
        gt_start1 = dt_start1 * (bandwidth - 1) / delay
        # condition 2: after the delay, a power-law tail
        dt_start2 = t_start.clone()
        dt_start2[t_start <= delay] = delay
        gt_start2 = 1 - (delay / dt_start2)**(bandwidth - 1) + (bandwidth - 1)
        gt_start2[t_start <= delay] = 0
        gt_start = gt_start1 + gt_start2

        # the same two cases for the stop timestamps
        dt_stop1 = t_stop.clone()
        dt_stop1[t_stop > delay] = 0
        gt_stop1 = dt_stop1 * (bandwidth - 1) / delay
        dt_stop2 = t_stop.clone()
        dt_stop2[t_stop <= delay] = delay
        gt_stop2 = 1 - (delay / dt_stop2)**(bandwidth - 1) + (bandwidth - 1)
        gt_stop2[t_stop <= delay] = 0
        gt_stop = gt_stop1 + gt_stop2

        gt_d = gt_stop - gt_start
        gt = gt_d.view(gt_d.size(0), gt_d.size(1), 1)
        return gt
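
Differentiating the piecewise expression suggests the underlying kernel (a reading inferred from the code; b = bandwidth, \delta = delay): a constant plateau before the delay and a Pareto-style power-law tail after it,

    G(t) = \begin{cases} (b-1)\,t/\delta, & t \le \delta \\ b - (\delta/t)^{b-1}, & t > \delta \end{cases}
    \qquad
    g(t) = G'(t) = \begin{cases} (b-1)/\delta, & t < \delta \\ (b-1)\,\delta^{b-1}/t^{b}, & t > \delta \end{cases}

and the method returns G(t_stop) - G(t_start).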
Example #11
    def save_model(self, full_path, mode: str = 'entire'):
        """
        Save the trained model
        :param full_path: the path of the model file
        :param mode: 'parameter' for saving only the parameters of the model, 'entire' for saving the entire model
        """
        if mode == 'entire':
            torch.save(self.lambda_model, full_path)
        elif mode == 'parameter':
            torch.save(self.lambda_model.state_dict(), full_path)
        else:
            logger.warning("'{}' is an undefined mode; the 'entire' mode is used instead.".format(mode))
            torch.save(self.lambda_model, full_path)
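
A minimal round-trip sketch of the two modes using plain torch (nn.Linear stands in for lambda_model; the real model class is not shown on this page):

    import torch
    import torch.nn as nn

    model = nn.Linear(4, 1)                               # stand-in for lambda_model
    torch.save(model, 'model_entire.pt')                  # 'entire' mode: pickles the whole module
    torch.save(model.state_dict(), 'model_param.pt')      # 'parameter' mode: tensors only

    restored = torch.load('model_entire.pt')              # unpickles the module; its class must be importable
    model.load_state_dict(torch.load('model_param.pt'))   # requires an existing, matching instance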
Example #12
    def __init__(self, database, memorysize: int = None):
        """
        :param database: the observed event sequences
            database = {'event_features': None or (De, C) float array of event's static features,
                                      C is the number of event types.
                        'type2idx': a Dict = {'event_name': event_index}
                        'idx2type': a Dict = {event_index: 'event_name'}
                        'seq2idx': a Dict = {'seq_name': seq_index}
                        'idx2seq': a Dict = {seq_index: 'seq_name'}
                        'sequences': a List  = {seq_1, seq_2, ..., seq_N}.
                        }

            For the i-th sequence:
            seq_i = {'times': (N,) float array of timestamps, N is the number of events.
                     'events': (N,) int array of event types.
                     'seq_feature': None or (Ds,) float array of sequence's static feature.
                     't_start': a float number indicating the start timestamp of the sequence.
                     't_stop': a float number indicating the stop timestamp of the sequence.
                     'label': None or int number indicating the labels of the sequence}
        :param memorysize: how many historical events are remembered for each event
            When memorysize = None:
                All events in a sequence are considered.
                In that case, each batch can only contain one sequence, because different sequences may
                have different lengths.
            When memorysize = K:
                Only the last K events of each sequence are memorized.
                A sequence with fewer than K events is padded at the beginning with the virtual event "0".
        """
        self.event_cell = []
        self.time_cell = []
        self.database = database
        self.memory_size = memorysize
        if self.memory_size is None:
            logger.warning(
                "Because memory size is not given, the sampler can only sample 1 sequence per batch."
            )
            logger.warning("Please set batch size = 1 in your code.")

        for i in range(len(database['sequences'])):
            seq_i = database['sequences'][i]
            times = seq_i['times']
            events = seq_i['events']
            t_start = seq_i['t_start']
            target = seq_i['label']
            target_t = seq_i['t_stop']
            if self.memory_size is None:
                former = events
                former_t = times
            else:
                # initialize the memory with random event types and the start timestamp
                # (note: the docstring describes padding with the virtual event "0";
                # the current code pads with randomly drawn event types instead)
                former = np.random.choice(len(self.database['type2idx']),
                                          memorysize)
                former_t = t_start * np.ones((memorysize, ))

                if 0 < times.shape[0] < memorysize:
                    # fill only the trailing slots so the padding stays at the front
                    former[-times.shape[0]:] = events
                    former_t[-times.shape[0]:] = times
                else:
                    former = events[-memorysize:]
                    former_t = times[-memorysize:]

            self.event_cell.append((target, former, i))
            self.time_cell.append((target_t, former_t))
        logger.info('In this dataset, the number of sequences = {}.'.format(
            len(self.event_cell)))
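
A minimal, self-contained sketch of the `database` schema this constructor expects (toy values; field names taken from the docstring above):

    import numpy as np

    database = {
        'event_features': None,
        'type2idx': {'empty': 0, 'click': 1, 'buy': 2},
        'idx2type': {0: 'empty', 1: 'click', 2: 'buy'},
        'seq2idx': {'user_a': 0},
        'idx2seq': {0: 'user_a'},
        'sequences': [{
            'times': np.array([0.5, 1.2, 3.0]),
            'events': np.array([1, 1, 2]),
            'seq_feature': None,
            't_start': 0.0,
            't_stop': 4.0,
            'label': None,
        }],
    }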
Example #13
def stitching(database1: Dict,
              database2: Dict,
              method: str = 'random') -> Dict:
    """
    Stitch each sequence in database2 to the end of one sequence of database1
    :param database1: one set of observed event sequences
    :param database2: another set of observed event sequences
        database = {'event_features': None or (De, C) float array of event's static features,
                                  C is the number of event types.
                    'type2idx': a Dict = {'event_name': event_index}
                    'idx2type': a Dict = {event_index: 'event_name'}
                    'seq2idx': a Dict = {'seq_name': seq_index}
                    'idx2seq': a Dict = {seq_index: 'seq_name'}
                    'sequences': a List  = {seq_1, seq_2, ..., seq_N}.
                    }

        For the i-th sequence:
        seq_i = {'times': (N,) float array of timestamps, N is the number of events.
                 'events': (N,) int array of event types.
                 'seq_feature': None or (Ds,) float array of sequence's static feature.
                 't_start': a float number indicating the start timestamp of the sequence.
                 't_stop': a float number indicating the stop timestamp of the sequence.
                 'label': None or int/float number indicating the labels of the sequence}

    :param method: a string indicating the stitching method:
        "random": stitch seq_j from sequences2 to the end of seq_i in sequences1 for j ~ {1,...,N}, i = 1,...,N,
                  with time-shifting applied to sequences2.
                  This method is suitable for sequences generated by the same stationary point process.

        "feature": stitch seq_j from sequences2 to the end of seq_i in sequences1 for j ~ {1,...,N}, i = 1,...,N,
                   where j is sampled according to the similarity between the two sequences.
                   The similarity is calculated by a Gaussian kernel of seq_features, labels, and times.
                   When seq_features/labels are not available, only timestamp information is taken into account.

    :return:
        the stitched sequences, in the same format as database1.
    """
    start = time.time()
    output = copy.deepcopy(database1)
    if database1['type2idx'] == database2['type2idx']:
        if method is None or method == 'random':
            logger.info('random stitching is applied...')
            # random permutation of the index of sequences
            index = np.random.permutation(len(database2['sequences']))

            for i in range(len(database1['sequences'])):
                seq_i = database1['sequences'][i]
                j = i % len(database2['sequences'])
                seq_j = database2['sequences'][index[j]]

                # concatenate two timestamp arrays with time shifting
                times1 = seq_i['times']
                times2 = seq_j['times'] - seq_j['t_start'] + seq_i['t_stop']
                output['sequences'][i]['times'] = np.concatenate(
                    (times1, times2), axis=0)

                # concatenate two event arrays
                output['sequences'][i]['events'] = np.concatenate(
                    (seq_i['events'], seq_j['events']), axis=0)

                # update stop timestamp
                output['sequences'][i]['t_stop'] = seq_i['t_stop'] + seq_j[
                    't_stop'] - seq_j['t_start']

                # update features
                if seq_i['seq_feature'] is not None and seq_j[
                        'seq_feature'] is not None:
                    output['sequences'][i]['seq_feature'] = (
                        seq_i['seq_feature'] + seq_j['seq_feature']) / 2

                if i % 1000 == 0:
                    logger.info(
                        '{} sequences have been stitched... Time={}ms.'.format(
                            i, round(1000 * (time.time() - start))))

        elif method == 'feature':
            logger.info('feature-based stitching is applied...')

            for i in range(len(database1['sequences'])):
                prob = np.zeros((len(database2['sequences']), ))
                seq_i = database1['sequences'][i]

                for j in range(len(database2['sequences'])):
                    seq_j = database2['sequences'][j]

                    if seq_j['t_start'] > seq_i['t_stop']:
                        # consider temporal order
                        weight = np.exp(-(seq_j['t_start'] -
                                          seq_i['t_stop'])**2)
                        # consider feature similarity
                        if seq_i['seq_feature'] is not None and seq_j[
                                'seq_feature'] is not None:
                            weight *= np.exp(
                                -np.linalg.norm(seq_i['seq_feature'] -
                                                seq_j['seq_feature'])**2)
                        # consider label consistency
                        if seq_i['label'] is not None and seq_j[
                                'label'] is not None:
                            if seq_i['label'] != seq_j['label']:
                                weight = 0
                    else:
                        weight = 0

                    prob[j] = weight

                # sampling a sequence from database2
                if np.sum(prob) > 0:
                    prob = prob / np.sum(prob)
                else:
                    prob = np.ones((len(database2['sequences']), )) / len(
                        database2['sequences'])

                j = np.random.choice(len(database2['sequences']), p=prob)
                seq_j = database2['sequences'][j]

                # concatenate two timestamp arrays with time shifting
                times1 = seq_i['times']
                times2 = seq_j['times'] - seq_j['t_start'] + seq_i['t_stop']
                output['sequences'][i]['times'] = np.concatenate(
                    (times1, times2), axis=0)

                # concatenate two event arrays
                output['sequences'][i]['events'] = np.concatenate(
                    (seq_i['events'], seq_j['events']), axis=0)

                # update stop timestamp
                output['sequences'][i]['t_stop'] = seq_i['t_stop'] + seq_j[
                    't_stop'] - seq_j['t_start']

                # update features
                if seq_i['seq_feature'] is not None and seq_j[
                        'seq_feature'] is not None:
                    output['sequences'][i]['seq_feature'] = (
                        seq_i['seq_feature'] + seq_j['seq_feature']) / 2

                if i % 1000 == 0:
                    logger.info(
                        '{} sequences have been stitched... Time={}ms.'.format(
                            i, round(1000 * (time.time() - start))))
        else:
            logger.warning('You need to define your own stitching method... '
                           'The function returns the first database.')
    else:
        logger.warning('The two databases do not have the same event types... '
                       'The function returns the first database.')

    return output
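
A usage sketch, assuming `stitching` is importable from this preprocessing module and reusing the toy `database` from Example #12:

    import copy

    db1 = database                  # toy database from the Example #12 sketch
    db2 = copy.deepcopy(database)   # pretend second corpus with the same event types
    db2['sequences'][0]['times'] = db2['sequences'][0]['times'] + 4.0
    db2['sequences'][0]['t_start'] = 4.0
    db2['sequences'][0]['t_stop'] = 8.0

    stitched = stitching(db1, db2, method='random')
    print(stitched['sequences'][0]['t_stop'])  # 8.0: grows by the appended sequence's duration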
Example #14
def load_seq_labels_csv(file_name: str, seq_domain: str, domain_dict: Dict,
                        database: Dict):
    """
    load sequences' features from a csv file
    :param file_name: the path and the name of the csv file
    :param seq_domain: the name of the key column corresponding to sequence index.
    :param domain_dict: a dictionary containing the name of the key column corresponding to the labels.
        The format should be
            domain_dict = {'domain_name': domain's feature type}
        The dictionary should only contain one key.
        If multiple keys are provided, only the first one is considered.

        Two types are considered:
        1) 'numerical': each element (row) in the corresponding domain should be a string containing D numbers
            separated by spaces, and D should be the same for all elements.
            A D-dimensional real-valued label will be generated for this domain.
            If a sequence has multiple rows, the average of its labels will be recorded.

        2) 'categorical': each element (row) in the corresponding domain should be a string containing N keywords.
            An N-dimensional categorical label will be generated for this domain.
            If a sequence has multiple rows, the aggregation of its categories will be recorded.

    :param database: a dictionary of data generated by the function "load_sequences_csv()"
    :return: a database having sequences' labels
    """
    df = pd.read_csv(file_name)
    num_seq = len(database['seq2idx'])
    # initialize features
    keys = list(domain_dict.keys())
    label_domain = keys[0]
    if len(keys) > 1:
        logger.warning(
            "{} label domains are found. Only the first domain '{}' is used to generate labels."
            .format(len(keys), label_domain))

    features = {}
    counts = {}
    features[label_domain] = None

    logger.info('Start to generate sequence labels...')
    start = time.time()
    for i, row in df.iterrows():
        seq_name = str(row[seq_domain])
        if seq_name not in database['seq2idx'].keys():
            logger.warning(
                "'{}' is a new sequence not appearing in current database.".
                format(seq_name))
            logger.warning("It will be ignored in the process.")
        else:
            seq_idx = database['seq2idx'][seq_name]
            elements = str(row[label_domain])
            if domain_dict[label_domain] == 'numerical':
                elements = np.asarray(list(map(float, elements.split())))
                dim = elements.shape[0]
                if features[label_domain] is None:
                    features[label_domain] = np.zeros((dim, num_seq))
                    features[label_domain][:, seq_idx] = elements
                    counts[label_domain] = np.zeros((1, num_seq))
                    counts[label_domain][0, seq_idx] = 1
                else:
                    features[label_domain][:, seq_idx] += elements
                    counts[label_domain][0, seq_idx] += 1

            elif domain_dict[label_domain] == 'categorical':
                elements = elements.split()
                if features[label_domain] is None:
                    features[label_domain] = {}
                    features[label_domain][seq_idx] = elements
                    counts[label_domain] = {}
                    element_idx = 0
                else:
                    if seq_idx not in features[label_domain].keys():
                        features[label_domain][seq_idx] = elements
                    else:
                        features[label_domain][seq_idx].extend(elements)
                for element in elements:
                    if element not in counts[label_domain].keys():
                        counts[label_domain][element] = element_idx
                        element_idx += 1
            else:
                logger.warning(
                    'Undefined feature type for the domain {}.'.format(
                        label_domain))
                logger.warning("It will be ignored in the process.")
        if i % 1000 == 0:
            logger.info('{} rows have been processed... Time={}ms.'.format(
                i, round(1000 * (time.time() - start))))

    # post-process of features
    start = time.time()
    if domain_dict[label_domain] == 'numerical':
        features_tmp = features[label_domain]
        features_tmp = features_tmp / np.tile(
            counts[label_domain], (features[label_domain].shape[0], 1))
        for seq_idx in range(features_tmp.shape[1]):
            database['sequences'][seq_idx]['label'] = features_tmp[:, seq_idx]

    elif domain_dict[label_domain] == 'categorical':
        for seq_idx in features[label_domain].keys():
            elements = list(set(features[label_domain][seq_idx]))
            feature_tmp = []
            for element in elements:
                element_idx = counts[label_domain][element]
                feature_tmp.append(element_idx)
            feature_tmp = np.asarray(feature_tmp, dtype=int)  # np.int is removed in recent NumPy
            database['sequences'][seq_idx]['label'] = feature_tmp
    else:
        logger.warning('Undefined label type for the domain {}.'.format(label_domain))
        logger.warning("It will be ignored in the process.")
    logger.info("Labels of domain '{}' are generated... Time={}ms.".format(
        label_domain, round(1000 * (time.time() - start))))

    return database
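
A sketch of a matching labels file and call (the column names 'seq_name' and 'category' are hypothetical; `database` comes from load_sequences_csv()):

    # labels.csv (hypothetical):
    #   seq_name,category
    #   user_a,vip returning
    #   user_b,new

    database = load_seq_labels_csv('labels.csv',
                                   seq_domain='seq_name',
                                   domain_dict={'category': 'categorical'},
                                   database=database)
    print(database['sequences'][0]['label'])  # integer indices of the sequence's keywords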
Example #15
def load_event_features_csv(file_name: str,
                            event_domain: str,
                            domain_dict: Dict,
                            database: Dict,
                            normalize: int = 0):
    """
    load events' features from a csv file
    :param file_name: the path and the name of the csv file
    :param event_domain: the name of the key column corresponding to event index.
    :param domain_dict: a dictionary containing the names of the key columns corresponding to the features.
        The format should be
            domain_dict = {'domain_name': domain's feature type}
        Two types are considered:
        1) 'numerical': each element (row) in the corresponding domain should be a string containing D numbers
            separated by spaces, and D should be the same for all elements.
            D-dimensional real-valued features will be generated for this domain.
            If an event type has multiple rows, the average of the features will be recorded.

        2) 'categorical': each element (row) in the corresponding domain should be a string containing N keywords
            separated by spaces, but N can be different for different elements.
            D-dimensional binary features will be generated for this domain. Here D is the number of distinct
            keywords (vocabulary size).
            If an event type has multiple rows, the aggregation of the binary features will be recorded.

    :param database: a dictionary of data generated by the function "load_sequences_csv()"
    :param normalize: 0 = no normalization, 1 = normalization across features, 2 = normalization across event types
    :return: a database having events' features
    """
    df = pd.read_csv(file_name)
    num_event = len(database['type2idx'])

    # initialize features
    features = {}
    counts = {}
    for key in domain_dict.keys():
        features[key] = None
        counts[key] = None

    logger.info('Start to generate event features...')
    start = time.time()
    for i, row in df.iterrows():
        event_name = str(row[event_domain])
        if event_name not in database['type2idx'].keys():
            logger.warning(
                "'{}' is a new event type not appearing in current database.".
                format(event_name))
            logger.warning("It will be ignored in the process.")
        else:
            event_idx = database['type2idx'][event_name]
            for key in domain_dict.keys():
                elements = str(row[key])
                if domain_dict[key] == 'numerical':
                    elements = np.asarray(list(map(float, elements.split())))
                    dim = elements.shape[0]
                    if features[key] is None:
                        features[key] = np.zeros((dim, num_event))
                        features[key][:, event_idx] = elements
                        counts[key] = np.zeros((1, num_event))
                        counts[key][0, event_idx] = 1
                        # type 0 ("empty") never appears in the csv; keep its count at 1
                        # so the later averaging does not divide by zero
                        counts[key][0, 0] = 1
                    else:
                        features[key][:, event_idx] += elements
                        counts[key][0, event_idx] += 1

                elif domain_dict[key] == 'categorical':
                    elements = elements.split()
                    if features[key] is None:
                        features[key] = {}
                        features[key][event_idx] = elements
                        counts[key] = {}
                        element_idx = 0
                    else:
                        if event_idx not in features[key].keys():
                            features[key][event_idx] = elements
                        else:
                            features[key][event_idx].extend(elements)
                    for element in elements:
                        if element not in counts[key].keys():
                            counts[key][element] = element_idx
                            element_idx += 1
                else:
                    logger.warning(
                        'Undefined feature type for the domain {}.'.format(
                            key))
                    logger.warning("It will be ignored in the process.")
        if i % 1000 == 0:
            logger.info('{} rows have been processed... Time={}ms'.format(
                i, round(1000 * (time.time() - start))))

    # post-process of features
    features_all = None
    start = time.time()
    for key in domain_dict.keys():
        if domain_dict[key] == 'numerical':
            features_tmp = features[key]
            features_tmp = features_tmp / np.tile(counts[key],
                                                  (features[key].shape[0], 1))
            if features_all is None:
                features_all = features_tmp
            else:
                features_all = np.concatenate((features_all, features_tmp),
                                              axis=0)

        elif domain_dict[key] == 'categorical':
            features_tmp = np.zeros((len(counts[key]), num_event))
            for event_idx in features[key].keys():
                for element in features[key][event_idx]:
                    element_idx = counts[key][element]
                    features_tmp[element_idx, event_idx] += 1
            if features_all is None:
                features_all = features_tmp
            else:
                features_all = np.concatenate((features_all, features_tmp),
                                              axis=0)
        else:
            logger.warning(
                'Undefined feature type for the domain {}.'.format(key))
            logger.warning("It will be ignored in the process.")
        logger.info(
            "Features of domain '{}' are generated... Time={}ms.".format(
                key, round(1000 * (time.time() - start))))

    if normalize == 1:
        # normalize each event type's feature vector to sum (approximately) to one
        features_all = features_all / (np.sum(features_all, axis=0, keepdims=True) + 1e-8)
    if normalize == 2:
        # normalize each feature dimension across event types
        features_all = features_all / (np.sum(features_all, axis=1, keepdims=True) + 1e-8)
    database['event_features'] = features_all

    return database
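
A matching sketch for event features (the column names 'event_name' and 'tags' are hypothetical; normalize=1 rescales each event type's feature column to sum to one):

    # features.csv (hypothetical):
    #   event_name,tags
    #   click,web mobile
    #   buy,web checkout

    database = load_event_features_csv('features.csv',
                                       event_domain='event_name',
                                       domain_dict={'tags': 'categorical'},
                                       database=database,
                                       normalize=1)
    print(database['event_features'].shape)  # (vocabulary size, number of event types)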