def __init__(self,
             exogenous_intensity,
             endogenous_intensity,
             activation: str = None,
             prob=1.0):
    """
    Initialize the intensity function of a Hawkes process:
    lambda(t) = activation(exogenous intensity + endogenous impacts)
    :param exogenous_intensity: the object modelling the exogenous intensity (stored as-is)
    :param endogenous_intensity: the object modelling the endogenous impacts (stored as-is)
    :param activation: the name of the activation layer ('identity', 'relu', 'softplus').
        None means 'identity'. An unrecognised name is reported with a warning and
        the identity activation is used instead.
    :param prob: stored unchanged in self.prob; it is not used by this constructor.
    :raises AttributeError: if 'softplus' is requested and the number of event types
        can be found neither on this object nor on exogenous_intensity.
    """
    super(HawkesProcessIntensity, self).__init__()
    self.exogenous_intensity = exogenous_intensity
    self.endogenous_intensity = endogenous_intensity
    if activation is None:
        self.intensity_type = "exogenous intensity + endogenous impacts"
        self.activation = 'identity'
    else:
        self.intensity_type = "{}(exogenous intensity + endogenous impacts)".format(
            activation)
        self.activation = activation

    if self.activation == 'relu':
        self.act = nn.ReLU()
    elif self.activation == 'softplus':
        # This constructor never assigns self.num_type, so reading it directly
        # only works if the class provides it some other way. Prefer that value
        # when present, otherwise fall back to the exogenous intensity.
        # NOTE(review): assumes exogenous_intensity exposes `num_type` - confirm.
        num_type = getattr(self, 'num_type', None)
        if num_type is None:
            num_type = getattr(exogenous_intensity, 'num_type', None)
        if num_type is None:
            raise AttributeError(
                "The softplus activation needs the number of event types, but "
                "'num_type' is defined neither on the intensity nor on its "
                "exogenous intensity.")
        self.act = nn.Softplus(beta=num_type**0.5)
    elif self.activation == 'identity':
        self.act = Identity()
    else:
        logger.warning(
            'The actvation layer is {}, which can not be identified... '.
            format(self.activation))
        logger.warning('Identity activation is applied instead.')
        self.act = Identity()
    self.prob = prob
def integrations(self,
                 t_stop: torch.Tensor,
                 t_start: Optional[torch.Tensor] = None) -> torch.Tensor:
    """
    Calculate the integrations of decay kernel in the interval [t_start, t_stop]
    :param t_stop: a 2D Tensor containing stop timestamps
    :param t_start: a 2D Tensor containing start timestamps,
        if it is None, it means t_start = 0.
        If its shape differs from t_stop, a warning is logged and zeros are used.
    :return:
        gt: a Tensor of shape (t_stop.size(0), t_stop.size(1), 1) containing
        decay kernel's integration values in the interval [t_start, t_stop].
    """
    if t_start is None:
        t_start = 0 * t_stop

    if t_start.size() != t_stop.size():
        logger.warning(
            "The t_start does not have the same shape with t_stop, we set t_start to all zeros"
        )
        t_start = 0 * t_stop

    delay = self.parameters[0, 0]
    bandwidth = self.parameters[1, 0]
    w = torch.sqrt(1 / bandwidth)

    # The kernel value is capped at 1, which happens whenever t is earlier than
    # the delay. clamp() is used instead of a masked in-place write because
    # exp() keeps its output for the backward pass; overwriting that output in
    # place makes backward() fail when the parameters require gradients.
    gt_start = torch.clamp((-w * (t_start - delay)).exp(), max=1)
    gt_stop = torch.clamp((-w * (t_stop - delay)).exp(), max=1)

    gt_d = (gt_stop - gt_start) / w
    # Add a trailing axis of size 1 and flip the sign so the result is
    # (value at t_start - value at t_stop) / w.
    gt = -gt_d.view(gt_d.size(0), gt_d.size(1), 1)
    return gt
def __init__(self, num_type, mu_dict, loss_type, use_cuda):
    """
    Initialize a point process model made of an exogenous intensity and a loss layer
    :param num_type: int, the number of event types.
    :param mu_dict: a dictionary describing the exogenous intensity
        mu_dict = {'model_name': the name of a class in model.ExogenousIntensityFamily,
                   'parameter_set': the parameters passed to that class}
    :param loss_type: str, the type of loss function:
        'mle' (maximum likelihood), 'ls' (least square) or 'entropy' (cross entropy).
        Any other value is reported with a warning and 'mle' is applied instead.
    :param use_cuda: if True the device is 'cuda:0', otherwise 'cpu'
    """
    self.device = torch.device('cuda:0' if use_cuda else 'cpu')
    self.model_name = 'A Poisson Process'
    self.num_type = num_type
    exogenousIntensity = getattr(model.ExogenousIntensityFamily, mu_dict['model_name'])
    self.lambda_model = exogenousIntensity(num_type, mu_dict['parameter_set'])
    self.loss_type = loss_type
    if self.loss_type == 'mle':
        self.loss_function = MaxLogLike()
    elif self.loss_type == 'ls':
        self.loss_function = LeastSquare()
    elif self.loss_type == 'entropy':
        self.loss_function = CrossEntropy()
    else:
        # Report the requested loss name. self.loss_function is not assigned
        # yet at this point, so formatting it would raise AttributeError
        # instead of falling back gracefully.
        logger.warning('The loss layer is {}, which is not identified... '.format(self.loss_type))
        logger.warning('Maximum likelihood estimation is applied instead.')
        self.loss_function = MaxLogLike()
def integrations(self,
                 t_stop: torch.Tensor,
                 t_start: Optional[torch.Tensor] = None) -> torch.Tensor:
    """
    Integrate the decay kernel over the interval [t_start, t_stop]
    :param t_stop: a 2D Tensor containing stop timestamps
    :param t_start: a 2D Tensor containing start timestamps.
        None means t_start = 0; a Tensor whose shape differs from t_stop
        is replaced by zeros after a warning.
    :return:
        a Tensor of shape (t_stop.size(0), t_stop.size(1), 1) holding
        exp(-t_start^2 / (2 * sigma2)) - exp(-t_stop^2 / (2 * sigma2)),
        where sigma2 = self.parameters[0, 0].
    """
    if t_start is None:
        t_start = 0 * t_stop
    elif t_start.size() != t_stop.size():
        logger.warning(
            "The t_start does not have the same shape with t_stop, we set t_start to all zeros"
        )
        t_start = 0 * t_stop

    inv_sigma2 = 1 / self.parameters[0, 0]

    def decayed(t):
        # exp(-t^2 / (2 * sigma2)), evaluated element-wise
        return torch.exp(-0.5 * inv_sigma2 * (t**2))

    diff = decayed(t_stop) - decayed(t_start)
    # trailing axis of size 1, sign flipped so the start term comes first
    return -diff.view(diff.size(0), diff.size(1), 1)
def __init__(self, num_type: int, parameter_set: Dict = None):
    """
    Initialize exogenous intensity function: mu(t) = mu, mu in R^{C+1}, C is the number of event type
    :param num_type: for a point process with C types of events, num_type = C+1, in which the first type "0"
                     corresponds to an "empty" type never appearing in the sequence.
    :param parameter_set: a dictionary containing parameters
        parameter_set = {'activation': value = names of activation layers ('identity', 'relu', 'softplus')}
        None (the default) or a missing 'activation' key is treated as
        'activation' = None, i.e., the identity activation.
    """
    super(NaiveExogenousIntensity, self).__init__(num_type)
    # The signature advertises parameter_set=None, so it must not be
    # subscripted unconditionally (that raised TypeError for the default).
    if parameter_set is None:
        parameter_set = {}
    activation = parameter_set.get('activation')
    if activation is None:
        self.exogenous_intensity_type = 'constant'
        self.activation = 'identity'
    else:
        self.exogenous_intensity_type = '{}(constant)'.format(activation)
        self.activation = activation

    self.num_type = num_type
    self.dim_embedding = 1
    # One scalar per event type; the default embedding weights are replaced
    # by values drawn uniformly from [0.01 / dim_embedding, 1 / dim_embedding].
    self.emb = nn.Embedding(self.num_type, self.dim_embedding)
    self.emb.weight = nn.Parameter(
        torch.FloatTensor(self.num_type, self.dim_embedding).uniform_(
            0.01 / self.dim_embedding, 1 / self.dim_embedding))

    if self.activation == 'relu':
        self.act = nn.ReLU()
    elif self.activation == 'softplus':
        self.act = nn.Softplus(beta=self.num_type**0.5)
    elif self.activation == 'identity':
        self.act = Identity()
    else:
        logger.warning(
            'The actvation layer is {}, which can not be identified... '.
            format(self.activation))
        logger.warning('Identity activation is applied instead.')
        self.act = Identity()
def integrations(self,
                 t_stop: torch.Tensor,
                 t_start: Optional[torch.Tensor] = None) -> torch.Tensor:
    """
    Integrate every basis of the decay kernel over the interval [t_start, t_stop]
    :param t_stop: a 2D Tensor containing stop timestamps
    :param t_start: a 2D Tensor containing start timestamps.
        None means t_start = 0; a Tensor whose shape differs from t_stop
        is replaced by zeros after a warning.
    :return:
        a Tensor of shape (t_stop.size(0), t_stop.size(1), M), M = self.parameters.size(1).
        Slice m holds the difference of the Gaussian cumulative distribution
        (landmark self.parameters[0, m], variance self.parameters[1, m])
        between t_stop and t_start.
    """
    if t_start is None:
        t_start = 0 * t_stop
    elif t_start.size() != t_stop.size():
        logger.warning(
            "The t_start does not have the same shape with t_stop, we set t_start to all zeros"
        )
        t_start = 0 * t_stop

    centers = self.parameters[0, :]
    variances = self.parameters[1, :]
    num_base = self.parameters.size(1)

    def gaussian_cdf(t, center, variance):
        # cumulative distribution of N(center, variance) evaluated at t
        return 0.5 * (1 + torch.erf((t - center) / (torch.sqrt(2 * variance))))

    # zero-filled output that inherits the dtype and device of t_stop
    gt = 0 * t_stop.unsqueeze(2).repeat(1, 1, num_base)
    for m in range(num_base):
        upper = gaussian_cdf(t_stop, centers[m], variances[m])
        lower = gaussian_cdf(t_start, centers[m], variances[m])
        gt[:, :, m] = upper - lower
    return gt
def integrations(self,
                 t_stop: torch.Tensor,
                 t_start: Optional[torch.Tensor] = None) -> torch.Tensor:
    """
    Integrate the decay kernel over the interval [t_start, t_stop]
    :param t_stop: a 2D Tensor containing stop timestamps
    :param t_start: a 2D Tensor containing start timestamps.
        None means t_start = 0; a Tensor whose shape differs from t_stop
        is replaced by zeros after a warning.
    :return:
        a Tensor of shape (t_stop.size(0), t_stop.size(1), 1) holding
        ramp(t_stop) - ramp(t_start), where ramp(t) = (t - delay) / bandwidth
        limited to [0, 1], delay = self.parameters[0, 0] and
        bandwidth = self.parameters[1, 0].
    """
    if t_start is None:
        t_start = 0 * t_stop
    elif t_start.size() != t_stop.size():
        logger.warning(
            "The t_start does not have the same shape with t_stop, we set t_start to all zeros"
        )
        t_start = 0 * t_stop

    delay = self.parameters[0, 0]
    inv_bandwidth = 1 / self.parameters[1, 0]

    def ramp(t):
        # linear in (t - delay), saturated below at 0 and above at 1
        value = inv_bandwidth * (t - delay)
        value[value < 0] = 0
        value[value > 1] = 1
        return value

    lower = ramp(t_start)
    upper = ramp(t_stop)
    diff = upper - lower
    return diff.view(diff.size(0), diff.size(1), 1)
def __init__(self, num_type: int, kernel, parameter_set: Dict):
    """
    Build the factorized endogenous impact
    phi_{kk'}(t) = sum_{m} a_{kk'm} kernel_m(t), for m = 1, ..., M,
    where each a_{kk'm} comes from a pair of embeddings (u, v) of the event types.
    :param num_type: for a point process with C types of events, num_type = C+1, in which the first type "0"
                     corresponds to an "empty" type never appearing in the sequence.
    :param kernel: an instance of a decay kernel class in "DecayKernelFamily";
                   the number of bases M is kernel.parameters.shape[1]
    :param parameter_set: a dictionary containing parameters
        parameter_set = {'activation': value = names of activation layers ('identity', 'relu', 'softplus')
                         'dim_embedding': value = the dimension of feature vector (embedding)}
    """
    super(FactorizedEndogenousImpact, self).__init__(num_type, kernel)
    activation = parameter_set['activation']
    dim_embedding = parameter_set['dim_embedding']

    if activation is not None:
        self.endogenous_impact_type = "sum_m {}(u_(cm)^T * v_(c'm)) * kernel_m(t))".format(activation)
        self.activation = activation
    else:
        self.endogenous_impact_type = "sum_m (u_{cm}^T * v_{c'm}) * kernel_m(t)"
        self.activation = 'identity'

    self.decay_kernel = kernel
    self.num_base = self.decay_kernel.parameters.shape[1]
    self.num_type_u = num_type
    self.num_type_v = num_type
    self.dim_embedding = dim_embedding

    def uniform_weight(num_rows):
        # weights drawn uniformly from [0.01 / dim_embedding, 1 / dim_embedding]
        return nn.Parameter(
            torch.FloatTensor(num_rows, self.dim_embedding).uniform_(
                0.01 / self.dim_embedding, 1 / self.dim_embedding))

    embeddings_u = []
    embeddings_v = []
    for _ in range(self.num_base):
        # Both embeddings are created before either weight is replaced so the
        # sequence of random draws stays the same for a given seed.
        emb_u = nn.Embedding(self.num_type_u, self.dim_embedding)
        emb_v = nn.Embedding(self.num_type_v, self.dim_embedding)
        emb_u.weight = uniform_weight(self.num_type_u)
        emb_v.weight = uniform_weight(self.num_type_v)
        embeddings_u.append(emb_u)
        embeddings_v.append(emb_v)
    # The two lists are only defined when the kernel has at least one basis.
    if embeddings_u:
        self.basis_u = nn.ModuleList(embeddings_u)
        self.basis_v = nn.ModuleList(embeddings_v)

    if self.activation == 'identity':
        self.act = Identity()
    elif self.activation == 'relu':
        self.act = nn.ReLU()
    elif self.activation == 'softplus':
        self.act = nn.Softplus(beta=self.num_type**0.5)
    else:
        logger.warning('The actvation layer is {}, which can not be identified... '.format(self.activation))
        logger.warning('Identity activation is applied instead.')
        self.act = Identity()
def load_model(self, full_path, mode: str='entire'):
    """
    Load a pre-trained model into self.lambda_model
    :param full_path: the path of the saved file (passed directly to torch.load)
    :param mode: 'parameter' when the file holds only the parameters (state dict) of the model,
                 'entire' when the file holds the entire model.
                 Any other value is reported with a warning and handled as 'entire'.
    """
    # NOTE: torch.load unpickles the file content, only load files from a trusted source.
    if mode == 'parameter':
        self.lambda_model.load_state_dict(torch.load(full_path))
        return

    if mode != 'entire':
        logger.warning("'{}' is a undefined mode, we use 'entire' mode instead.".format(mode))
    self.lambda_model = torch.load(full_path)
def integrations(self,
                 t_stop: torch.Tensor,
                 t_start: Optional[torch.Tensor] = None) -> torch.Tensor:
    """
    Integrate the decay kernel over the interval [t_start, t_stop]
    :param t_stop: a 2D Tensor containing stop timestamps
    :param t_start: a 2D Tensor containing start timestamps.
        None means t_start = 0; a Tensor whose shape differs from t_stop
        is replaced by zeros after a warning.
    :return:
        a Tensor of shape (t_stop.size(0), t_stop.size(1), 1) holding
        G(t_stop) - G(t_start), where, with delay = self.parameters[0, 0]
        and bandwidth = self.parameters[1, 0],
        G(t) = t * (bandwidth - 1) / delay                               if t is not later than delay
        G(t) = 1 - (delay / t)^(bandwidth - 1) + (bandwidth - 1)         otherwise
    """
    if t_start is None:
        t_start = 0 * t_stop
    elif t_start.size() != t_stop.size():
        logger.warning(
            "The t_start does not have the same shape with t_stop, we set t_start to all zeros"
        )
        t_start = 0 * t_stop

    delay = self.parameters[0, 0]
    bandwidth = self.parameters[1, 0]

    def piecewise(t):
        # condition 1: timestamps up to the delay (later ones contribute 0)
        head = t.clone()
        head[t > delay] = 0
        part1 = head * (bandwidth - 1) / delay

        # condition 2: timestamps after the delay; earlier ones are set to
        # the delay first so the power is well defined, then zeroed
        tail = t.clone()
        tail[t <= delay] = delay
        part2 = 1 - (delay / tail)**(bandwidth - 1) + (bandwidth - 1)
        part2[t <= delay] = 0
        return part1 + part2

    lower = piecewise(t_start)
    upper = piecewise(t_stop)
    diff = upper - lower
    return diff.view(diff.size(0), diff.size(1), 1)
def save_model(self, full_path, mode: str = 'entire'):
    """
    Save self.lambda_model with torch.save
    :param full_path: the destination (passed directly to torch.save)
    :param mode: 'parameter' for saving only parameters (state dict) of the model,
                 'entire' for saving entire model.
                 Any other value is reported with a warning and handled as 'entire'.
    """
    if mode == 'parameter':
        torch.save(self.lambda_model.state_dict(), full_path)
        return

    if mode != 'entire':
        logger.warning(
            "'{}' is a undefined mode, we use 'entire' mode instead.".
            format(mode))
    torch.save(self.lambda_model, full_path)
def __init__(self, database, memorysize: int = None):
    """
    Build, for every sequence of the database, the history used as model input
    :param database: the observed event sequences
        database = {'event_features': None or (C, De) float array of event's static features,
                                      C is the number of event types.
                    'type2idx': a Dict = {'event_name': event_index}
                    'idx2type': a Dict = {event_index: 'event_name'}
                    'seq2idx': a Dict = {'seq_name': seq_index}
                    'idx2seq': a Dict = {seq_index: 'seq_name'}
                    'sequences': a List  = {seq_1, seq_2, ..., seq_N}.
                    }

        For the i-th sequence:
        seq_i = {'times': (N,) float array of timestamps, N is the number of events.
                 'events': (N,) int array of event types.
                 'seq_feature': None or (Ds,) float array of sequence's static feature.
                 't_start': a float number indicating the start timestamp of the sequence.
                 't_stop': a float number indicating the stop timestamp of the sequence.
                 'label': None or int number indicating the labels of the sequence}
    :param memorysize: how many historical events remembered by each event
        When memorysize = None
            All events in a sequence will be considered.
            In that case, each batch can only contain one sequence because different sequences may have
            different length.

        When memorysize = K
            Only the last K events of each sequence are kept.
            A sequence with fewer than K (but at least one) events is padded at its
            beginning: the padded event types are drawn at random from the event
            types of the database and the padded timestamps are 't_start'.

    After construction:
        self.event_cell[i] = (label, history of event types, i)
        self.time_cell[i] = (t_stop, history of timestamps)
    """
    self.event_cell = []
    self.time_cell = []
    self.database = database
    self.memory_size = memorysize
    if self.memory_size is None:
        logger.warning(
            "Because memory size is not given, the sampler can only sample 1 sequence per batch."
        )
        logger.warning("Please set batch size = 1 in your code.")

    for i, seq_i in enumerate(database['sequences']):
        times = seq_i['times']
        events = seq_i['events']
        t_start = seq_i['t_start']
        target = seq_i['label']
        target_t = seq_i['t_stop']

        if self.memory_size is None:
            former = events
            former_t = times
        else:
            # padding values: random event types occurring at t_start
            former = np.random.choice(len(self.database['type2idx']), memorysize)
            former_t = t_start * np.ones((memorysize, ))

            num_events = times.shape[0]
            if 0 < num_events < memorysize:
                # Write the real events into the LAST num_events slots only.
                # Assigning them to the whole window fails to broadcast for
                # 1 < num_events < memorysize and wipes the padding for a
                # single event.
                former[-num_events:] = events
                former_t[-num_events:] = times
            else:
                # NOTE(review): a sequence without any event also ends up
                # here and therefore gets empty histories instead of a
                # padded window - confirm this is intended.
                former = events[-memorysize:]
                former_t = times[-memorysize:]

        self.event_cell.append((target, former, i))
        self.time_cell.append((target_t, former_t))
    logger.info('In this dataset, the number of sequences = {}.'.format(
        len(self.event_cell)))
def _stitch_pair(target: Dict, seq_i: Dict, seq_j: Dict) -> None:
    """Write into target the sequence obtained by appending seq_j (time-shifted) to seq_i."""
    # concatenate two timestamp arrays with time shifting
    times2 = seq_j['times'] - seq_j['t_start'] + seq_i['t_stop']
    target['times'] = np.concatenate((seq_i['times'], times2), axis=0)
    # concatenate two event arrays
    target['events'] = np.concatenate((seq_i['events'], seq_j['events']),
                                      axis=0)
    # update stop timestamp
    target['t_stop'] = seq_i['t_stop'] + seq_j['t_stop'] - seq_j['t_start']
    # update features (average of the two, only when both are available)
    if seq_i['seq_feature'] is not None and seq_j['seq_feature'] is not None:
        target['seq_feature'] = (seq_i['seq_feature'] +
                                 seq_j['seq_feature']) / 2


def _stitch_weight(seq_i: Dict, seq_j: Dict) -> float:
    """Return the unnormalized weight of stitching seq_j to the end of seq_i."""
    # consider temporal order: seq_j must start after seq_i stops
    if not seq_j['t_start'] > seq_i['t_stop']:
        return 0.0
    weight = np.exp(-(seq_j['t_start'] - seq_i['t_stop'])**2)
    # consider feature similarity
    if seq_i['seq_feature'] is not None and seq_j['seq_feature'] is not None:
        weight *= np.exp(-np.linalg.norm(seq_i['seq_feature'] -
                                         seq_j['seq_feature'])**2)
    # consider label consistency; array_equal also handles labels stored as
    # arrays, for which `!=` yields an array whose truth value is ambiguous
    if seq_i['label'] is not None and seq_j['label'] is not None:
        if not np.array_equal(seq_i['label'], seq_j['label']):
            weight = 0.0
    return weight


def stitching(database1: Dict, database2: Dict, method: str = 'random') -> Dict:
    """
    Stitch each sequence in database2 to the end of one sequence of database1
    :param database1: the observed event sequences
    :param database2: another observed event sequences
        database = {'event_features': None or (De, C) float array of event's static features,
                                  C is the number of event types.
                    'type2idx': a Dict = {'event_name': event_index}
                    'idx2type': a Dict = {event_index: 'event_name'}
                    'seq2idx': a Dict = {'seq_name': seq_index}
                    'idx2seq': a Dict = {seq_index: 'seq_name'}
                    'sequences': a List  = {seq_1, seq_2, ..., seq_N}.
                    }

        For the i-th sequence:
        seq_i = {'times': (N,) float array of timestamps, N is the number of events.
                 'events': (N,) int array of event types.
                 'seq_feature': None or (Ds,) float array of sequence's static feature.
                 't_start': a float number indicating the start timestamp of the sequence.
                 't_stop': a float number indicating the stop timestamp of the sequence.
                 'label': None or int/float number (or array) indicating the labels of the sequence}
    :param method: a string indicates stitching method:
        "random" (or None): the sequences of database2 are randomly permuted and the
            i-th sequence of database1 receives the (i mod N2)-th permuted one,
            with time-shifting applied to the stitched sequence.
            This method is suitable for the sequences generated by a same stationary point process.
        "feature": for each seq_i in database1, a seq_j is sampled from database2
            according to a weight built from the Gaussian kernel of the time gap
            and of seq_features, and set to zero when the labels differ or when
            seq_j does not start after seq_i stops.
            When every weight is zero, seq_j is sampled uniformly.
        Any other value is reported with a warning and nothing is stitched.
    :return:
        a deep copy of database1 whose sequences are stitched. The copy is returned
        unchanged (after a warning) when the two databases do not share the same
        'type2idx', when the method is unknown, or when database2 has no sequence.
    """
    start = time.time()
    output = copy.deepcopy(database1)

    if database1['type2idx'] != database2['type2idx']:
        logger.warning('The two databases do not have the same event types... '
                       'The function returns the first database.')
        return output

    if method is not None and method not in ('random', 'feature'):
        logger.warning('You need to define your own stitching method... '
                       'The function returns the first database.')
        return output

    candidates = database2['sequences']
    num_candidates = len(candidates)
    if num_candidates == 0:
        # nothing to stitch; also avoids a modulo/sampling over zero sequences
        logger.warning('The second database does not contain any sequence... '
                       'The function returns the first database.')
        return output

    if method == 'feature':
        logger.info('feature-based stitching is applied...')
        for i, seq_i in enumerate(database1['sequences']):
            prob = np.zeros((num_candidates, ))
            for j, seq_j in enumerate(candidates):
                prob[j] = _stitch_weight(seq_i, seq_j)

            # sampling a sequence from database2
            if np.sum(prob) > 0:
                prob = prob / np.sum(prob)
            else:
                prob = np.ones((num_candidates, )) / num_candidates
            j = np.random.choice(num_candidates, p=prob)

            _stitch_pair(output['sequences'][i], seq_i, candidates[j])

            if i % 1000 == 0:
                logger.info(
                    '{} sequences have been stitched... Time={}ms.'.format(
                        i, round(1000 * (time.time() - start))))
    else:
        logger.info('random stitching is applied...')
        # random permutation of the index of sequences
        index = np.random.permutation(num_candidates)
        for i, seq_i in enumerate(database1['sequences']):
            seq_j = candidates[index[i % num_candidates]]

            _stitch_pair(output['sequences'][i], seq_i, seq_j)

            if i % 1000 == 0:
                logger.info(
                    '{} sequences have been stitched... Time={}ms.'.format(
                        i, round(1000 * (time.time() - start))))

    return output
def load_seq_labels_csv(file_name: str, seq_domain: str, domain_dict: Dict,
                        database: Dict):
    """
    load sequences' labels from a csv file
    :param file_name: the path and the name of the csv file
    :param seq_domain: the name of the key column corresponding to sequence index.
    :param domain_dict: a dictionary containing the name of the key column corresponding to the labels.
        The format should be
            domain_dict = {'domain_name': domain's feature type}
        The dictionary should only contain one key.
        If multiple keys are provided, only the first one is considered.

        Two types are considered:
        1) 'numerical': each element (row) in the corresponding domain should be a string containing D numbers
            separated by spaces, and D should be the same for various elements.
            D-dimensional real-value labels will be generated for this domain.
            If each sequence has multiple rows, the average of the labels will be recorded.
        2) 'categorical': each element (row) in the corresponding domain should be a string containing
            keywords separated by spaces.
            Each distinct keyword receives an integer index (in order of first appearance) and the
            label of a sequence is the sorted int array of the indices of its distinct keywords.
            If each sequence has multiple rows, the aggregation of the categories will be recorded.
    :param database: a dictionary of data generated by the function "load_sequences_csv()"
    :return: the same database, in which database['sequences'][seq_idx]['label'] is set for every
        sequence having at least one row in the csv file. Labels of other sequences are left untouched.
    """
    df = pd.read_csv(file_name)
    num_seq = len(database['seq2idx'])
    # initialize features
    keys = list(domain_dict.keys())
    label_domain = keys[0]
    if len(keys) > 1:
        logger.warning(
            "{} label domains are found. Only the first domain '{}' is used to generate labels."
            .format(len(keys), label_domain))
    label_type = domain_dict[label_domain]

    # numerical: features = (D, num_seq) sums, counts = (1, num_seq) row counts
    # categorical: features = {seq_idx: keywords}, counts = {keyword: index}
    features = {label_domain: None}
    counts = {}

    logger.info('Start to generate sequence labels...')
    start = time.time()
    for i, row in df.iterrows():
        seq_name = str(row[seq_domain])
        if seq_name not in database['seq2idx']:
            logger.warning(
                "'{}' is a new sequence not appearing in current database.".
                format(seq_name))
            logger.warning("It will be ignored in the process.")
        else:
            seq_idx = database['seq2idx'][seq_name]
            elements = str(row[label_domain])
            if label_type == 'numerical':
                elements = np.asarray(list(map(float, elements.split())))
                if features[label_domain] is None:
                    dim = elements.shape[0]
                    features[label_domain] = np.zeros((dim, num_seq))
                    counts[label_domain] = np.zeros((1, num_seq))
                features[label_domain][:, seq_idx] += elements
                counts[label_domain][0, seq_idx] += 1

            elif label_type == 'categorical':
                elements = elements.split()
                if features[label_domain] is None:
                    features[label_domain] = {}
                    counts[label_domain] = {}
                features[label_domain].setdefault(seq_idx, []).extend(elements)

                vocabulary = counts[label_domain]
                for element in elements:
                    if element not in vocabulary:
                        # next free index = current vocabulary size
                        vocabulary[element] = len(vocabulary)

            else:
                logger.warning(
                    'Undefined feature type for the domain {}.'.format(
                        label_domain))
                logger.warning("It will be ignored in the process.")

        if i % 1000 == 0:
            logger.info('{} rows have been processed... Time={}ms.'.format(
                i, round(1000 * (time.time() - start))))

    # post-process of features
    start = time.time()
    if label_type not in ('numerical', 'categorical'):
        logger.warning('Undefined label type for the domain {}.'.format(
            label_type))
        logger.warning("It will be ignored in the process.")
    elif features[label_domain] is None:
        # No row matched a known sequence: there is nothing to average or
        # aggregate, so the database is left as it is.
        logger.warning(
            "No row of the domain '{}' matches the sequences in current database."
            .format(label_domain))
    elif label_type == 'numerical':
        totals = features[label_domain]
        num_rows = counts[label_domain][0]
        for seq_idx in range(totals.shape[1]):
            # Sequences without any row are skipped: averaging them would
            # divide by zero and store NaN labels.
            if num_rows[seq_idx] > 0:
                database['sequences'][seq_idx]['label'] = (
                    totals[:, seq_idx] / num_rows[seq_idx])
    else:
        vocabulary = counts[label_domain]
        for seq_idx, elements in features[label_domain].items():
            # distinct keyword indices, sorted to make the label deterministic
            feature_tmp = sorted({vocabulary[element] for element in elements})
            # the builtin int replaces the np.int alias removed from NumPy
            database['sequences'][seq_idx]['label'] = np.asarray(feature_tmp,
                                                                 dtype=int)

    logger.info("Labels of domain '{}' is generated... Time={}ms.".format(
        label_domain, round(1000 * (time.time() - start))))

    return database
def load_event_features_csv(file_name: str,
                            event_domain: str,
                            domain_dict: Dict,
                            database: Dict,
                            normalize: int = 0):
    """
    load events' features from a csv file
    :param file_name: the path and the name of the csv file
    :param event_domain: the name of the key column corresponding to event index.
    :param domain_dict: a dictionary containing the names of the key columns corresponding to the features.
        The format should be
            domain_dict = {'domain_name': domain's feature type}
        Two types are considered:
        1) 'numerical': each element (row) in the corresponding domain should be a string containing D numbers
            separated by spaces, and D should be the same for various elements.
            D-dimensional real-value features will be generated for this domain.
            If each event type has multiple rows, the average of the features will be recorded.
            Event types without any row get all-zero features.
        2) 'categorical': each element (row) in the corresponding domain should be a string containing N keywords
            separated by spaces, but N can be different for various elements.
            D-dimensional features will be generated for this domain, where D is the number of
            distinguished keywords (vocabulary size) and each entry counts how many times the
            keyword appears in the rows of the event type.
    :param database: a dictionary of data generated by the function "load_sequences_csv()"
    :param normalize: 0 = no normalization, 1 = normalization across features, 2 = normalization across event types
    :return: the same database, in which database['event_features'] is the (D_total, num_event)
        array stacking the features of all domains in the order of domain_dict,
        or None when no feature could be generated.
    """
    df = pd.read_csv(file_name)
    num_event = len(database['type2idx'])

    # initialize features
    # numerical: features = (D, num_event) sums, counts = (1, num_event) row counts
    # categorical: features = {event_idx: keywords}, counts = {keyword: index}
    features = {key: None for key in domain_dict}
    counts = {key: None for key in domain_dict}

    logger.info('Start to generate sequence features...')
    start = time.time()
    for i, row in df.iterrows():
        event_name = str(row[event_domain])
        if event_name not in database['type2idx']:
            logger.warning(
                "'{}' is a new event type not appearing in current database.".
                format(event_name))
            logger.warning("It will be ignored in the process.")
        else:
            event_idx = database['type2idx'][event_name]
            for key, feature_type in domain_dict.items():
                elements = str(row[key])
                if feature_type == 'numerical':
                    elements = np.asarray(list(map(float, elements.split())))
                    if features[key] is None:
                        dim = elements.shape[0]
                        features[key] = np.zeros((dim, num_event))
                        counts[key] = np.zeros((1, num_event))
                    features[key][:, event_idx] += elements
                    counts[key][0, event_idx] += 1

                elif feature_type == 'categorical':
                    elements = elements.split()
                    if features[key] is None:
                        features[key] = {}
                        counts[key] = {}
                    features[key].setdefault(event_idx, []).extend(elements)

                    # The next keyword index is the size of THIS domain's
                    # vocabulary. A single counter shared by all domains
                    # gives wrong (colliding or out-of-range) indices as soon
                    # as two categorical domains are loaded together.
                    vocabulary = counts[key]
                    for element in elements:
                        if element not in vocabulary:
                            vocabulary[element] = len(vocabulary)

                else:
                    logger.warning(
                        'Undefined feature type for the domain {}.'.format(
                            key))
                    logger.warning("It will be ignored in the process.")

        if i % 1000 == 0:
            logger.info('{} rows have been processed... Time={}ms'.format(
                i, round(1000 * (time.time() - start))))

    # post-process of features
    features_all = None
    start = time.time()
    for key, feature_type in domain_dict.items():
        features_tmp = None
        if feature_type not in ('numerical', 'categorical'):
            logger.warning(
                'Undefined feature type for the domain {}.'.format(key))
            logger.warning("It will be ignored in the process.")
        elif features[key] is None:
            # no row matched a known event type for this domain
            logger.warning(
                "No row of the domain '{}' matches the event types in current database."
                .format(key))
            logger.warning("It will be ignored in the process.")
        elif feature_type == 'numerical':
            # Average over the rows of each event type. Event types without
            # rows are divided by 1, which keeps their features at zero
            # instead of producing NaN, and does not distort the average of
            # event type 0 when it does have rows.
            features_tmp = features[key] / np.maximum(counts[key], 1)
        else:
            features_tmp = np.zeros((len(counts[key]), num_event))
            for event_idx, elements in features[key].items():
                for element in elements:
                    features_tmp[counts[key][element], event_idx] += 1

        if features_tmp is not None:
            if features_all is None:
                features_all = features_tmp
            else:
                features_all = np.concatenate((features_all, features_tmp),
                                              axis=0)

        logger.info(
            "features of domain '{}' is generated... Time={}ms.".format(
                key, round(1000 * (time.time() - start))))

    if features_all is not None:
        if normalize == 1:
            # each event type (column) is divided by the sum of its features
            features_all = features_all / \
                (np.sum(features_all, axis=0, keepdims=True) + 1e-8)
        elif normalize == 2:
            # each feature (row) is divided by its sum over event types
            features_all = features_all / \
                (np.sum(features_all, axis=1, keepdims=True) + 1e-8)

    database['event_features'] = features_all
    return database