Example 1
    def mining_invariants(self, para):
        # `data_loader` and `mi` (the invariant-mining module) come from the
        # surrounding project and are assumed to be imported elsewhere.
        para['path'] = self.input_dir
        print(para)
        # Not every branch below sets every output; pre-initialize the
        # optional ones so the shared return statement is safe for all
        # data types.
        event_mapping_data = event_count_matrix = anomalies = None
        if self.data_type == 'time_based':
            para['log_file_name'] = self.log_seq
            raw_data, event_mapping_data = data_loader.deepia_data_loader(para)
            event_count_matrix = data_loader.deepia_preprocess_data(
                para, raw_data, event_mapping_data)
            r = mi.estimate_invar_spce(para, event_count_matrix)
            invar_dict = mi.invariant_search(para, event_count_matrix, r)
            log_template_path = self.input_dir + self.log_seq.split(
                '.log')[0] + '.log_templates.csv'
            structured_log_path = self.input_dir + self.log_seq.split(
                '.log')[0] + '.log_structured.csv'
            window_split_file_path = para['save_path'] + 'sliding_' + str(
                para['window_size']) + 'h_' + str(para['step_size']) + 'h.csv'
            predictions, anomalies = mi.deepia_evaluate(
                event_count_matrix, invar_dict, log_template_path,
                structured_log_path, window_split_file_path)
        elif self.data_type == 'time_based_bgl':
            para['log_file_name'] = self.log_seq
            raw_data, event_mapping_data = data_loader.bgl_data_loader(para)
            event_count_matrix, labels = data_loader.bgl_preprocess_data(
                para, raw_data, event_mapping_data)
            r = mi.estimate_invar_spce(para, event_count_matrix)
            invar_dict = mi.invariant_search(para, event_count_matrix, r)
            predictions = mi.evaluate(event_count_matrix, invar_dict, labels)
        elif self.data_type == 'event_based':
            para['log_seq_file_name'] = self.log_seq
            raw_data, label_data = data_loader.hdfs_data_loader(para)
            r = mi.estimate_invar_spce(para, raw_data)
            invar_dict = mi.invariant_search(para, raw_data, r)
            predictions = mi.evaluate(raw_data, invar_dict, label_data)
        else:
            raise ValueError('unknown data_type: {!r}'.format(self.data_type))

        return raw_data, event_mapping_data, event_count_matrix, r, invar_dict, predictions, anomalies
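Example 1 wires together loading, invariant search, and evaluation. The underlying idea of the invariant-mining approach these functions implement is that linearly related event counts (e.g. every "file opened" is eventually matched by a "file closed") yield invariant vectors r with r · x = 0 for every normal window x; a window violating a mined invariant is flagged as anomalous. Below is a self-contained sketch of that check, illustrative only and not the project's mi module:

import numpy as np

# Toy event-count matrix: rows are time windows, columns are event types
# (say, [open, close]). Normal behavior keeps the two counts equal.
event_count_matrix = np.array([
    [3, 3],   # normal window
    [5, 5],   # normal window
    [4, 1],   # anomalous window: opens without matching closes
])

# A mined invariant vector: open_count - close_count == 0 on normal data.
invariant = np.array([1, -1])

# A window is anomalous if it violates the invariant (non-zero product).
violations = event_count_matrix @ invariant
predictions = violations != 0
print(predictions)  # [False False  True]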
Example 2
# Assumed imports: `data_loader` is referenced directly below; `cl` is assumed
# to be the project's classifiers module (the snippet omits its imports).
import data_loader
import classifiers as cl

para = {
    'path': '../../Data/BGL_data/',  # directory for input data
    'log_file_name': 'BGL_MERGED.log',  # filename of the raw log file
    'log_event_mapping': 'logTemplateMap.csv',  # log-event mapping: a list of event indices, one row per log line
    'save_path': '../time_windows/',  # dir for saving sliding-window data files to avoid re-splitting
    'select_column': [0, 4],  # columns (label and time) to select from the raw log file
    'window_size': 3,  # time window size (unit: hours)
    'step_size': 1,  # step size (unit: hours)
    'training_percent': 0.8,  # training data percentage
    'tf-idf': True,  # whether to use tf-idf weighting
    'models': 'DT',  # select from ['DT', 'LR', 'SVM']
    'cross_validate': False  # True enables 10-fold CV to reduce overfitting; must be False to predict anomalies (default: False)
}

if __name__ == '__main__':
    model = para['models']
    assert model in ['DT', 'LR', 'SVM']
    raw_data, event_mapping_data = data_loader.bgl_data_loader(para)
    event_count_matrix, labels = data_loader.bgl_preprocess_data(
        para, raw_data, event_mapping_data)
    train_data, train_labels, testing_data, testing_labels = cl.data_split(
        para, event_count_matrix, labels)
    # Select one of the three provided models
    if model == 'DT':
        cl.decision_tree(para, train_data, train_labels, testing_data,
                         testing_labels)
    elif model == 'LR':
        cl.logsitic_regression(para, train_data, train_labels, testing_data,
                               testing_labels)
    elif model == 'SVM':
        cl.SVM(para, train_data, train_labels, testing_data, testing_labels)
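The implementation of cl.data_split is not shown in the example; a plausible chronological split by training_percent might look like the sketch below. This is an assumption for illustration, not the library's actual code:

import numpy as np

def data_split_sketch(para, event_count_matrix, labels):
    # Illustrative stand-in for cl.data_split: keep the first
    # `training_percent` share of windows (in time order) for training
    # and the remainder for testing.
    event_count_matrix = np.asarray(event_count_matrix)
    labels = np.asarray(labels)
    cut = int(len(labels) * para['training_percent'])
    return (event_count_matrix[:cut], labels[:cut],
            event_count_matrix[cut:], labels[cut:])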
Example 3
# Assumed imports, as in Example 2 (the snippet omits them):
import data_loader
import classifiers as cl

para = {
    'path': '../../Data/BGL_data/',  # directory for input data
    'log_file_name': 'BGL_MERGED.log',  # filename of the raw log file
    'log_event_mapping': 'logTemplateMap.csv',  # log-event mapping: a list of event indices, one row per log line
    'save_path': '../time_windows/',  # dir for saving sliding-window data files to avoid re-splitting
    'select_column': [0, 4],  # columns (label and time) to select from the raw log file
    'window_size': 3,  # time window size (unit: hours)
    'step_size': 1,  # step size (unit: hours)
    'training_percent': 0.8,  # training data percentage
    'tf-idf': True,  # whether to use tf-idf weighting
    'models': 'DT',  # select from ['DT', 'LR', 'SVM']
    'cross_validate': False  # True enables 10-fold CV to reduce overfitting; must be False to predict anomalies (default: False)
}


if __name__ == '__main__':
    model = para['models']
    assert model in ['DT', 'LR', 'SVM']
    raw_data, event_mapping_data = data_loader.bgl_data_loader(para)
    event_count_matrix, labels = data_loader.bgl_preprocess_data(para, raw_data, event_mapping_data)
    train_data, train_labels, testing_data, testing_labels = cl.data_split(para, event_count_matrix, labels)
    # Select one of the three provided models
    if model == 'DT':
        cl.decision_tree(para, train_data, train_labels, testing_data, testing_labels)
    elif model == 'LR':
        cl.logsitic_regression(para, train_data, train_labels, testing_data, testing_labels)
    elif model == 'SVM':
        cl.SVM(para, train_data, train_labels, testing_data, testing_labels)
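The window_size/step_size pair defines overlapping windows: with a 3-hour window sliding by 1 hour, consecutive windows share 2 hours of logs. The small helper below (illustrative only, not part of the project) shows the resulting window count:

def count_windows(span_hours, window_size, step_size):
    # Number of fixed-size sliding windows covering a log span of
    # `span_hours` hours, using the para settings above.
    if span_hours < window_size:
        return 0
    return int((span_hours - window_size) // step_size) + 1

# With window_size=3 and step_size=1, a 24-hour log yields
# (24 - 3) // 1 + 1 = 22 overlapping windows.
print(count_windows(24, 3, 1))  # 22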