def csv_extracting(log_file, col_header_file, label_file, dataset='HDFS'):
    '''
    Run the first extraction step by handing the raw log to the loader
    that matches ``dataset``.

    :param log_file: txt/log; the name must end with ``.txt`` or ``.log``
    :param col_header_file: txt, forwarded unchanged to the loader
    :param label_file: csv, only forwarded when ``dataset`` is ``'HDFS'``
    :param dataset: ``'HDFS'`` selects ``dataloader.load_HDFS``; any other
        value selects ``_load_openstack``
    :return: None
    :raises AssertionError: if ``log_file`` has an unsupported extension
    '''
    # Raise explicitly rather than relying on an ``assert`` statement:
    # asserts are removed when Python runs with -O, which would silently
    # disable this check. AssertionError is kept so that callers observe
    # the same exception type as before.
    if not log_file.endswith(('.txt', '.log')):
        raise AssertionError(
            "log_file must end with '.txt' or '.log', got %r" % (log_file,))

    # Both branches announce the same step, so print it once up front.
    print("== STEP 1 ==")
    if dataset == 'HDFS':
        dataloader.load_HDFS(log_file, col_header_file, label_file)
    else:
        _load_openstack(log_file, col_header_file)
model.evaluate(): evaluate model accuracy with labeled data ''' import sys sys.path.append('../') from loglizer.models import InvariantsMiner from loglizer import dataloader, preprocessing struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file label_file = '../data/HDFS/anomaly_label.csv' # The anomaly label file epsilon = 0.5 # threshold for estimating invariant space if __name__ == '__main__': # Load structured log without label info train_test_tuple = dataloader.load_HDFS(struct_log, window='session', train_ratio=0.5, split_type='sequential') (x_train, _), (x_test, _) = train_test_tuple[0], train_test_tuple[1] # Feature extraction feature_extractor = preprocessing.FeatureExtractor() x_train = feature_extractor.fit_transform(x_train) # Model initialization and training model = InvariantsMiner(epsilon=epsilon) model.fit(x_train) # Predict anomalies on the training set offline, and manually check for correctness y_train = model.predict(x_train) # Predict anomalies on the test set to simulate the online mode # x_test may be loaded from another log file
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
sys.path.append('../')
from loglizer.models import PCA
from loglizer import dataloader, preprocessing

# Structured log file consumed by this demo.
struct_log = '../data/HDFS/HDFS_100k.log_structured.csv'

if __name__ == '__main__':
    ## 1. Load the structured log file and extract feature vectors
    # save_csv=True asks the loader to also save the raw event sequence file.
    load_options = dict(window='session', save_csv=True)
    (x_train, _), (_, _) = dataloader.load_HDFS(struct_log, **load_options)

    feature_extractor = preprocessing.FeatureExtractor()
    x_train = feature_extractor.fit_transform(
        x_train,
        term_weighting='tf-idf',
        normalization='zero-mean',
    )

    ## 2. Train an unsupervised model
    print('Train phase:')
    # PCA is used here; other unsupervised models (LogClustering,
    # InvariantsMiner) could be initialized instead. Hyper-parameters may be
    # sensitive to the log data, so the defaults are only meant for the demo.
    model = PCA()
    model.fit(x_train)

    # Predictions are meant to be checked manually; details may require
    # going back to the raw logs.
    y_train = model.predict(x_train)

    ## 3. Use the trained model for online anomaly detection
(precision + recall))) if __name__ == '__main__': dataset = FLAGS.dataset print('########### Start CNN on Dataset ' + dataset + ' ###########') config.init('CNN_' + dataset) train_dir = config.path + FLAGS.train_dir if dataset == 'HDFS': data_instances = config.HDFS_data (x_train, y_train), (x_test, y_test), (x_validate, y_validate) = dataloader.load_HDFS( data_instances, train_ratio=0.3, is_data_instance=True, test_ratio=0.6, CNN_option=True) elif dataset == 'BGL': data_instances = config.BGL_data (x_train, y_train), (x_test, y_test), (x_validate, y_validate) = load_BGL(data_instances, 0.35, 0.6) cnn_preprocessor = preprocessing.CNNPreprocessor(FLAGS.log_len, x_train, x_test, x_validate) sym_count = len(cnn_preprocessor.syms) - 1 print('Total symbols: %d' % sym_count)
import sys
sys.path.append('../')
from loglizer.models import InvariantsMiner
from loglizer import dataloader, preprocessing

# From https://stackoverflow.com/questions/132058/showing-the-stack-trace-from-a-running-python-application#133384
# NOTE(review): the code this attribution refers to is not visible in this
# part of the file.

struct_log = '../data/August.csv'  # structured log file
label_file = None                  # anomaly label file (none for this data)
epsilon = 0.5                      # threshold for estimating invariant space

if __name__ == '__main__':
    # Load the structured log without label info.
    load_options = dict(
        window='session',
        train_ratio=1.0,
        split_type='sequential',
    )
    (x_train, _), (x_test, _) = dataloader.load_HDFS(struct_log, **load_options)

    # Feature extraction: fit on the training split.
    feature_extractor = preprocessing.FeatureExtractor()
    x_train = feature_extractor.fit_transform(x_train)

    # Model initialization and training.
    model = InvariantsMiner(epsilon=epsilon)
    model.fit(x_train)

    # Offline pass over the training set; results are meant to be checked
    # manually for correctness.
    y_train = model.predict(x_train)

    # Prepare the test split to simulate the online mode
    # (x_test may be loaded from another log file).
    x_test = feature_extractor.transform(x_test)
# -*- coding: utf-8 -*-

import sys
sys.path.append('../')
from loglizer.models import PCA
from loglizer import dataloader, preprocessing

struct_log = '../data/HDFS/HDFS_100k.log_structured.csv'  # structured log file
label_file = '../data/HDFS/anomaly_label.csv'             # anomaly label file

if __name__ == '__main__':
    # Load the labelled data as a train split and a test split.
    load_options = dict(
        label_file=label_file,
        window='session',
        train_ratio=0.5,
        split_type='uniform',
    )
    (x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(
        struct_log, **load_options)

    # Fit the feature extractor on the training split, then apply the fitted
    # extractor to the test split.
    feature_extractor = preprocessing.FeatureExtractor()
    x_train = feature_extractor.fit_transform(
        x_train,
        term_weighting='tf-idf',
        normalization='zero-mean',
    )
    x_test = feature_extractor.transform(x_test)

    # PCA is fitted on the features only; labels are used for evaluation.
    model = PCA()
    model.fit(x_train)

    print('Train validation:')
    precision, recall, f1 = model.evaluate(x_train, y_train)

    print('Test validation:')
model.evaluate(): evaluate model accuracy with labeled data ''' import sys sys.path.append('../') from loglizer.models import PCA from loglizer import dataloader, preprocessing struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file if __name__ == '__main__': ## 1. Load strutured log file and extract feature vectors # Save the raw event sequence file by setting save_csv=True (x_train, _), (_, _) = dataloader.load_HDFS(struct_log, window='session', split_type='sequential', save_csv=True) feature_extractor = preprocessing.FeatureExtractor() x_train = feature_extractor.fit_transform(x_train, term_weighting='tf-idf', normalization='zero-mean') ## 2. Train an unsupervised model print('Train phase:') # Initialize PCA, or other unsupervised models, LogClustering, InvariantsMiner model = PCA() # Model hyper-parameters may be sensitive to log data, here we use the default for demo model.fit(x_train) # Make predictions and manually check for correctness. Details may need to go into the raw logs y_train = model.predict(x_train)
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
sys.path.append('../')
from loglizer.models import LR
from loglizer import dataloader, preprocessing

struct_log = '../data/HDFS/HDFS_100k.log_structured.csv'  # structured log file
label_file = '../data/HDFS/anomaly_label.csv'             # anomaly label file

if __name__ == '__main__':
    # Load the labelled data as a train split and a test split.
    load_options = dict(
        label_file=label_file,
        window='session',
        train_ratio=0.5,
    )
    (x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(
        struct_log, **load_options)

    # Fit the feature extractor on the training split, then apply the fitted
    # extractor to the test split.
    feature_extractor = preprocessing.FeatureExtractor()
    x_train = feature_extractor.fit_transform(x_train, term_weighting='tf-idf')
    x_test = feature_extractor.transform(x_test)

    # LR is fitted with both features and labels.
    model = LR()
    model.fit(x_train, y_train)

    # Evaluate on the training split first, then on the test split; after
    # the loop precision/recall/f1 hold the test-split values.
    evaluation_runs = (
        ('Train validation:', x_train, y_train),
        ('Test validation:', x_test, y_test),
    )
    for heading, features, labels in evaluation_runs:
        print(heading)
        precision, recall, f1 = model.evaluate(features, labels)
# -*- coding: utf-8 -*- import sys sys.path.append('../') import pandas as pd from loglizer.models import * from loglizer import dataloader, preprocessing run_models = ['PCA', 'InvariantsMiner', 'LogClustering', 'IsolationForest', 'LR', 'SVM', 'DecisionTree'] struct_log = '../data/HDFS/HDFS.npz' # The benchmark dataset if __name__ == '__main__': (x_tr, y_train), (x_te, y_test) = dataloader.load_HDFS(struct_log, window='session', train_ratio=0.5, split_type='uniform') benchmark_results = [] for _model in run_models: print('Evaluating {} on HDFS:'.format(_model)) if _model == 'PCA': feature_extractor = preprocessing.FeatureExtractor() x_train = feature_extractor.fit_transform(x_tr, term_weighting='tf-idf', normalization='zero-mean') model = PCA() model.fit(x_train) elif _model == 'InvariantsMiner': feature_extractor = preprocessing.FeatureExtractor() x_train = feature_extractor.fit_transform(x_tr) model = InvariantsMiner(epsilon=0.5)
struct_log = '../data/HDFS/HDFS_100k.log_structured.csv'  # structured log file
label_file = '../data/HDFS/anomaly_label.csv'             # anomaly label file

if __name__ == '__main__':
    # Event identifiers and one player index per identifier (0..13).
    Eventname = [
        'E5', 'E22', 'E11', 'E9', 'E26', 'E3', 'E4',
        'E2', 'E23', 'E21', 'E20', 'E25', 'E18', 'E6',
    ]
    playerlist = list(range(14))
    # An earlier, disabled variant of playerlist covered the indices 0..51.

    # Placeholder value; replaced right away by the list obtained from x_ai.
    coalition = []
    coalition = x_ai.getcoaltionlist()
    cvalue = []
    print("coalition number:" ,len(coalition))

    # Load the labelled data as a train split and a test split, each made of
    # three parts.
    load_options = dict(
        label_file=label_file,
        window='session',
        window_size=window_size,
        train_ratio=train_ratio,
        split_type='uniform',
    )
    (x_train, window_y_train, y_train), (x_test, window_y_test, y_test) = \
        dataloader.load_HDFS(struct_log, **load_options)

    # Fit the vectorizer on the training split and reuse it for the test split.
    feature_extractor = Vectorizer()
    train_dataset = feature_extractor.fit_transform(
        x_train, window_y_train, y_train)
    test_dataset = feature_extractor.transform(
        x_test, window_y_test, y_test)

    # Only the training iterator shuffles its data.
    train_loader = Iterator(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
    ).iter
    test_loader = Iterator(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
    ).iter

    model = DeepLog(
        num_labels=feature_extractor.num_labels,
        hidden_size=hidden_size,
        num_directions=num_directions,
        topk=topk,
        device=device,
    )
    model.fit(train_loader, epoches)

    print('Train validation:')
    metrics = model.evaluate(train_loader)
    print(metrics)
collector = None if dataset == 'BGL': data_instances = config.BGL_data (x_train, y_train), (x_test, y_test), (x_validate, y_validate) = load_BGL(data_instances, 0.35, 0.6) if dataset == 'HDFS': data_instances = config.HDFS_data (x_train, y_train), (x_test, y_test), (x_validate, y_validate) = dataloader.load_HDFS( data_instances, train_ratio=0.35, is_data_instance=True, test_ratio=0.6) result_folder = config.path + FLAGS.result_folder collector = Collector(result_folder, (1, 1, 1, 1), False, config.HDFS_col_header, 100) assert FLAGS.h < FLAGS.plb lstm_preprocessor = preprocessing.LSTMPreprocessor(x_train, x_test, x_validate) sym_count = len(lstm_preprocessor.vectors) - 1 print('Total symbols: %d' % sym_count) print(lstm_preprocessor.syms) # pad x_train x_train = [
model.predict(): predict anomalies on given data model.evaluate(): evaluate model accuracy with labeled data ''' import sys sys.path.append('../') from loglizer.models import PCA from loglizer import dataloader, preprocessing struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file if __name__ == '__main__': ## 1. Load strutured log file and extract feature vectors # Save the raw event sequence file by setting save_csv=True train_test_tuple = dataloader.load_HDFS(struct_log, window='session', split_type='sequential', save_csv=True) (x_train, y_train), (x_test, y_test) = train_test_tuple[0], train_test_tuple[1] feature_extractor = preprocessing.FeatureExtractor() x_train = feature_extractor.fit_transform(x_train, term_weighting='tf-idf', normalization='zero-mean') ## 2. Train an unsupervised model print('Train phase:') # Initialize PCA, or other unsupervised models, LogClustering, InvariantsMiner model = PCA() # Model hyper-parameters may be sensitive to log data, here we use the default for demo model.fit(x_train) # Make predictions and manually check for correctness. Details may need to go into the raw logs
model.evaluate(): evaluate model accuracy with labeled data ''' import sys sys.path.append('../') from loglizer.models import InvariantsMiner from loglizer import dataloader, preprocessing struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file label_file = '../data/HDFS/anomaly_label.csv' # The anomaly label file epsilon = 0.5 # threshold for estimating invariant space if __name__ == '__main__': # Load structured log without label info (x_train, _), (x_test, _) = dataloader.load_HDFS(struct_log, window='session', train_ratio=0.5, split_type='sequential') # Feature extraction feature_extractor = preprocessing.FeatureExtractor() x_train = feature_extractor.fit_transform(x_train) # Model initialization and training model = InvariantsMiner(epsilon=epsilon) model.fit(x_train) # Predict anomalies on the training set offline, and manually check for correctness y_train = model.predict(x_train) # Predict anomalies on the test set to simulate the online mode # x_test may be loaded from another log file x_test = feature_extractor.transform(x_test)
#!/usr/bin/env python # -*- coding: utf-8 -*- import sys sys.path.append('../') import pandas as pd from loglizer.models import * from loglizer import dataloader, preprocessing run_models = [ 'LR', 'SVM', 'DecisionTree', 'PCA', 'InvariantsMiner', 'LogClustering' ] struct_log = '../data/HDFS/HDFS.npz' # The benchmark dataset if __name__ == '__main__': (x_tr, y_train), (x_te, y_test) = dataloader.load_HDFS(struct_log, train_ratio=0.5) benchmark_results = [] for _model in run_models: print('Evaluating {} on HDFS:'.format(_model)) if _model == 'PCA': feature_extractor = preprocessing.FeatureExtractor() x_train = feature_extractor.fit_transform( x_tr, term_weighting='tf-idf', normalization='zero-mean') model = PCA() model.fit(x_train) elif _model == 'InvariantsMiner': feature_extractor = preprocessing.FeatureExtractor() x_train = feature_extractor.fit_transform(x_tr) model = InvariantsMiner(epsilon=0.5) model.fit(x_train)