Beispiel #1
0
def csv_extracting(log_file, col_header_file, label_file, dataset='HDFS'):
    '''
    Run step 1 of the extraction: hand the raw log to the dataset loader.

    :param log_file: txt/log; the name must end with ``.txt`` or ``.log``
    :param col_header_file: txt
    :param label_file: csv; only forwarded when ``dataset == 'HDFS'``
    :param dataset: ``'HDFS'`` selects ``dataloader.load_HDFS``; any other
        value is routed to ``_load_openstack``
    :return: None
    :raises AssertionError: if ``log_file`` has an unsupported extension
    '''
    # Raise explicitly rather than via ``assert`` so the check is not stripped
    # under ``python -O``. The exception type stays AssertionError so existing
    # callers see the same failure, now with a message naming the file.
    if not log_file.endswith(('.txt', '.log')):
        raise AssertionError(
            'log_file must end with .txt or .log, got %r' % (log_file,))

    print("== STEP 1 ==")
    if dataset == 'HDFS':
        dataloader.load_HDFS(log_file, col_header_file, label_file)
    else:
        _load_openstack(log_file, col_header_file)
Beispiel #2
0
        model.evaluate(): evaluate model accuracy with labeled data
'''

import sys
sys.path.append('../')
from loglizer.models import InvariantsMiner
from loglizer import dataloader, preprocessing

struct_log = '../data/HDFS/HDFS_100k.log_structured.csv'  # The structured log file
# NOTE(review): label_file is defined but not passed to load_HDFS in the
# visible part of this script, so no labels are loaded here.
label_file = '../data/HDFS/anomaly_label.csv'  # The anomaly label file
epsilon = 0.5  # threshold for estimating invariant space

if __name__ == '__main__':
    # Load structured log without label info
    train_test_tuple = dataloader.load_HDFS(struct_log,
                                            window='session',
                                            train_ratio=0.5,
                                            split_type='sequential')
    # Only the first two entries of the result are used; the second element
    # of each pair is discarded.
    (x_train, _), (x_test, _) = train_test_tuple[0], train_test_tuple[1]
    # Feature extraction
    feature_extractor = preprocessing.FeatureExtractor()
    x_train = feature_extractor.fit_transform(x_train)

    # Model initialization and training
    model = InvariantsMiner(epsilon=epsilon)
    model.fit(x_train)

    # Predict anomalies on the training set offline, and manually check for correctness
    # (y_train holds the model's predictions, not ground-truth labels)
    y_train = model.predict(x_train)

    # Predict anomalies on the test set to simulate the online mode
    # x_test may be loaded from another log file
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
sys.path.append('../')
from loglizer.models import PCA
from loglizer import dataloader, preprocessing

struct_log = '../data/HDFS/HDFS_100k.log_structured.csv'  # The structured log file

if __name__ == '__main__':
    ## 1. Load structured log file and extract feature vectors
    # Save the raw event sequence file by setting save_csv=True
    # Only the first element of the first returned pair is kept; the other
    # three values are discarded.
    (x_train, _), (_, _) = dataloader.load_HDFS(struct_log,
                                                window='session',
                                                save_csv=True)
    feature_extractor = preprocessing.FeatureExtractor()
    # The extractor is fitted on x_train and x_train is replaced by the
    # transformed result.
    x_train = feature_extractor.fit_transform(x_train,
                                              term_weighting='tf-idf',
                                              normalization='zero-mean')

    ## 2. Train an unsupervised model
    print('Train phase:')
    # Initialize PCA, or other unsupervised models, LogClustering, InvariantsMiner
    model = PCA()
    # Model hyper-parameters may be sensitive to log data, here we use the default for demo
    model.fit(x_train)
    # Make predictions and manually check for correctness. Details may need to go into the raw logs
    # (y_train holds the model's predictions, not ground-truth labels)
    y_train = model.predict(x_train)

    ## 3. Use the trained model for online anomaly detection
Beispiel #4
0
                                  (precision + recall)))


if __name__ == '__main__':
    # Entry point: prepare a CNN run for the dataset selected via FLAGS.
    dataset = FLAGS.dataset
    print('########### Start CNN on Dataset ' + dataset + ' ###########')
    config.init('CNN_' + dataset)
    train_dir = config.path + FLAGS.train_dir

    # Load the train / test / validation pairs for the chosen dataset.
    if dataset == 'HDFS':
        data_instances = config.HDFS_data
        (x_train, y_train), (x_test, y_test), (x_validate, y_validate) = \
            dataloader.load_HDFS(data_instances,
                                 train_ratio=0.3,
                                 is_data_instance=True,
                                 test_ratio=0.6,
                                 CNN_option=True)
    elif dataset == 'BGL':
        data_instances = config.BGL_data
        (x_train, y_train), (x_test, y_test), (x_validate, y_validate) = \
            load_BGL(data_instances, 0.35, 0.6)
    else:
        # Fail fast with a clear message. Without this branch an unknown
        # dataset fell through and crashed below with a NameError on x_train.
        raise ValueError('Unsupported dataset: %r (expected \'HDFS\' or '
                         '\'BGL\')' % (dataset,))

    cnn_preprocessor = preprocessing.CNNPreprocessor(FLAGS.log_len, x_train,
                                                     x_test, x_validate)
    # NOTE(review): one entry of syms is excluded from the count, presumably
    # a padding/placeholder symbol -- confirm against CNNPreprocessor.
    sym_count = len(cnn_preprocessor.syms) - 1
    print('Total symbols: %d' % sym_count)
Beispiel #5
0
import sys

sys.path.append('../')
from loglizer.models import InvariantsMiner
from loglizer import dataloader, preprocessing

# NOTE(review): the Stack Overflow link below does not relate to any code
# visible in this snippet; it looks like a leftover -- confirm before removing.
# From https://stackoverflow.com/questions/132058/showing-the-stack-trace-from-a-running-python-application#133384
struct_log = '../data/August.csv'  # The structured log file
label_file = None  # The anomaly label file (none; not used in the visible code)
epsilon = 0.5  # threshold for estimating invariant space

if __name__ == '__main__':
    # Load structured log without label info
    # NOTE(review): train_ratio=1.0 presumably leaves the test split empty,
    # yet x_test is still transformed below -- confirm this is intended.
    (x_train, _), (x_test, _) = dataloader.load_HDFS(struct_log,
                                                     window='session',
                                                     train_ratio=1.0,
                                                     split_type='sequential')
    # Feature extraction
    feature_extractor = preprocessing.FeatureExtractor()
    x_train = feature_extractor.fit_transform(x_train)

    # Model initialization and training
    model = InvariantsMiner(epsilon=epsilon)
    model.fit(x_train)

    # Predict anomalies on the training set offline, and manually check for correctness
    # (y_train holds the model's predictions, not ground-truth labels)
    y_train = model.predict(x_train)

    # Predict anomalies on the test set to simulate the online mode
    # x_test may be loaded from another log file
    x_test = feature_extractor.transform(x_test)
Beispiel #6
0
# -*- coding: utf-8 -*-

import sys

sys.path.append('../')
from loglizer.models import PCA
from loglizer import dataloader, preprocessing

struct_log = '../data/HDFS/HDFS_100k.log_structured.csv'  # structured log input
label_file = '../data/HDFS/anomaly_label.csv'  # anomaly labels input

if __name__ == '__main__':
    # Load the labelled data; the loader returns a train pair and a test pair.
    train_split, test_split = dataloader.load_HDFS(
        struct_log,
        label_file=label_file,
        window='session',
        train_ratio=0.5,
        split_type='uniform',
    )
    x_train, y_train = train_split
    x_test, y_test = test_split

    # Fit the extractor on the training data, then reuse it for the test data.
    feature_extractor = preprocessing.FeatureExtractor()
    fit_options = {'term_weighting': 'tf-idf', 'normalization': 'zero-mean'}
    x_train = feature_extractor.fit_transform(x_train, **fit_options)
    x_test = feature_extractor.transform(x_test)

    # The model is fitted on features only; labels are used for evaluation.
    model = PCA()
    model.fit(x_train)

    print('Train validation:')
    precision, recall, f1 = model.evaluate(x_train, y_train)

    print('Test validation:')
Beispiel #7
0
        model.evaluate(): evaluate model accuracy with labeled data
'''

import sys

sys.path.append('../')
from loglizer.models import PCA
from loglizer import dataloader, preprocessing

struct_log = '../data/HDFS/HDFS_100k.log_structured.csv'  # The structured log file

if __name__ == '__main__':
    ## 1. Load structured log file and extract feature vectors
    # Save the raw event sequence file by setting save_csv=True
    # Only the first element of the first returned pair is kept; the other
    # three values are discarded.
    (x_train, _), (_, _) = dataloader.load_HDFS(struct_log,
                                                window='session',
                                                split_type='sequential',
                                                save_csv=True)
    feature_extractor = preprocessing.FeatureExtractor()
    # The extractor is fitted on x_train and x_train is replaced by the
    # transformed result.
    x_train = feature_extractor.fit_transform(x_train,
                                              term_weighting='tf-idf',
                                              normalization='zero-mean')

    ## 2. Train an unsupervised model
    print('Train phase:')
    # Initialize PCA, or other unsupervised models, LogClustering, InvariantsMiner
    model = PCA()
    # Model hyper-parameters may be sensitive to log data, here we use the default for demo
    model.fit(x_train)
    # Make predictions and manually check for correctness. Details may need to go into the raw logs
    # (y_train holds the model's predictions, not ground-truth labels)
    y_train = model.predict(x_train)
Beispiel #8
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
sys.path.append('../')
from loglizer.models import LR
from loglizer import dataloader, preprocessing

struct_log = '../data/HDFS/HDFS_100k.log_structured.csv'  # structured log input
label_file = '../data/HDFS/anomaly_label.csv'  # anomaly labels input

if __name__ == '__main__':
    # Load the labelled data; the loader returns a train pair and a test pair.
    train_split, test_split = dataloader.load_HDFS(
        struct_log,
        label_file=label_file,
        window='session',
        train_ratio=0.5,
    )
    x_train, y_train = train_split
    x_test, y_test = test_split

    # Fit the extractor on the training data, then reuse it for the test data.
    feature_extractor = preprocessing.FeatureExtractor()
    fit_options = {'term_weighting': 'tf-idf'}
    x_train = feature_extractor.fit_transform(x_train, **fit_options)
    x_test = feature_extractor.transform(x_test)

    # Supervised model: training uses both features and labels.
    model = LR()
    model.fit(x_train, y_train)

    print('Train validation:')
    precision, recall, f1 = model.evaluate(x_train, y_train)

    print('Test validation:')
    precision, recall, f1 = model.evaluate(x_test, y_test)
Beispiel #9
0
# -*- coding: utf-8 -*-

import sys
sys.path.append('../')
import pandas as pd
from loglizer.models import *
from loglizer import dataloader, preprocessing

# Names of the models to benchmark; the loop below visits them in this order.
run_models = ['PCA', 'InvariantsMiner', 'LogClustering', 'IsolationForest', 'LR', 
              'SVM', 'DecisionTree']
struct_log = '../data/HDFS/HDFS.npz' # The benchmark dataset

if __name__ == '__main__':

    # The loaded splits are kept as x_tr / x_te; each model branch below
    # derives its own x_train from x_tr with its own feature settings.
    (x_tr, y_train), (x_te, y_test) = dataloader.load_HDFS(struct_log,
                                                           window='session', 
                                                           train_ratio=0.5,
                                                           split_type='uniform')
    # Not appended to in the visible part of the script.
    benchmark_results = []
    for _model in run_models:
        print('Evaluating {} on HDFS:'.format(_model))
        if _model == 'PCA':
            # PCA: tf-idf term weighting with zero-mean normalization.
            feature_extractor = preprocessing.FeatureExtractor()
            x_train = feature_extractor.fit_transform(x_tr, term_weighting='tf-idf', 
                                                      normalization='zero-mean')
            model = PCA()
            model.fit(x_train)
        
        elif _model == 'InvariantsMiner':
            # InvariantsMiner: feature extraction with the extractor's defaults.
            feature_extractor = preprocessing.FeatureExtractor()
            x_train = feature_extractor.fit_transform(x_tr)
            model = InvariantsMiner(epsilon=0.5)
struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file
label_file = '../data/HDFS/anomaly_label.csv' # The anomaly label file

# NOTE(review): this script uses x_ai, Vectorizer, Iterator, DeepLog,
# window_size, train_ratio, batch_size, num_workers, hidden_size,
# num_directions, topk, device and epoches, none of which are defined in the
# visible snippet -- they must come from imports/config outside this view.
if __name__ == '__main__':

    # 14 event identifiers and a matching list of 14 indices (0..13).
    # NOTE(review): presumably one "player" index per event -- confirm.
    Eventname=['E5', 'E22', 'E11', 'E9', 'E26', 'E3', 'E4', 'E2', 'E23', 'E21', 'E20',
       'E25', 'E18', 'E6']
    playerlist=[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
    #playerlist=[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51]
    # The empty-list initialisation is immediately overwritten by the call below.
    coalition=[]
    coalition=x_ai.getcoaltionlist()
    # Not written to in the visible part of the script.
    cvalue=[]
    print("coalition number:" ,len(coalition))


    # Each split is unpacked into three values: inputs, per-window labels and labels.
    (x_train, window_y_train, y_train), (x_test, window_y_test, y_test) = dataloader.load_HDFS(struct_log, label_file=label_file, window='session', window_size=window_size, train_ratio=train_ratio, split_type='uniform')
    
    # Fit the vectorizer on the training split and reuse it for the test split.
    feature_extractor = Vectorizer()
    train_dataset = feature_extractor.fit_transform(x_train, window_y_train, y_train)
    test_dataset = feature_extractor.transform(x_test, window_y_test, y_test)

    # Training batches are shuffled; test batches keep their order.
    train_loader = Iterator(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers).iter
    test_loader = Iterator(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers).iter

    model = DeepLog(num_labels=feature_extractor.num_labels, hidden_size=hidden_size, num_directions=num_directions, topk=topk, device=device)
    model.fit(train_loader, epoches)

    print('Train validation:')
    metrics = model.evaluate(train_loader)
    print(metrics)
Beispiel #11
0
    collector = None
    if dataset == 'BGL':
        data_instances = config.BGL_data

        (x_train,
         y_train), (x_test,
                    y_test), (x_validate,
                              y_validate) = load_BGL(data_instances, 0.35, 0.6)

    if dataset == 'HDFS':
        data_instances = config.HDFS_data
        (x_train, y_train), (x_test,
                             y_test), (x_validate,
                                       y_validate) = dataloader.load_HDFS(
                                           data_instances,
                                           train_ratio=0.35,
                                           is_data_instance=True,
                                           test_ratio=0.6)
        result_folder = config.path + FLAGS.result_folder
        collector = Collector(result_folder, (1, 1, 1, 1), False,
                              config.HDFS_col_header, 100)

    assert FLAGS.h < FLAGS.plb
    lstm_preprocessor = preprocessing.LSTMPreprocessor(x_train, x_test,
                                                       x_validate)
    sym_count = len(lstm_preprocessor.vectors) - 1
    print('Total symbols: %d' % sym_count)
    print(lstm_preprocessor.syms)

    # pad x_train
    x_train = [
        model.predict(): predict anomalies on given data
        model.evaluate(): evaluate model accuracy with labeled data
'''

import sys
sys.path.append('../')
from loglizer.models import PCA
from loglizer import dataloader, preprocessing

struct_log = '../data/HDFS/HDFS_100k.log_structured.csv'  # The structured log file

if __name__ == '__main__':
    ## 1. Load structured log file and extract feature vectors
    # Save the raw event sequence file by setting save_csv=True
    train_test_tuple = dataloader.load_HDFS(struct_log,
                                            window='session',
                                            split_type='sequential',
                                            save_csv=True)
    # Only the first two entries of the result are used. No label_file is
    # passed to the loader above.
    # NOTE(review): y_train / y_test presumably carry no real labels -- confirm.
    (x_train, y_train), (x_test,
                         y_test) = train_test_tuple[0], train_test_tuple[1]
    feature_extractor = preprocessing.FeatureExtractor()
    # The extractor is fitted on x_train and x_train is replaced by the
    # transformed result.
    x_train = feature_extractor.fit_transform(x_train,
                                              term_weighting='tf-idf',
                                              normalization='zero-mean')

    ## 2. Train an unsupervised model
    print('Train phase:')
    # Initialize PCA, or other unsupervised models, LogClustering, InvariantsMiner
    model = PCA()
    # Model hyper-parameters may be sensitive to log data, here we use the default for demo
    model.fit(x_train)
    # Make predictions and manually check for correctness. Details may need to go into the raw logs
        model.evaluate(): evaluate model accuracy with labeled data
'''

import sys
sys.path.append('../')
from loglizer.models import InvariantsMiner
from loglizer import dataloader, preprocessing

struct_log = '../data/HDFS/HDFS_100k.log_structured.csv'  # The structured log file
# NOTE(review): label_file is defined but not passed to load_HDFS in the
# visible part of this script, so no labels are loaded here.
label_file = '../data/HDFS/anomaly_label.csv'  # The anomaly label file
epsilon = 0.5  # threshold for estimating invariant space

if __name__ == '__main__':
    # Load structured log without label info
    # The second element of each returned pair is discarded.
    (x_train, _), (x_test, _) = dataloader.load_HDFS(struct_log,
                                                     window='session',
                                                     train_ratio=0.5,
                                                     split_type='sequential')
    # Feature extraction
    feature_extractor = preprocessing.FeatureExtractor()
    x_train = feature_extractor.fit_transform(x_train)

    # Model initialization and training
    model = InvariantsMiner(epsilon=epsilon)
    model.fit(x_train)

    # Predict anomalies on the training set offline, and manually check for correctness
    # (y_train holds the model's predictions, not ground-truth labels)
    y_train = model.predict(x_train)

    # Predict anomalies on the test set to simulate the online mode
    # x_test may be loaded from another log file
    # The extractor fitted on the training data is reused here (transform only).
    x_test = feature_extractor.transform(x_test)
Beispiel #14
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
sys.path.append('../')
import pandas as pd
from loglizer.models import *
from loglizer import dataloader, preprocessing

# Names of the models to benchmark; the loop below visits them in this order.
run_models = [
    'LR', 'SVM', 'DecisionTree', 'PCA', 'InvariantsMiner', 'LogClustering'
]
struct_log = '../data/HDFS/HDFS.npz'  # The benchmark dataset

if __name__ == '__main__':
    # The loaded splits are kept as x_tr / x_te; each model branch below
    # derives its own x_train from x_tr with its own feature settings.
    (x_tr, y_train), (x_te, y_test) = dataloader.load_HDFS(struct_log,
                                                           train_ratio=0.5)
    # Not appended to in the visible part of the script.
    benchmark_results = []
    for _model in run_models:
        print('Evaluating {} on HDFS:'.format(_model))
        if _model == 'PCA':
            # PCA: tf-idf term weighting with zero-mean normalization.
            feature_extractor = preprocessing.FeatureExtractor()
            x_train = feature_extractor.fit_transform(
                x_tr, term_weighting='tf-idf', normalization='zero-mean')
            model = PCA()
            model.fit(x_train)

        elif _model == 'InvariantsMiner':
            # InvariantsMiner: feature extraction with the extractor's defaults.
            feature_extractor = preprocessing.FeatureExtractor()
            x_train = feature_extractor.fit_transform(x_tr)
            model = InvariantsMiner(epsilon=0.5)
            model.fit(x_train)