def daniel_first_results():
    # Load the average runtimes measured for each runtime environment
    linux_total = np.fromfile(
        "/var/lib/arhuaco/data/performance/daniel_no_hard_native.log",
        dtype=float, sep="\n")
    docker_total = np.fromfile(
        "/var/lib/arhuaco/data/performance/daniel_no_hard_docker.log",
        dtype=float, sep="\n")
    rkt_total = np.fromfile(
        "/var/lib/arhuaco/data/performance/daniel_no_hard_rkt.log",
        dtype=float, sep="\n")
    singul_total = np.fromfile(
        "/var/lib/arhuaco/data/performance/daniel_no_hard_sing.log",
        dtype=float, sep="\n")
    # Graphically plot the results
    plot = Plot()
    # Linux jobs vs Docker vs Rkt vs Singularity jobs
    plot.history2plot([linux_total, docker_total, rkt_total, singul_total],
                      ['Linux', 'Docker', 'Rkt', 'Singularity'],
                      "Stock Kernel",
                      "Number of simultaneous Jobs",
                      "Average runtime [s]",
                      "/var/lib/arhuaco/data/performance/performance-%s.pdf"
                      % time.strftime("%Y%m%d-%H%M%S"),
                      'lower right', [0.5, 10.5], [4200, 5800])
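# NOTE: `Plot` is imported from elsewhere in the package and its
# implementation is not part of this excerpt. As a hedged point of
# reference only, the hypothetical `PlotSketch` below reproduces the
# history2plot call signature used throughout this module with matplotlib:
# a list of series, matching legend labels, title, axis labels, an output
# path, a legend location, and optional axis limits.
import matplotlib
matplotlib.use("Agg")  # write PDFs without needing a display
import matplotlib.pyplot as plt

class PlotSketch:
    def history2plot(self, series, legends, title, xlabel, ylabel,
                     path=None, location='lower right',
                     xlim=None, ylim=None):
        fig, ax = plt.subplots()
        for data, label in zip(series, legends):
            ax.plot(data, label=label)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        if xlim is not None:
            ax.set_xlim(xlim)
        if ylim is not None:
            ax.set_ylim(ylim)
        ax.legend(loc=location)
        if path is not None:
            fig.savefig(path)
        plt.close(fig)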
def my_results():
    # Load the total runtimes (file names fixed: linux/sysdig were swapped
    # relative to their variable names)
    linux_total = np.fromfile(
        "/var/lib/arhuaco/data/performance/linux_total_final.log",
        dtype=float, sep="\n")
    docker_total = np.fromfile(
        "/var/lib/arhuaco/data/performance/docker_total_final.log",
        dtype=float, sep="\n")
    sysdig_total = np.fromfile(
        "/var/lib/arhuaco/data/performance/sysdig_total_final.log",
        dtype=float, sep="\n")
    # Graphically plot the results
    plot = Plot()
    # Linux job vs Docker job vs Docker+Sysdig job. The totals are passed
    # as both the value and the error series here; my_second_results below
    # uses separate avg/std files.
    plot.history2error([linux_total, docker_total, sysdig_total],
                       [linux_total, docker_total, sysdig_total],
                       ['Linux', 'Docker', 'Arhuaco isolation and monitoring'],
                       "Performance test",
                       "Number of ALICE grid jobs in parallel",
                       "Average runtime [s]",
                       "/var/lib/arhuaco/data/performance/performance-%s.pdf"
                       % time.strftime("%Y%m%d-%H%M%S"),
                       'lower right', [1, 10], [0, 60000])
def analyze_syscalls():
    # SGD (linear SVM) prototype for system call classification
    # Parameters
    min_word_count = 5  # Minimum word count
    context = 10        # Context window size
    paths = ["/var/lib/arhuaco/data/normal_clean.csv",
             "/var/lib/arhuaco/data/malicious_clean.csv"]
    labels = [0, 1]
    number_samples = 2
    number_samples_w2v = 10000
    num_epochs = 10
    embedding_dim = 10
    # Model hyperparameters
    max_length = 7
    n_gram = 6
    # Create objects
    data_helpers = DataHelpers(paths, labels, max_length,
                               n_gram, number_samples)
    w2v = W2V()
    sentence_stream = data_helpers.sentence_stream(number_samples_w2v)
    params = w2v.train_word2vec_stream(sentence_stream,
                                       num_features=embedding_dim,
                                       min_word_count=min_word_count,
                                       context=context,
                                       num_epochs=num_epochs)
    # Create the model
    classes = np.array([0, 1])
    clf = SGDClassifier(loss='hinge', penalty="l2", eta0=0.01,
                        learning_rate='constant')
    # Data load
    data_generator = data_helpers.get_data_chunk(params[1])
    # Training the model
    train_accuracy = []
    test_accuracy = []
    # The first partial_fit call must declare the full class list
    x_train, y_train = next(data_generator)
    clf.partial_fit(x_train, y_train, classes=classes)
    for batch in range(num_epochs):
        # Accumulate 1000 chunks, then take one SGD step over the whole set
        for sample in range(1000):
            x_train_aux, y_train_aux = next(data_generator)
            # x_test_aux, y_test_aux = next(data_generator)
            x_train = np.concatenate([x_train, x_train_aux], 0)
            y_train = np.concatenate([y_train, y_train_aux], 0)
            # x_test = np.concatenate([x_test, x_test_aux], 0)
            # y_test = np.concatenate([y_test, y_test_aux], 0)
        clf.partial_fit(x_train, y_train)
        print("Batch: %d" % batch)
        print('Train Accuracy: %.3f' % clf.score(x_train, y_train))
        # print('Test Accuracy: %.3f' % clf.score(x_test, y_test))
        train_accuracy.append(clf.score(x_train, y_train))
        # test_accuracy.append(clf.score(x_test, y_test))
    # Plot the results
    plot = Plot()
    # Legend list added to match the history2plot signature used elsewhere
    plot.history2plot([train_accuracy], ['Training'],
                      "Model accuracy", "Epoch", "Accuracy")
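# Minimal, self-contained illustration (synthetic data, not the Arhuaco
# datasets) of the incremental-learning pattern used above: SGDClassifier
# requires the full class list on the first partial_fit call, after which
# batches can be fed as they are generated.
def _sgd_partial_fit_demo():
    import numpy as np
    from sklearn.linear_model import SGDClassifier
    rng = np.random.RandomState(0)
    clf = SGDClassifier(loss='hinge', penalty='l2')
    classes = np.array([0, 1])
    for batch in range(10):
        x = rng.randn(100, 5)
        y = (x[:, 0] > 0).astype(int)  # trivially separable labels
        if batch == 0:
            # classes must be declared up front for online learning
            clf.partial_fit(x, y, classes=classes)
        else:
            clf.partial_fit(x, y)
    print('Demo accuracy: %.3f' % clf.score(x, y))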
def training_vs_validation_cnn():
    # Load the CNN training histories from the log files
    sys_accuracy = np.fromfile(
        "/var/lib/arhuaco/data/logs/sys_accuracy_cnn.log",
        dtype=float, sep="\n")
    sys_val_accuracy = np.fromfile(
        "/var/lib/arhuaco/data/logs/sys_val_accuracy_cnn.log",
        dtype=float, sep="\n")
    sys_fpr = np.fromfile(
        "/var/lib/arhuaco/data/logs/sys_fpr_cnn.log",
        dtype=float, sep="\n")
    sys_val_fpr = np.fromfile(
        "/var/lib/arhuaco/data/logs/sys_val_fpr_cnn.log",
        dtype=float, sep="\n")
    net_accuracy = np.fromfile(
        "/var/lib/arhuaco/data/logs/net_accuracy_cnn.log",
        dtype=float, sep="\n")
    net_val_accuracy = np.fromfile(
        "/var/lib/arhuaco/data/logs/net_val_accuracy_cnn.log",
        dtype=float, sep="\n")
    net_fpr = np.fromfile(
        "/var/lib/arhuaco/data/logs/net_fpr_cnn.log",
        dtype=float, sep="\n")
    net_val_fpr = np.fromfile(
        "/var/lib/arhuaco/data/logs/net_val_fpr_cnn.log",
        dtype=float, sep="\n")
    # Graphically plot the results
    plot = Plot()
    # Training vs validation
    plot.history2plot([sys_accuracy, sys_val_accuracy],
                      ['Training', 'Validation'],
                      "System call classification with CNN",
                      "Epoch", "Accuracy",
                      "/var/lib/arhuaco/data/logs/sys_conv_accuracy-%s.pdf"
                      % time.strftime("%Y%m%d-%H%M%S"),
                      'lower right', [0, 9], [0.8, 1.0])
    plot.history2plot([sys_fpr, sys_val_fpr],
                      ['Training', 'Validation'],
                      "System call classification with CNN",
                      "Epoch", "False positive rate",
                      "/var/lib/arhuaco/data/logs/sys_conv_fpr-%s.pdf"
                      % time.strftime("%Y%m%d-%H%M%S"),
                      'upper left', [0, 9], [0, 0.2])
    plot.history2plot([net_accuracy, net_val_accuracy],
                      ['Training', 'Validation'],
                      "Network trace classification with CNN",
                      "Epoch", "Accuracy",
                      "/var/lib/arhuaco/data/logs/net_conv_accuracy-%s.pdf"
                      % time.strftime("%Y%m%d-%H%M%S"),
                      'lower right', [0, 9], [0.8, 1.0])
    plot.history2plot([net_fpr, net_val_fpr],
                      ['Training', 'Validation'],
                      "Network trace classification with CNN",
                      "Epoch", "False positive rate",
                      "/var/lib/arhuaco/data/logs/net_conv_fpr-%s.pdf"
                      % time.strftime("%Y%m%d-%H%M%S"),
                      'upper left', [0, 9], [0, 0.2])
def analyze_network():
    # SGD (linear SVM) prototype for DNS trace classification
    clf = SGDClassifier(loss='hinge', penalty="l2")
    # Word2Vec parameters, see train_word2vec
    min_word_count = 1  # Minimum word count
    context = 4         # Context window size
    paths = ["/var/lib/arhuaco/data/dns_normal.log",
             "/var/lib/arhuaco/data/dns_malicious.log"]
    labels = [0, 1]
    number_samples = 10
    num_epochs = 100
    embedding_dim = 5
    # Model hyperparameters
    max_length = 5
    n_gram = 1
    # Create objects
    data_helpers = DataHelpers(paths, labels, max_length,
                               n_gram, number_samples)
    w2v = W2V()
    # Load data
    print("Loading data...")
    x, y, vocabulary, vocabulary_inv = data_helpers.load_data()
    embedding_weights, vocabulary = w2v.train_word2vec(x, embedding_dim,
                                                       min_word_count,
                                                       context)
    classes = np.array([0, 1])
    # Data load
    data_generator = data_helpers.get_data_chunk(vocabulary)
    # Training the model
    train_accuracy = []
    test_accuracy = []
    for batch in range(num_epochs):
        x_train, y_train = next(data_generator)
        x_test, y_test = next(data_generator)
        clf.partial_fit(x_train, y_train, classes=classes)
        print("Batch: %d" % batch)
        print('Train Accuracy: %.3f' % clf.score(x_train, y_train))
        print('Test Accuracy: %.3f' % clf.score(x_test, y_test))
        train_accuracy.append(clf.score(x_train, y_train))
        test_accuracy.append(clf.score(x_test, y_test))
    # Plot the results
    plot = Plot()
    # Legend list added to match the history2plot signature used elsewhere
    plot.history2plot([train_accuracy, test_accuracy],
                      ['Training', 'Validation'],
                      "Model accuracy", "Epoch", "Accuracy")
def my_second_results():
    # Average runtimes
    linux_total_avg = np.fromfile(
        "/var/lib/arhuaco/data/performance/linux_avg_final.log",
        dtype=float, sep="\n")
    docker_total_avg = np.fromfile(
        "/var/lib/arhuaco/data/performance/docker_avg_final.log",
        dtype=float, sep="\n")
    sysdig_total_avg = np.fromfile(
        "/var/lib/arhuaco/data/performance/sysdig_avg_final.log",
        dtype=float, sep="\n")
    # Standard deviations
    linux_total_std = np.fromfile(
        "/var/lib/arhuaco/data/performance/linux_std_final.log",
        dtype=float, sep="\n")
    docker_total_std = np.fromfile(
        "/var/lib/arhuaco/data/performance/docker_std_final.log",
        dtype=float, sep="\n")
    sysdig_total_std = np.fromfile(
        "/var/lib/arhuaco/data/performance/sysdig_std_final.log",
        dtype=float, sep="\n")
    # Graphically plot the results
    plot = Plot()
    # Linux job vs Docker job vs Docker+Sysdig job
    plot.history2error([linux_total_avg, docker_total_avg, sysdig_total_avg],
                       [linux_total_std, docker_total_std, sysdig_total_std],
                       ['Linux', 'Docker', 'Arhuaco isolation and monitoring'],
                       "Performance test",
                       "Number of ALICE grid jobs in parallel",
                       "Average runtime [s]",
                       "/var/lib/arhuaco/data/performance/performance-%s.pdf"
                       % time.strftime("%Y%m%d-%H%M%S"),
                       'lower right', [0.5, 10.5], [4200, 5800])
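# NOTE: like `Plot.history2plot`, `history2error` is defined elsewhere in
# the package. A hypothetical matplotlib sketch consistent with the call
# sites in this module: one error-bar series per entry, pairing each
# average curve with its standard deviation.
class PlotErrorSketch:
    def history2error(self, averages, deviations, legends, title,
                      xlabel, ylabel, path, location='lower right',
                      xlim=None, ylim=None):
        import matplotlib.pyplot as plt
        fig, ax = plt.subplots()
        for avg, std, label in zip(averages, deviations, legends):
            jobs = range(1, len(avg) + 1)  # 1..N parallel jobs on the x axis
            ax.errorbar(jobs, avg, yerr=std, label=label, capsize=3)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        if xlim is not None:
            ax.set_xlim(xlim)
        if ylim is not None:
            ax.set_ylim(ylim)
        ax.legend(loc=location)
        fig.savefig(path)
        plt.close(fig)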
def comparative_results():
    # Load the validation histories of the CNN and SVM models
    sys_val_accuracy_cnn = np.fromfile(
        "/var/lib/arhuaco/data/logs/sys_val_accuracy_cnn.log",
        dtype=float, sep="\n")
    sys_val_accuracy_svm = np.fromfile(
        "/var/lib/arhuaco/data/logs/sys_val_accuracy_svm.log",
        dtype=float, sep="\n")
    sys_val_fpr_cnn = np.fromfile(
        "/var/lib/arhuaco/data/logs/sys_val_fpr_cnn.log",
        dtype=float, sep="\n")
    sys_val_fpr_svm = np.fromfile(
        "/var/lib/arhuaco/data/logs/sys_val_fpr_svm.log",
        dtype=float, sep="\n")
    net_val_accuracy_cnn = np.fromfile(
        "/var/lib/arhuaco/data/logs/net_val_accuracy_cnn.log",
        dtype=float, sep="\n")
    net_val_accuracy_svm = np.fromfile(
        "/var/lib/arhuaco/data/logs/net_val_accuracy_svm.log",
        dtype=float, sep="\n")
    net_val_fpr_cnn = np.fromfile(
        "/var/lib/arhuaco/data/logs/net_val_fpr_cnn.log",
        dtype=float, sep="\n")
    net_val_fpr_svm = np.fromfile(
        "/var/lib/arhuaco/data/logs/net_val_fpr_svm.log",
        dtype=float, sep="\n")
    net_val_acc_gen_svm = np.fromfile(
        "/var/lib/arhuaco/data/logs/net_val_acc_gen_svm.log",
        dtype=float, sep="\n")
    # Graphically plot the results
    plot = Plot()
    # Syscall CNN vs SVM accuracy
    plot.history2plot([sys_val_accuracy_cnn[0:10], sys_val_accuracy_svm[0:10]],
                      ['CNN validation', 'SVM validation'],
                      "CNN vs SVM system call validation accuracy",
                      "Epoch", "Accuracy",
                      "/var/lib/arhuaco/data/logs/sys_cnn_svm_accuracy-%s.pdf"
                      % time.strftime("%Y%m%d-%H%M%S"),
                      'lower right', [0, 9], [0, 0.2])
    # Syscall CNN vs SVM false positive rate
    plot.history2plot([sys_val_fpr_cnn[0:10], sys_val_fpr_svm[0:10]],
                      ['CNN validation', 'SVM validation'],
                      "CNN vs SVM system call validation false positive rate",
                      "Epoch", "False positive rate",
                      "/var/lib/arhuaco/data/logs/sys_cnn_svm_fpr-%s.pdf"
                      % time.strftime("%Y%m%d-%H%M%S"),
                      'upper left', [0, 9], [0, 0.2])
    # Network CNN vs SVM accuracy
    plot.history2plot([net_val_accuracy_cnn[0:10], net_val_accuracy_svm[0:10]],
                      ['CNN validation', 'SVM validation'],
                      "CNN vs SVM network trace validation accuracy",
                      "Epoch", "Accuracy",
                      "/var/lib/arhuaco/data/logs/net_cnn_svm_accuracy-%s.pdf"
                      % time.strftime("%Y%m%d-%H%M%S"),
                      'lower right', [0, 9], [0, 0.2])
    # Network CNN vs SVM false positive rate
    plot.history2plot([net_val_fpr_cnn[0:10], net_val_fpr_svm[0:10]],
                      ['CNN validation', 'SVM validation'],
                      "CNN vs SVM network validation false positive rate",
                      "Epoch", "False positive rate",
                      "/var/lib/arhuaco/data/logs/net_cnn_svm_fpr-%s.pdf"
                      % time.strftime("%Y%m%d-%H%M%S"),
                      'upper left', [0, 9], [0, 0.2])
    # Network SVM, original vs generated training data (accuracy; the
    # y-axis label previously said "False positive rate" by mistake)
    plot.history2plot([net_val_accuracy_svm[0:10], net_val_acc_gen_svm[0:10]],
                      ['SVM validation non-generated',
                       'SVM validation generated'],
                      "SVM accuracy comparison: normal data vs generated data",
                      "Epoch", "Accuracy",
                      "/var/lib/arhuaco/data/logs/net_svm_accuracy-generated-%s.pdf"
                      % time.strftime("%Y%m%d-%H%M%S"),
                      'upper left', [0, 9], [0, 0.2])
def analyze_syscalls():
    # SVM-based analysis of system call traces. Note: this redefines
    # analyze_syscalls above; if both are kept in one module, the later
    # definition wins.
    # Parameters
    seed = 5
    verbose = 2
    # Model hyperparameters
    # Max length of one sentence
    max_length = 7
    # Number of lines included in the series
    n_gram = 6
    # Total length of the classification object
    sequence_length = max_length * n_gram
    # Size of the vector representing each word
    embedding_dim = 20
    dropout_prob = (0.0, 0.0)
    # Number of neurons in the hidden layer
    hidden_dims = 20
    # Training parameters
    number_samples = 5
    samples_per_epoch = 10000
    num_epochs = 100
    val_split = 0.1
    # Word2Vec parameters, see train_word2vec
    # Minimum word count
    min_word_count = 6
    # Number of words that make sense in the context
    context = 10
    weights_file_svm = ("/var/lib/arhuaco/data/models/sys_W_svm-%s"
                        % time.strftime("%Y%m%d-%H%M%S"))
    model_file_svm = ("/var/lib/arhuaco/data/models/sys_model_svm-%s.json"
                      % time.strftime("%Y%m%d-%H%M%S"))
    # Training dataset
    paths = ["/var/lib/arhuaco/data/normal_clean.csv",
             "/var/lib/arhuaco/data/malicious_clean.csv"]
    # Training labels
    labels_svm = [-1, 1]
    # Create objects
    data_helpers = DataHelpers(paths, None, max_length, n_gram,
                               number_samples, seed)
    w2v = W2V()
    sentence_stream = data_helpers.sentence_stream(samples_per_epoch)
    params = w2v.train_word2vec_stream(sentence_stream,
                                       num_features=embedding_dim,
                                       min_word_count=min_word_count,
                                       context=context,
                                       num_epochs=num_epochs)
    svm = SVM(seed, sequence_length, embedding_dim, dropout_prob,
              hidden_dims, number_samples, num_epochs, val_split,
              min_word_count, context, weights_file_svm, model_file_svm,
              paths, None, data_helpers, verbose)
    svm.get_data(params[0], params[1], params[2])
    svm.build_model()
    print("SVM syscall training")
    history_svm = svm.train_model(samples_per_epoch, labels_svm)
    result = svm.test_model(10000, labels_svm, max_length, n_gram)
    # Graphically plot the results
    plot = Plot()
    # Training vs validation accuracy
    plot.history2plot([history_svm.history['real_accuracy'],
                       history_svm.history['val_real_accuracy']],
                      ['Training', 'Validation'],
                      "SVM accuracy", "Epoch", "Accuracy",
                      "/var/lib/arhuaco/data/models/sys_svm_accuracy-%s.pdf"
                      % time.strftime("%Y%m%d-%H%M%S"),
                      location='lower right')
    # Training vs validation false positive rate
    plot.history2plot([history_svm.history['false_pos_rate'],
                       history_svm.history['val_false_pos_rate']],
                      ['Training', 'Validation'],
                      "SVM false positive rate", "Epoch",
                      "False positive rate",
                      "/var/lib/arhuaco/data/models/sys_svm_fpr-%s.pdf"
                      % time.strftime("%Y%m%d-%H%M%S"),
                      location='upper right')
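# NOTE: `W2V.train_word2vec_stream` is implemented elsewhere; judging by
# its parameters it plausibly wraps gensim's Word2Vec. The hypothetical
# sketch below shows one way such a wrapper could produce the
# (embedding_weights, vocabulary, vocabulary_index) triple unpacked from
# `params` above (gensim >= 4 API; older versions used size=/iter=).
def train_word2vec_stream_sketch(sentences, num_features=20,
                                 min_word_count=6, context=10,
                                 num_epochs=100):
    from gensim.models import Word2Vec
    sentences = list(sentences)  # gensim iterates the corpus several times
    model = Word2Vec(sentences, vector_size=num_features,
                     min_count=min_word_count, window=context,
                     epochs=num_epochs)
    embedding_weights = model.wv.vectors      # index -> vector matrix
    vocabulary_index = model.wv.index_to_key  # index -> token
    vocabulary = {token: idx                  # token -> index
                  for idx, token in enumerate(vocabulary_index)}
    return embedding_weights, vocabulary, vocabulary_index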
def train(self, type="syscall"):
    # Load configuration
    config_object = Configuration()
    if type == "syscall":
        config_object.load_configuration("host")
        configuration = config_object.default_config
        # Training parameters
        configuration['verbose'] = 2
        configuration['samples_per_batch'] = 5
        configuration['samples_per_epoch'] = 100000
        configuration['num_epochs'] = 10
        configuration['val_split'] = 0.1
        configuration['weights_file_svm'] = (
            "/var/lib/arhuaco/data/models/sys_W_svm-%s"
            % time.strftime("%Y%m%d-%H%M%S"))
        configuration['model_file_svm'] = (
            "/var/lib/arhuaco/data/models/sys_model_svm-%s.json"
            % time.strftime("%Y%m%d-%H%M%S"))
        # Training dataset
        configuration['paths'] = [
            "/var/lib/arhuaco/data/normal_clean_filtered.csv",
            "/var/lib/arhuaco/data/malicious_clean_filtered.csv"]
        configuration['pdf_paths'] = [
            "/var/lib/arhuaco/data/models/sys_svm_accuracy-%s.pdf"
            % time.strftime("%Y%m%d-%H%M%S"),
            "/var/lib/arhuaco/data/models/sys_svm_fpr-%s.pdf"
            % time.strftime("%Y%m%d-%H%M%S")]
    elif type == "network":
        # Load configuration
        config_object.load_configuration("network")
        configuration = config_object.default_config
        # Training parameters
        configuration['verbose'] = 2
        configuration['samples_per_batch'] = 5
        configuration['samples_per_epoch'] = 1000
        configuration['num_epochs'] = 10
        configuration['val_split'] = 0.1
        configuration['weights_file_svm'] = (
            "/var/lib/arhuaco/data/models/net_W_svm-%s"
            % time.strftime("%Y%m%d-%H%M%S"))
        configuration['model_file_svm'] = (
            "/var/lib/arhuaco/data/models/net_model_svm-%s.json"
            % time.strftime("%Y%m%d-%H%M%S"))
        # Training dataset
        configuration['paths'] = [
            "/var/lib/arhuaco/data/dns_normal.log",
            "/var/lib/arhuaco/data/dns_malicious.log"]
            # "/var/lib/arhuaco/data/dns_malicious_generated.log"]
        configuration['pdf_paths'] = [
            "/var/lib/arhuaco/data/models/net_svm_accuracy-%s.pdf"
            % time.strftime("%Y%m%d-%H%M%S"),
            "/var/lib/arhuaco/data/models/net_svm_fpr-%s.pdf"
            % time.strftime("%Y%m%d-%H%M%S")]
    # Create objects
    # First create the sources of data
    data_helper = DataHelpers(
        data_source=configuration['paths'],
        label=None,
        tokens_per_line=configuration['tokens_per_line'],
        number_lines=configuration['number_lines'],
        samples_per_batch=configuration['samples_per_batch'],
        seed=configuration['seed'])
    # Apply the word2vec processing
    w2v = W2V()
    sentence_stream = data_helper.sentence_stream(
        configuration['samples_per_epoch'])
    params = w2v.train_word2vec_stream(
        sentence_stream,
        num_features=configuration['embedding_dim'],
        min_word_count=configuration['min_word_count'],
        context=configuration['context'],
        num_epochs=configuration['num_epochs'])
    embedding_weights = params[0]
    vocabulary = params[1]
    vocabulary_index = params[2]
    # Create the SVM network object
    svm_bow = SVM(seed=configuration['seed'],
                  samples_per_batch=configuration['samples_per_batch'],
                  min_word_count=configuration['min_word_count'],
                  context=configuration['context'],
                  weights_file=configuration['weights_file_svm'],
                  model_file=configuration['model_file_svm'],
                  labels=None,
                  verbose=configuration['verbose'])
    svm_bow.set_bow_params(embedding_weights=embedding_weights,
                           vocabulary=vocabulary,
                           vocabulary_index=vocabulary_index)
    # Build the model
    svm_bow.build_model(
        learn_rate=configuration['learn_rate'],
        momentum=configuration['momentum'],
        decay=configuration['decay'],
        nesterov=configuration['nesterov'],
        regularizer_param=configuration['regularizer_param'],
        dropout_rate=configuration['dropout_prob'],
        embedding_dim=configuration['embedding_dim'])
    print("SVM training")
    # Get the data sources
    training_generator = data_helper.get_data_BoW_chunk(
        vocabulary, configuration['labels_svm'])
    validation_generator = data_helper.get_data_BoW_chunk(
        vocabulary, configuration['labels_svm'])
    # Train and validate the model
    history_object = svm_bow.train_model(
        training_source=training_generator,
        validation_source=validation_generator,
        samples_per_epoch=configuration['samples_per_epoch'],
        number_epochs=configuration['num_epochs'],
        val_split=configuration['val_split'])
    # Test the model on a fresh data source built from a different seed
    # (the malicious path is pointed at the original DNS log)
    configuration['paths'][1] = '/var/lib/arhuaco/data/dns_malicious.log'
    configuration['samples_per_epoch'] = 1000
    validation_data_helper = DataHelpers(
        data_source=configuration['paths'],
        label=None,
        tokens_per_line=configuration['tokens_per_line'],
        number_lines=configuration['number_lines'],
        samples_per_batch=configuration['samples_per_batch'],
        seed=configuration['seed'] + 3)
    test_generator = validation_data_helper.get_data_BoW_chunk(
        vocabulary, configuration['labels_svm'])
    result = svm_bow.test_model(
        test_data_source=test_generator,
        samples_to_test=configuration['samples_per_epoch'])
    # Graphically plot the results
    plot = Plot()
    # Training vs validation accuracy
    plot.history2plot([history_object.history['real_accuracy'],
                       history_object.history['val_real_accuracy']],
                      ['Training', 'Validation'],
                      "SVM accuracy", "Epoch", "Accuracy",
                      configuration['pdf_paths'][0],
                      'lower right', [0, 9], [0.8, 1.0])
    # Training vs validation false positive rate
    plot.history2plot([history_object.history['false_pos_rate'],
                       history_object.history['val_false_pos_rate']],
                      ['Training', 'Validation'],
                      "SVM false positive rate", "Epoch",
                      "False positive rate",
                      configuration['pdf_paths'][1],
                      'upper right', [0, 9], [0, 0.2])
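# Usage sketch: the class enclosing train() is not part of this excerpt,
# so `SVMTrainer` below is a hypothetical name. The method is invoked once
# per data source, e.g.:
#
#   trainer = SVMTrainer()
#   trainer.train(type="syscall")   # host system-call model
#   trainer.train(type="network")   # DNS network-trace model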
def analyze_network():
    # CNN-based analysis of DNS network traces. Note: this redefines
    # analyze_network above; if both are kept in one module, the later
    # definition wins.
    # Parameters
    seed = 5
    model_variation = 'CNN-non-static'
    # Model hyperparameters
    # Max length of one sentence
    max_length = 5
    # Number of lines included in the series
    n_gram = 1
    # Total length of the classification object
    sequence_length = max_length * n_gram
    # Size of the vector representing each word
    embedding_dim = 10
    # Convolutional filters applied to the text
    filter_sizes = (2, 3)
    # Total filters used
    num_filters = 3
    dropout_prob = (0.0, 0.0)
    # Number of neurons in the hidden layer
    hidden_dims = 10
    # Training parameters
    number_samples = 5
    samples_per_epoch = 1000
    num_epochs = 100
    val_split = 0.1
    verbose = 2
    # Word2Vec parameters, see train_word2vec
    # Minimum word count
    min_word_count = 1
    # Number of words that make sense in the context
    context = 4
    weights_file_conv = ("/var/lib/arhuaco/data/models/net_W_conv-%s"
                         % time.strftime("%Y%m%d-%H%M%S"))
    model_file_conv = ("/var/lib/arhuaco/data/models/net_model_conv-%s.json"
                       % time.strftime("%Y%m%d-%H%M%S"))
    # Training dataset: train on the generated malicious data, then test on
    # the original malicious log (see the path swap before test_model below)
    paths = ["/var/lib/arhuaco/data/dns_normal.log",
             # "/var/lib/arhuaco/data/dns_malicious.log"]
             "/var/lib/arhuaco/data/dns_malicious_generated.log"]
    # Training labels
    labels_conv = [0, 1]
    # Create objects
    data_helpers = DataHelpers(paths, None, max_length, n_gram,
                               number_samples, seed)
    w2v = W2V()
    sentence_stream = data_helpers.sentence_stream(samples_per_epoch)
    params = w2v.train_word2vec_stream(sentence_stream,
                                       num_features=embedding_dim,
                                       min_word_count=min_word_count,
                                       context=context,
                                       num_epochs=num_epochs)
    cnn_w2v = CnnW2v(seed, model_variation, sequence_length, embedding_dim,
                     filter_sizes, num_filters, dropout_prob, hidden_dims,
                     number_samples, num_epochs, val_split, min_word_count,
                     context, weights_file_conv, model_file_conv, paths,
                     None, data_helpers, verbose)
    cnn_w2v.get_data(params[0], params[1], params[2])
    cnn_w2v.build_model()
    print("Convolutional network training")
    history_conv = cnn_w2v.train_model(samples_per_epoch, labels_conv)
    cnn_w2v.paths[1] = "/var/lib/arhuaco/data/dns_malicious.log"
    result = cnn_w2v.test_model(1000, labels_conv, max_length, n_gram)
    # Graphically plot the results
    plot = Plot()
    # Training vs validation accuracy
    plot.history2plot([history_conv.history['real_accuracy'],
                       history_conv.history['val_real_accuracy']],
                      ['Training', 'Validation'],
                      "CNN accuracy", "Epoch", "Accuracy",
                      "/var/lib/arhuaco/data/models/net_cnn_accuracy-%s.pdf"
                      % time.strftime("%Y%m%d-%H%M%S"),
                      location='lower right')
    # Training vs validation false positive rate
    plot.history2plot([history_conv.history['false_pos_rate'],
                       history_conv.history['val_false_pos_rate']],
                      ['Training', 'Validation'],
                      "CNN false positive rate", "Epoch",
                      "False positive rate",
                      "/var/lib/arhuaco/data/models/net_cnn_fpr-%s.pdf"
                      % time.strftime("%Y%m%d-%H%M%S"),
                      location='upper right')
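# NOTE: the model/weights file pairs written by these functions follow the
# Keras to_json()/save_weights() convention (an assumption; CnnW2v and SVM
# are defined elsewhere). If that holds, a trained model can be restored
# for offline inference along these lines:
def load_trained_model_sketch(model_file, weights_file):
    from keras.models import model_from_json  # or tensorflow.keras.models
    with open(model_file) as handle:
        model = model_from_json(handle.read())
    model.load_weights(weights_file)
    return model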