def _read_log_lines_then_yield(yielder, first_line):
    """
    Parse first_line plus every line from the given yielder into LogEntry
    objects, convert them to IDS entries and yield those one by one,
    grouped per app id.
    """
    parsed_entries = [LogEntry.from_log_string(first_line)]
    parsed_entries.extend(LogEntry.from_log_string(line) for line in yielder)

    ids_entry_dict = IdsConverter().log_entries_to_ids_entries_dict(parsed_entries)

    # Flatten the per-app dict back into a single stream of IDS entries.
    for app_entries in ids_entry_dict.values():
        for ids_entry in app_entries:
            yield ids_entry
def run(experiment):
    """
    Benchmark mapping-based vs. one-hot conversion on the experiment's
    log file and store the timing result on the experiment.
    """
    entries = [LogEntry.from_log_string(line)
               for line in Dir.yield_lines(experiment.file_path, ITEM_LIMIT)]
    experiment.entries = entries

    # Exp 1: mapping converter
    map_start = time.time()
    OneHotVsMapping.handle_log_entries("MAP", OneHotVsMappingConverter(), entries, experiment)

    # Exp 2: one-hot converter
    one_hot_start = time.time()
    OneHotVsMapping.handle_log_entries("OHOT", IdsConverter(), entries, experiment)
    finished = time.time()

    timing_lines = [
        "Benchmark result | %s entries processed | OneClassSVM classifier" % len(entries),
        "",
        "Mapping: %s" % util.fmtr.format_time_passed(one_hot_start - map_start),
        "One-hot: %s" % util.fmtr.format_time_passed(finished - one_hot_start)
    ]

    experiment.add_result_file("time_map_vs_onehot", timing_lines)
def proc_log_string(orig_log_string):
    """
    Process one log line.

    Lines without a COLOUR app id are returned unchanged. For COLOUR
    entries the RGB message is validated; known "normal" colours are
    passed through, known "red" colours get their intrusion field
    relabelled and the rebuilt log string is returned.

    Raises RuntimeError for out-of-range/non-integral channel values and
    NotImplementedError for colours not in either known set.
    """
    # Cheap pre-filter before paying for a full parse.
    if "COLOUR" not in orig_log_string:
        return orig_log_string

    log_entry = LogEntry.from_log_string(orig_log_string)
    if log_entry.data[LogEntry.APP_ID_FIELD] != "COLOUR":
        return orig_log_string

    message = log_entry.data[LogEntry.LOG_MESSAGE_FIELD]
    r, g, b = [float(x) for x in message.split(",")]

    # Each channel must be an integral value in [0, 255]. The previous
    # "num not in range(0, 256)" forced a linear scan over the range for
    # every float; this states the same contract directly.
    if any(not (num.is_integer() and 0 <= num <= 255) for num in (r, g, b)):
        raise RuntimeError("File corrupted! Invalid colour in line: %s" % orig_log_string)

    if (r, g, b) in [(150, 140, 200), (170, 250, 140), (120, 180, 130), (120, 180, 200)]:
        # Sanity check: these colours are expected to be labelled "normal".
        assert (log_entry.intrusion == "normal")
        return orig_log_string
    elif (r, g, b) in [(255, 0, 0), (200, 50, 50), (170, 80, 80)]:
        log_entry.set_any(intrusion="red")
        new_log_string = log_entry.get_log_string()
        return new_log_string
    else:
        raise NotImplementedError("Unexpected colour in line: %s" % orig_log_string)
def _get_log_entries_from_file(file_path, limit):
    """ Read up to <limit> number of log entries from the given file. """
    return [LogEntry.from_log_string(line)
            for line in Dir.yield_lines(file_path, limit)]
def _detect_type(first_line):
    """ Detect the file type from the first line. """
    # An exact header match identifies an IDSE file.
    if first_line == HEADER:
        return FileType.IDSE_FILE

    # Otherwise the line must parse as a raw log entry.
    try:
        LogEntry.from_log_string(first_line)
    except ValueError:
        raise ValueError("Invalid file given! Can't parse:\n%s" % first_line)

    return FileType.LOG_FILE
def reservoir_sample_limit(item_generator, sample_size, limit_to):
    """
    Sample with 'Reservoir Sampling' from the given generator the given
    number of elements.

    *limit_to: List of app ids to limit to; lines with other app ids are
    skipped before sampling.

    Raises ValueError if limit_to contains an unknown app id.
    """
    # Hoist both sides of the membership checks into sets: the original
    # called ids_data.get_app_ids() once per limit element and did a
    # list-membership test per input line.
    allowed_app_ids = set(limit_to)
    if not allowed_app_ids.issubset(ids_data.get_app_ids()):
        raise ValueError("Given limits are invalid: %s" % limit_to)

    limited_generator = (
        line for line in item_generator
        # Convert line to LogEntry, check that app_id is in the allowed limits
        if log_entry_to_app_id(LogEntry.from_log_string(line)) in allowed_app_ids)

    return reservoir_sample(limited_generator, sample_size)
def handle_all(experiment):
    """ Full flow for a one-fits-all classifier. """
    from ids.TEMP_IDS_CONVERTER import IdsConverter as TEMPCONVERTER
    converter = TEMPCONVERTER()

    log_entries = [LogEntry.from_log_string(line)
                   for line in Dir.yield_lines(experiment.file_path, ITEM_LIMIT)]

    all_entries = converter.LOG_ENTRIES_TO_IDS_ENTRIES(log_entries, binary=True)

    training_entries, scoring_entries = ids_tools.ids_entries_to_train_test(all_entries)
    X_train, _ = IdsConverter().ids_entries_to_X_y(training_entries)

    # Group the scoring entries by their app id.
    scoring_dict = {}
    for ids_entry in scoring_entries:
        scoring_dict.setdefault(ids_entry.app_id, []).append(ids_entry)

    # Train every classifier once on the combined training set.
    classifiers = [sk_svm.OneClassSVM(), sk_ens.IsolationForest()]
    for classifier in classifiers:
        classifier.fit(X_train)

    # Score per app id, in deterministic key order.
    for app_id, app_entries in util.seqr.yield_items_in_key_order(scoring_dict):
        X_test, y_true = IdsConverter().ids_entries_to_X_y(app_entries)
        for clf in classifiers:
            experiment.visualise_store("ALL", app_id, clf, y_true, clf.predict(X_test))
def analyse(file_path, to_file, output_printer):
    """
    Analyse the given log file and print summary tables (entries per app
    id/class, metrics per class, duplicates per app id) plus dispersion and
    duplicate scores.

    file_path:      Path of the log file to analyse.
    to_file:        If truthy, store the analysis next to the input file as
                    "<file_path>.analysis" instead of printing it.
    output_printer: Printer used for progress and (if not to_file) results.

    Raises IOError if the output file already exists, NotImplementedError
    for unsupported file types.
    """

    # Check output file if requested #
    output_path = file_path + ".analysis"

    if to_file and os.path.lexists(output_path):
        raise IOError("Output file {} exists already! (Re)Move it and try again.".format(output_path))

    output_printer.prt("Analysing...")

    # Get file access #
    file_type = idse_dao.detect_type(file_path)
    if file_type == idse_dao.FileType.IDSE_FILE:
        print("Can't analyse IDSE files!")
        return
    elif file_type != idse_dao.FileType.LOG_FILE:
        raise NotImplementedError("File type \"%s\" not implemented!" % file_type)

    # Lazy: entries are parsed as analyse_entries consumes the generator.
    log_entry_generator = (LogEntry.from_log_string(line) for line in Dir.yield_lines(file_path))

    # Analysis #
    all_app_ids = ids_data.get_app_ids()
    all_classes = ids_data.get_labels()

    (
        total_entries, found_app_ids, entry_count_per_app_id, elements_per_class_per_app_id,
        found_classes, entry_count_per_class, app_ids_per_class, duplicate_elements_per_app_id,
        scorable_app_ids, dispersion_index, duplicate_index
    ) = analyse_entries(log_entry_generator)

    # Output #
    # When writing to file, collect messages in a Storer instead of printing.
    printer = output_printer
    if to_file:
        printer = util.prtr.Storer()

    # Pluralisation helper: returns the suffix only for len > 1.
    get_pl = lambda s, obj: s if len(obj) > 1 else ""
    total_line_name = "<total>"

    if not to_file:
        printer.prt("")

    printer.prt("Analysis {}: Found {:,} entries with {}/{} app id{} and {}/{} class{}".format(
        VERSION, total_entries, len(found_app_ids), len(all_app_ids),
        get_pl("s", found_app_ids), len(found_classes), len(all_classes),
        get_pl("es", found_classes))
    )

    # "Elements and classes per app ID" table
    per_app_id = []
    per_app_id.append(["App ID", "Elements", "El. %"] + all_classes)

    total_entries_assertion = 0
    for app_id in all_app_ids:
        total_entries_assertion += entry_count_per_app_id[app_id]

        line = [
            app_id,
            "{:,}".format(entry_count_per_app_id[app_id]),
            util.fmtr.format_percentage(entry_count_per_app_id[app_id] / float(total_entries), True, 2)
        ]

        # One column per class; empty string when this app id has no such class.
        for a_class in all_classes:
            class_count_str = ""
            if a_class in elements_per_class_per_app_id[app_id]:
                class_count_str = "{:,}".format(elements_per_class_per_app_id[app_id][a_class])
            line.append(class_count_str)

        per_app_id.append(line)

    # Sanity: per-app counts must add up to the reported total.
    assert(total_entries == total_entries_assertion)

    empty_line = [""] * (3 + len(all_classes))
    per_app_id.append(empty_line)

    total_line = [
        total_line_name,
        "{:,}".format(total_entries),
        util.fmtr.format_percentage(1, True, 2)
    ]
    for a_class in all_classes:
        total_line.append("{:,}".format(entry_count_per_class[a_class]))
    per_app_id.append(total_line)

    util.outp.print_table(per_app_id, headline="Elements and classes per app ID", printer=printer)

    # "per class" table
    per_class = []
    per_class.append([""] + all_classes)

    app_ids_line = ["App IDs"]
    percent_line = ["Percentage"]
    for a_class in all_classes:
        app_ids_line.append(
            len(app_ids_per_class[a_class]))
        percent_line.append(
            util.fmtr.format_percentage(entry_count_per_class[a_class] / float(total_entries), False, 2))

    per_class.append(app_ids_line)
    per_class.append(percent_line)

    util.outp.print_table(per_class, headline="Metrics per class", printer=printer)

    # "Duplicates per app ID" table
    duplicates = []
    duplicates.append(["App ID", "All", "Unique", "Duplicates", "Duplicate %"])

    total_number_of_duplicates = 0
    total_entries_assertion = 0
    for app_id in all_app_ids:
        # NOTE(review): duplicate_elements_per_app_id values appear to be
        # dicts with "uniq"/"dupe" counts — defined in analyse_entries.
        result = duplicate_elements_per_app_id[app_id]
        unique_count = result["uniq"]
        duplicate_count = result["dupe"]
        all_count = unique_count + duplicate_count

        total_number_of_duplicates += duplicate_count
        total_entries_assertion += all_count

        # Guard against division by zero for app ids with no entries.
        duplicate_percent = 0
        if all_count > 0:
            duplicate_percent = float(duplicate_count) / all_count
        duplicate_percent_str = util.fmtr.format_percentage(duplicate_percent, True, 3)

        new_line = [app_id]
        new_line.extend(["{:,}".format(x) for x in [all_count, unique_count, duplicate_count]])
        new_line.append(duplicate_percent_str)
        duplicates.append(new_line)

    assert(total_entries == total_entries_assertion)

    # Don't output table if there are no duplicates
    if total_number_of_duplicates == 0:
        printer.prt("\nDuplicate analysis: No duplicates found!")
    else:
        empty_line = [""] * 5
        duplicates.append(empty_line)

        total_duplicate_percent = float(total_number_of_duplicates) / total_entries
        total_line = [total_line_name]
        total_line.extend([
            "{:,}".format(x)
            for x in [total_entries, total_entries - total_number_of_duplicates, total_number_of_duplicates]
        ])
        total_line.append(util.fmtr.format_percentage(total_duplicate_percent, True, 3))
        duplicates.append(total_line)

        util.outp.print_table(duplicates, headline="Duplicates per app ID", printer=printer)

    printer.prt("\nScores for %s scorable app ids: Dispersion index = %s | Duplicate index = %s"
        % (len(scorable_app_ids), round(dispersion_index, 3), round(duplicate_index, 3))
    )
    printer.prt("Scorable app ids: %s" % scorable_app_ids)

    # Flush the stored messages to disk when file output was requested.
    if to_file:
        with open(output_path, "w") as output_file:
            for line in printer.get_messages():
                output_file.write(line + "\n")

        output_printer.prt("Successfully saved analysis to \"{}\".".format(output_path))

    # harmonious? all labelled / some / none?
    # for each app id: are there roughly the same number of entries per class?
    return
def _yield_log_entries_from_file(file_path):
    """ Lazily yield one parsed LogEntry per line of the given file. """
    yield from (LogEntry.from_log_string(raw_line)
                for raw_line in Dir.yield_lines(file_path))