Example #1
def _read_log_lines_then_yield(yielder, first_line):
    """ Read all provided log lines from the given yielder. """

    first_entry = LogEntry.from_log_string(first_line)
    log_entries = [first_entry]
    for line in yielder:
        log_entry = LogEntry.from_log_string(line)
        log_entries.append(log_entry)

    ids_entry_dict = IdsConverter().log_entries_to_ids_entries_dict(
        log_entries)

    for app_entries in ids_entry_dict.values():
        for ids_entry in app_entries:
            yield ids_entry
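A minimal usage sketch, assuming `yielder` can be any iterator of raw log strings (the file name below is illustrative; `LogEntry` and `IdsConverter` come from the surrounding project):

# Hypothetical caller: stream a log file through the helper.
with open("some.log") as log_file:  # illustrative path
    first_line = next(log_file)
    for ids_entry in _read_log_lines_then_yield(log_file, first_line):
        print(ids_entry)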
Example #2
    def run(experiment):

        log_entries = []

        for line in Dir.yield_lines(experiment.file_path, ITEM_LIMIT):
            log_entry = LogEntry.from_log_string(line)
            log_entries.append(log_entry)

        experiment.entries = log_entries

        # Exp 1: map
        time_before_map = time.time()
        OneHotVsMapping.handle_log_entries("MAP", OneHotVsMappingConverter(),
                                           log_entries, experiment)
        # Exp 2: one-hot
        time_after_map_before_one_hot = time.time()
        OneHotVsMapping.handle_log_entries("OHOT", IdsConverter(), log_entries,
                                           experiment)
        time_after_all = time.time()

        time_for_map = time_after_map_before_one_hot - time_before_map
        time_for_one_hot = time_after_all - time_after_map_before_one_hot

        timing_lines = [
            "Benchmark result | %s entries processed | OneClassSVM classifier"
            % len(log_entries),
            "",
            "Mapping: %s" % util.fmtr.format_time_passed(time_for_map),
            "One-hot: %s" % util.fmtr.format_time_passed(time_for_one_hot)
        ]

        experiment.add_result_file("time_map_vs_onehot", timing_lines)
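Note that `time.time()` reads the wall clock, which can jump if the system clock is adjusted; for interval measurements like the one above, `time.perf_counter()` is the monotonic choice. A minimal sketch of the same timing pattern (the helper name is hypothetical):

import time

def timed(label, func, *args):
    # Measure an interval with the monotonic performance counter.
    start = time.perf_counter()
    result = func(*args)
    print("%s took %.3f s" % (label, time.perf_counter() - start))
    return result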
Example #3
def proc_log_string(orig_log_string):
    """ Process a line. """

    if "COLOUR" not in orig_log_string:
        return orig_log_string

    log_entry = LogEntry.from_log_string(orig_log_string)
    if log_entry.data[LogEntry.APP_ID_FIELD] != "COLOUR":
        return orig_log_string

    message = log_entry.data[LogEntry.LOG_MESSAGE_FIELD]
    r, g, b = [float(x) for x in message.split(",")]

    if any(num not in range(0, 256) for num in (r, g, b)):
        raise RuntimeError("Invalid colour value in line! File corrupted.")

    if (r, g, b) in [(150, 140, 200), (170, 250, 140), (120, 180, 130),
                     (120, 180, 200)]:
        assert log_entry.intrusion == "normal"
        return orig_log_string
    elif (r, g, b) in [(255, 0, 0), (200, 50, 50), (170, 80, 80)]:
        log_entry.set_any(intrusion="red")
        new_log_string = log_entry.get_log_string()
        return new_log_string
    else:
        raise NotImplementedError("Unexpected colour in line: %s" %
                                  orig_log_string)
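The `num not in range(0, 256)` check works for floats because `range` membership falls back to equality comparison: integral floats pass, fractional or out-of-range values fail. A quick illustration:

# range() membership compares by equality, so integral floats pass:
assert 150.0 in range(0, 256)
# ...while fractional or out-of-range values are rejected:
assert 150.5 not in range(0, 256)
assert 300.0 not in range(0, 256)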
Example #4
def _get_log_entries_from_file(file_path, limit):
    """ Read up to <limit> number of log entries from the given file. """

    log_entries = []

    for line in Dir.yield_lines(file_path, limit):
        log_entries.append(LogEntry.from_log_string(line))

    return log_entries
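Without the project's `Dir` helper, the standard library offers the same capped read for plain text files; a sketch assuming one log entry per line:

from itertools import islice

def head_lines(file_path, limit):
    # Read at most <limit> lines without loading the whole file.
    with open(file_path) as input_file:
        return [line.rstrip("\n") for line in islice(input_file, limit)]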
Example #5
def _detect_type(first_line):
    """ Detect the file type from the first line. """

    if first_line == HEADER:
        return FileType.IDSE_FILE

    try:
        _ = LogEntry.from_log_string(first_line)
        return FileType.LOG_FILE
    except ValueError:
        raise ValueError("Invalid file given! Can't parse:\n%s" % first_line)
Example #6
def reservoir_sample_limit(item_generator, sample_size, limit_to):
    """
	Sample with 'Reservoir Sampling' from the given generator the given number of elements.
	*limit_to: List of data types to limit to.
	"""

    if any(app_id not in ids_data.get_app_ids() for app_id in limit_to):
        raise ValueError("Given limits are invalid: %s" % limit_to)

    limited_generator = (
        line for line in item_generator
        # Convert line to LogEntry, check that app_id is in the allowed limits
        if log_entry_to_app_id(LogEntry.from_log_string(line)) in limit_to)

    return reservoir_sample(limited_generator, sample_size)
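The `reservoir_sample` helper is not shown here; a minimal sketch of the classic Algorithm R it presumably implements, which ends up keeping each generator element with equal probability:

import random

def reservoir_sample(item_generator, sample_size):
    # Algorithm R: fill the reservoir, then replace with decreasing probability.
    reservoir = []
    for index, item in enumerate(item_generator):
        if index < sample_size:
            reservoir.append(item)
        else:
            slot = random.randint(0, index)  # inclusive bounds
            if slot < sample_size:
                reservoir[slot] = item
    return reservoir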
Example #7
    def handle_all(experiment):
        """ Full flow for a one-fits-all classifier. """

        from ids.TEMP_IDS_CONVERTER import IdsConverter as TEMPCONVERTER
        converter = TEMPCONVERTER()
        log_entries = []

        for line in Dir.yield_lines(experiment.file_path, ITEM_LIMIT):
            log_entry = LogEntry.from_log_string(line)
            log_entries.append(log_entry)

        all_entries = converter.LOG_ENTRIES_TO_IDS_ENTRIES(log_entries,
                                                           binary=True)

        training_entries, scoring_entries = ids_tools.ids_entries_to_train_test(
            all_entries)
        X_train, _ = IdsConverter().ids_entries_to_X_y(training_entries)

        scoring_dict = {}
        for ids_entry in scoring_entries:
            scoring_dict.setdefault(ids_entry.app_id, []).append(ids_entry)

        # Train each classifier on all entries: training_entries
        classifiers = [sk_svm.OneClassSVM(), sk_ens.IsolationForest()]
        for classifier in classifiers:
            classifier.fit(X_train)

        # Score for each app: scoring_dict
        for app_id, app_entries in util.seqr.yield_items_in_key_order(
                scoring_dict):
            X_test, y_true = IdsConverter().ids_entries_to_X_y(app_entries)
            y_preds = [clf.predict(X_test) for clf in classifiers]
            for clf, y_pred in zip(classifiers, y_preds):
                experiment.visualise_store("ALL", app_id, clf, y_true, y_pred)
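Both `OneClassSVM` and `IsolationForest` label inliers as +1 and outliers as -1 in `predict`, so the per-app scores can be condensed into a plain accuracy; a sketch assuming `y_true` uses the same +1/-1 convention:

import numpy as np

def outlier_accuracy(y_true, y_pred):
    # Fraction of entries where the +1/-1 prediction matches the label.
    return float(np.mean(np.asarray(y_true) == np.asarray(y_pred)))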
Example #8
def analyse(file_path, to_file, output_printer):
	""" Analyse the given log file. """

	# Check output file if requested #

	output_path = file_path + ".analysis"

	if to_file and os.path.lexists(output_path):
		raise IOError("Output file {} exists already! (Re)Move it and try again.".format(output_path))

	output_printer.prt("Analysing...")

	# Get file access #

	file_type = idse_dao.detect_type(file_path)
	if file_type == idse_dao.FileType.IDSE_FILE:
		print("Can't analyse IDSE files!")
		return
	elif file_type != idse_dao.FileType.LOG_FILE:
		raise NotImplementedError("File type \"%s\" not implemented!" % file_type)

	log_entry_generator = (LogEntry.from_log_string(line) for line in Dir.yield_lines(file_path))

	# Analysis #

	all_app_ids = ids_data.get_app_ids()
	all_classes = ids_data.get_labels()

	(
		total_entries, found_app_ids, entry_count_per_app_id, elements_per_class_per_app_id,
		found_classes, entry_count_per_class, app_ids_per_class, duplicate_elements_per_app_id,
		scorable_app_ids, dispersion_index, duplicate_index
	) = analyse_entries(log_entry_generator)

	# Output #

	printer = output_printer
	if to_file:
		printer = util.prtr.Storer()

	get_pl = lambda s, obj: s if len(obj) > 1 else ""
	total_line_name = "<total>"

	if not to_file:
		printer.prt("")

	printer.prt("Analysis {}: Found {:,} entries with {}/{} app id{} and {}/{} class{}".format(
		VERSION, total_entries,
		len(found_app_ids), len(all_app_ids), get_pl("s", found_app_ids),
		len(found_classes), len(all_classes), get_pl("es", found_classes))
	)

	# "Elements and classes per app ID" table
	per_app_id = []
	per_app_id.append(["App ID", "Elements", "El. %"] + all_classes)
	total_entries_assertion = 0
	for app_id in all_app_ids:
		total_entries_assertion += entry_count_per_app_id[app_id]

		line = [
			app_id,
			"{:,}".format(entry_count_per_app_id[app_id]),
			util.fmtr.format_percentage(entry_count_per_app_id[app_id] / float(total_entries), True, 2)
		]

		for a_class in all_classes:
			class_count_str = ""
			if a_class in elements_per_class_per_app_id[app_id]:
				class_count_str = "{:,}".format(elements_per_class_per_app_id[app_id][a_class])

			line.append(class_count_str)

		per_app_id.append(line)

	assert total_entries == total_entries_assertion

	empty_line = [""] * (3 + len(all_classes))
	per_app_id.append(empty_line)

	total_line = [
		total_line_name,
		"{:,}".format(total_entries),
		util.fmtr.format_percentage(1, True, 2)
	]
	for a_class in all_classes:
		total_line.append("{:,}".format(entry_count_per_class[a_class]))
	per_app_id.append(total_line)

	util.outp.print_table(per_app_id, headline="Elements and classes per app ID", printer=printer)

	# "per class" table
	per_class = []
	per_class.append([""] + all_classes)
	app_ids_line = ["App IDs"]
	percent_line = ["Percentage"]
	for a_class in all_classes:
		app_ids_line.append(len(app_ids_per_class[a_class]))
		percent_line.append(
			util.fmtr.format_percentage(entry_count_per_class[a_class] / float(total_entries), False, 2))

	per_class.append(app_ids_line)
	per_class.append(percent_line)

	util.outp.print_table(per_class, headline="Metrics per class", printer=printer)

	# "Duplicates per app ID" table
	duplicates = []
	duplicates.append(["App ID", "All", "Unique", "Duplicates", "Duplicate %"])
	total_number_of_duplicates = 0
	total_entries_assertion = 0
	for app_id in all_app_ids:
		result = duplicate_elements_per_app_id[app_id]
		unique_count = result["uniq"]
		duplicate_count = result["dupe"]
		all_count = unique_count + duplicate_count

		total_number_of_duplicates += duplicate_count
		total_entries_assertion += all_count

		duplicate_percent = 0
		if all_count > 0:
			duplicate_percent = float(duplicate_count) / all_count
		duplicate_percent_str = util.fmtr.format_percentage(duplicate_percent, True, 3)

		new_line = [app_id]
		new_line.extend(["{:,}".format(x) for x in [all_count, unique_count, duplicate_count]])
		new_line.append(duplicate_percent_str)
		duplicates.append(new_line)

	assert total_entries == total_entries_assertion

	# Don't output table if there are no duplicates
	if total_number_of_duplicates == 0:
		printer.prt("\nDuplicate analysis: No duplicates found!")
	else:
		empty_line = [""] * 5
		duplicates.append(empty_line)

		total_duplicate_percent = float(total_number_of_duplicates) / total_entries
		total_line = [total_line_name]
		total_line.extend([
			"{:,}".format(x) for x in
			[total_entries, total_entries - total_number_of_duplicates, total_number_of_duplicates]
		])
		total_line.append(util.fmtr.format_percentage(total_duplicate_percent, True, 3))
		duplicates.append(total_line)

		util.outp.print_table(duplicates, headline="Duplicates per app ID", printer=printer)

	printer.prt("\nScores for %s scorable app ids: Dispersion index = %s | Duplicate index = %s"
		% (len(scorable_app_ids), round(dispersion_index, 3), round(duplicate_index, 3))
	)
	printer.prt("Scorable app ids: %s" % scorable_app_ids)

	if to_file:
		with open(output_path, "w") as output_file:
			for line in printer.get_messages():
				output_file.write(line + "\n")

		output_printer.prt("Successfully saved analysis to \"{}\".".format(output_path))

	# TODO: harmonious? all labelled / some / none?
	# TODO: for each app ID: are there roughly the same number of entries per class?
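The `util.prtr.Storer` used above is only exercised through `prt` and `get_messages`; a minimal stand-in matching that observed interface (an assumption based solely on this usage, not the real class):

class StorerSketch:
    """ Collects printed lines instead of writing them to stdout. """

    def __init__(self):
        self._messages = []

    def prt(self, message):
        # Store the message for later retrieval.
        self._messages.append(message)

    def get_messages(self):
        return list(self._messages)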
Example #9
def _yield_log_entries_from_file(file_path):
    """ Lazily yield a LogEntry for each line in the given file. """

    for line in Dir.yield_lines(file_path):
        yield LogEntry.from_log_string(line)