Example #1
    def run(experiment):

        log_entries = []

        for line in Dir.yield_lines(experiment.file_path, ITEM_LIMIT):
            log_entry = LogEntry.from_log_string(line)
            log_entries.append(log_entry)

        experiment.entries = log_entries

        # Exp 1: map
        time_before_map = time.time()
        OneHotVsMapping.handle_log_entries("MAP", OneHotVsMappingConverter(),
                                           log_entries, experiment)
        # Exp 2: one-hot
        time_after_map_before_one_hot = time.time()
        OneHotVsMapping.handle_log_entries("OHOT", IdsConverter(), log_entries,
                                           experiment)
        time_after_all = time.time()

        time_for_map = time_after_map_before_one_hot - time_before_map
        time_for_one_hot = time_after_all - time_after_map_before_one_hot

        timing_lines = [
            "Benchmark result | %s entries processed | OneClassSVM classifier" % len(log_entries),
            "",
            "Mapping: %s" % util.fmtr.format_time_passed(time_for_map),
            "One-hot: %s" % util.fmtr.format_time_passed(time_for_one_hot)
        ]

        experiment.add_result_file("time_map_vs_onehot", timing_lines)
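The method brackets each phase with time.time() calls. A self-contained sketch of the same wall-clock timing idiom (time_phase, the label, and the workload are hypothetical):

import time

def time_phase(label, work):
    """ Run a callable once and print the elapsed wall-clock time. """
    time_before = time.time()
    work()
    print("%s took %.3f s" % (label, time.time() - time_before))

time_phase("MAP", lambda: sum(range(1000000)))  # hypothetical workload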
Example #2
def _get_log_entries_from_file(file_path, limit):
    """ Read up to <limit> number of log entries from the given file. """

    log_entries = []

    for line in Dir.yield_lines(file_path, limit):
        log_entries.append(LogEntry.from_log_string(line))

    return log_entries
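A minimal usage sketch, assuming the Dir and LogEntry helpers are in scope and using a hypothetical file path:

entries = _get_log_entries_from_file("data/access.log", limit=100)
print("Read %s log entries" % len(entries))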
Example #3
def yield_entries(file_path, limit=None):
    """
	Yield IdsEntry objects from the given file. First access on log files is costly!
	*limit: Optional maximum number of entries to retrieve.
	"""

    if not os.path.lexists(file_path):
        _raise_file_doesnt_exist(file_path)

    yielder = Dir.yield_lines(file_path, limit)

    first_line = next(yielder)
    file_type = _detect_type(first_line)

    if file_type == FileType.IDSE_FILE:
        return _yield_idse_lines(yielder)
    elif file_type == FileType.LOG_FILE:
        return _read_log_lines_then_yield(yielder, first_line)
    else:
        raise NotImplementedError("File type not implemented: %s" % file_type)
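Both branches hand back a generator, so callers iterate lazily whatever the detected type. A consumption sketch with a hypothetical path:

for ids_entry in yield_entries("data/traffic.log", limit=10):
    print(ids_entry)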
Example #4
def _sample(file_path, number_of_elements, limit_to):
	""" Sample <number_of_elements> from the given file. """

	print("Sampling...")

	target_file_path = "%s_%s-sample" % (file_path, number_of_elements)

	if not os.path.lexists(file_path):
		raise IOError("Input file doesn't exist")

	target_file_path = Dir.uniquify(target_file_path)

	line_generator = Dir.yield_lines(file_path)

	if limit_to is None:
		log_lines = ids_tools.reservoir_sample(line_generator, number_of_elements)
	else:
		log_lines = ids_tools.reservoir_sample_limit(line_generator, number_of_elements, limit_to)

	Dir.write_lines(target_file_path, log_lines)

	print("Done. Wrote to file:\n%s" % target_file_path)
Example #5
    def handle_all(experiment):
        """ Full flow for a one-fits-all classifier. """

        from ids.TEMP_IDS_CONVERTER import IdsConverter as TEMPCONVERTER
        converter = TEMPCONVERTER()
        log_entries = []

        for line in Dir.yield_lines(experiment.file_path, ITEM_LIMIT):
            log_entry = LogEntry.from_log_string(line)
            log_entries.append(log_entry)

        all_entries = converter.LOG_ENTRIES_TO_IDS_ENTRIES(log_entries,
                                                           binary=True)

        training_entries, scoring_entries = ids_tools.ids_entries_to_train_test(
            all_entries)
        X_train, _ = IdsConverter().ids_entries_to_X_y(training_entries)

        # Group the scoring entries by app id
        scoring_dict = {}
        for ids_entry in scoring_entries:
            if ids_entry.app_id not in scoring_dict:
                scoring_dict[ids_entry.app_id] = []
            scoring_dict[ids_entry.app_id].append(ids_entry)

        # Classify with all entries: training_entries
        classifiers = [sk_svm.OneClassSVM(), sk_ens.IsolationForest()]
        for classifier in classifiers:
            classifier.fit(X_train)

        # Score for each app: scoring_dict
        for app_id, app_entries in util.seqr.yield_items_in_key_order(
                scoring_dict):
            X_test, y_true = IdsConverter().ids_entries_to_X_y(app_entries)
            y_preds = [clf.predict(X_test) for clf in classifiers]
            for clf, y_pred in zip(classifiers, y_preds):
                experiment.visualise_store("ALL", app_id, clf, y_true, y_pred)
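The classifiers are plain scikit-learn estimators: fit() learns the support of the training data and predict() labels points +1 (inlier) or -1 (outlier). A tiny standalone sketch on toy data:

import numpy as np
from sklearn import svm

X_train = np.array([[0.0], [0.1], [0.2], [0.1]])
clf = svm.OneClassSVM(nu=0.1)
clf.fit(X_train)
print(clf.predict(np.array([[0.15], [5.0]])))  # likely [ 1 -1]: inlier, then outlier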
Example #6
def analyse(file_path, to_file, output_printer):
	""" Analyse the given log file. """

	# Check output file if requested #

	output_path = file_path + ".analysis"

	if to_file and os.path.lexists(output_path):
		raise IOError("Output file {} exists already! (Re)Move it and try again.".format(output_path))

	output_printer.prt("Analysing...")

	# Get file access #

	file_type = idse_dao.detect_type(file_path)
	if file_type == idse_dao.FileType.IDSE_FILE:
		print("Can't analyse IDSE files!")
		return
	elif file_type != idse_dao.FileType.LOG_FILE:
		raise NotImplementedError("File type \"%s\" not implemented!" % file_type)

	log_entry_generator = (LogEntry.from_log_string(line) for line in Dir.yield_lines(file_path))

	# Analysis #

	all_app_ids = ids_data.get_app_ids()
	all_classes = ids_data.get_labels()

	(
		total_entries, found_app_ids, entry_count_per_app_id, elements_per_class_per_app_id,
		found_classes, entry_count_per_class, app_ids_per_class, duplicate_elements_per_app_id,
		scorable_app_ids, dispersion_index, duplicate_index
	) = analyse_entries(log_entry_generator)

	# Output #

	printer = output_printer
	if to_file:
		printer = util.prtr.Storer()

	get_pl = lambda s, obj: s if len(obj) > 1 else ""
	total_line_name = "<total>"

	if not to_file:
		printer.prt("")

	printer.prt("Analysis {}: Found {:,} entries with {}/{} app id{} and {}/{} class{}".format(
		VERSION, total_entries,
		len(found_app_ids), len(all_app_ids), get_pl("s", found_app_ids),
		len(found_classes), len(all_classes), get_pl("es", found_classes))
	)

	# "Elements and classes per app ID" table
	per_app_id = []
	per_app_id.append(["App ID", "Elements", "El. %"] + all_classes)
	total_entries_assertion = 0
	for app_id in all_app_ids:
		total_entries_assertion += entry_count_per_app_id[app_id]

		line = [
			app_id,
			"{:,}".format(entry_count_per_app_id[app_id]),
			util.fmtr.format_percentage(entry_count_per_app_id[app_id] / float(total_entries), True, 2)
		]

		for a_class in all_classes:
			class_count_str = ""
			if a_class in elements_per_class_per_app_id[app_id]:
				class_count_str = "{:,}".format(elements_per_class_per_app_id[app_id][a_class])

			line.append(class_count_str)

		per_app_id.append(line)

	assert total_entries == total_entries_assertion

	empty_line = [""] * (3 + len(all_classes))
	per_app_id.append(empty_line)

	total_line = [
		total_line_name,
		"{:,}".format(total_entries),
		util.fmtr.format_percentage(1, True, 2)
	]
	for a_class in all_classes:
		total_line.append("{:,}".format(entry_count_per_class[a_class]))
	per_app_id.append(total_line)

	util.outp.print_table(per_app_id, headline="Elements and classes per app ID", printer=printer)

	# "per class" table
	per_class = []
	per_class.append([""] + all_classes)
	app_ids_line = ["App IDs"]
	percent_line = ["Percentage"]
	for a_class in all_classes:
		app_ids_line.append(
			len(app_ids_per_class[a_class]))
		percent_line.append(
			util.fmtr.format_percentage(entry_count_per_class[a_class] / float(total_entries), False, 2))

	per_class.append(app_ids_line)
	per_class.append(percent_line)

	util.outp.print_table(per_class, headline="Metrics per class", printer=printer)

	# "Duplicates per app ID" table
	duplicates = []
	duplicates.append(["App ID", "All", "Unique", "Duplicates", "Duplicate %"])
	total_number_of_duplicates = 0
	total_entries_assertion = 0
	for app_id in all_app_ids:
		result = duplicate_elements_per_app_id[app_id]
		unique_count = result["uniq"]
		duplicate_count = result["dupe"]
		all_count = unique_count + duplicate_count

		total_number_of_duplicates += duplicate_count
		total_entries_assertion += all_count

		duplicate_percent = 0
		if all_count > 0:
			duplicate_percent = float(duplicate_count) / all_count
		duplicate_percent_str = util.fmtr.format_percentage(duplicate_percent, True, 3)

		new_line = [app_id]
		new_line.extend(["{:,}".format(x) for x in [all_count, unique_count, duplicate_count]])
		new_line.append(duplicate_percent_str)
		duplicates.append(new_line)

	assert total_entries == total_entries_assertion

	# Don't output table if there are no duplicates
	if total_number_of_duplicates == 0:
		printer.prt("\nDuplicate analysis: No duplicates found!")
	else:
		empty_line = [""] * 5
		duplicates.append(empty_line)

		total_duplicate_percent = float(total_number_of_duplicates) / total_entries
		total_line = [total_line_name]
		total_line.extend([
			"{:,}".format(x) for x in
			[total_entries, total_entries - total_number_of_duplicates, total_number_of_duplicates]
		])
		total_line.append(util.fmtr.format_percentage(total_duplicate_percent, True, 3))
		duplicates.append(total_line)

		util.outp.print_table(duplicates, headline="Duplicates per app ID", printer=printer)

	printer.prt("\nScores for %s scorable app ids: Dispersion index = %s | Duplicate index = %s"
		% (len(scorable_app_ids), round(dispersion_index, 3), round(duplicate_index, 3))
	)
	printer.prt("Scorable app ids: %s" % scorable_app_ids)

	if to_file:
		with open(output_path, "w") as output_file:
			for line in printer.get_messages():
				output_file.write(line + "\n")

		output_printer.prt("Successfully saved analysis to \"{}\".".format(output_path))

	# harmonious? all labelled / some / none?
	# for each app id: are there roughly the same number of entries per class?
	return
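The tables above are rendered by util.outp.print_table, whose implementation isn't shown here. A hypothetical stand-in illustrating the call shape (rows of equal length, an optional headline, and an optional printer exposing a prt() method):

def print_table_sketch(rows, headline=None, printer=None):
    """ Hypothetical stand-in for util.outp.print_table: pad columns, emit one line per row. """
    widths = [max(len(str(row[col])) for row in rows) for col in range(len(rows[0]))]
    lines = [headline] if headline is not None else []
    for row in rows:
        lines.append(" | ".join(str(cell).ljust(width) for cell, width in zip(row, widths)))
    for line in lines:
        if printer is not None:
            printer.prt(line)
        else:
            print(line)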
Example #7
def detect_type(file_path):
    """ Detect the file type of the file. """

    first_line = next(Dir.yield_lines(file_path))
    return _detect_type(first_line)
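A usage sketch, assuming the FileType constants seen in Examples #3 and #6 and a hypothetical path:

file_type = detect_type("data/traffic.log")
if file_type == FileType.LOG_FILE:
    print("Plain log file detected")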
Example #8
def _yield_log_entries_from_file(file_path):
    for line in Dir.yield_lines(file_path):
        yield LogEntry.from_log_string(line)
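Unlike Example #2, this variant never materialises the whole file: each entry is parsed on demand. For instance, itertools.islice can take just the first few entries without reading the rest (the path is hypothetical):

import itertools

for log_entry in itertools.islice(_yield_log_entries_from_file("data/access.log"), 3):
    print(log_entry)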
Example #9
import argparse
import os
import sys

# Dir and proc_log_string are project-level helpers assumed to be in scope.


def prexit(msg):
    """ Print the given message and exit. """
    print(msg)
    sys.exit()


### Main program ###

p = argparse.ArgumentParser()
p.add_argument("file_path", metavar="PATH/FILE", help="Log file")
args = p.parse_args()

orig_path = os.path.expanduser(args.file_path)

if not os.path.lexists(orig_path):
    prexit("File doesn't exist")

tmp_path = orig_path + "_bak"

if os.path.lexists(tmp_path):
    prexit("%s exists" % tmp_path)

os.rename(orig_path, tmp_path)

with open(orig_path, "w") as output_file:
    for line in Dir.yield_lines(tmp_path):
        processed_string = proc_log_string(line)
        output_file.write("%s\n" % processed_string)

print("Done.")
print("Wrote to: %s" % orig_path)
print("Old file: %s" % tmp_path)