Example 1
    def run(experiment):

        log_entries = []

        for line in Dir.yield_lines(experiment.file_path, ITEM_LIMIT):
            log_entry = LogEntry.from_log_string(line)
            log_entries.append(log_entry)

        experiment.entries = log_entries

        # Exp 1: map
        time_before_map = time.time()
        OneHotVsMapping.handle_log_entries("MAP", OneHotVsMappingConverter(),
                                           log_entries, experiment)
        # Exp 2: one-hot
        time_after_map_before_one_hot = time.time()
        OneHotVsMapping.handle_log_entries("OHOT", IdsConverter(), log_entries,
                                           experiment)
        time_after_all = time.time()

        time_for_map = time_after_map_before_one_hot - time_before_map
        time_for_one_hot = time_after_all - time_after_map_before_one_hot

        timing_lines = [
            "Benchmark result | %s entries processed | OneClassSVM classifier"
            % len(log_entries),
            "",
            "Mapping: %s" % util.fmtr.format_time_passed(time_for_map),
            "One-hot: %s" % util.fmtr.format_time_passed(time_for_one_hot)
        ]

        experiment.add_result_file("time_map_vs_onehot", timing_lines)
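The paired time.time() calls above work, but the bookkeeping variables pile up with each measured section. A small context manager keeps the timing out of the experiment flow; a minimal sketch (the Timer class is hypothetical, not part of this codebase):

import time


class Timer(object):
    """ Records the wall-clock duration of its with-block in self.elapsed. """

    def __enter__(self):
        self.start = time.time()
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        self.elapsed = time.time() - self.start

Usage: "with Timer() as t: ..." and read t.elapsed afterwards, instead of tracking three timestamps by hand.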
Example 2
def _get_log_entries_from_file(file_path, limit):
    """ Read up to <limit> number of log entries from the given file. """

    log_entries = []

    for line in Dir.yield_lines(file_path, limit):
        log_entries.append(LogEntry.from_log_string(line))

    return log_entries
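A hedged usage sketch (path and limit are hypothetical); compare Example 12, which yields the same entries lazily instead of materialising the whole list:

entries = _get_log_entries_from_file("data/acc.log", limit=5000)
print("Read %s log entries" % len(entries))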
Example 3
    def __init__(self, file_path, store_title):
        """ Ctor """

        object.__init__(self)

        # State
        self.title = None
        # Time
        self.start_time = time.time()
        self.end_time = None
        # Loaded entries
        self.entries = []
        # ClassifierResultGroup objects (name, classifier, result)
        self.classifier_results = []
        # OtherResult objects (file_name lines)
        self.other_result_files = []

        # StorerAndPrinter - stores and prints ;)
        time_printer = util.prtr.TimePrinter(name="exp")
        self.storer_printer = util.prtr.StorerAndPrinter(printer=time_printer)

        # Paths and name
        self.file_path = os.path.expanduser(file_path)
        self.input_file_name = os.path.basename(self.file_path)
        experiment_dir_name = None

        if not os.path.lexists(self.file_path):
            util.outp.exit_on_error("Input file not found: %s" %
                                    self.file_path)

        self.title = store_title
        if self.title is None:
            random_num_str = "".join(
                str(x) for x in random.sample(range(0, 15), 5))
            self.title = "Experiment %s" % random_num_str

        experiment_dir_name = Dir.remove_disallowed_characters(
            self.title.lower())
        experiment_dir_name += time.strftime("_%m-%d_%H-%M")
        self.experiment_dir_path = self.get_experiment_folder(
            experiment_dir_name)

        if os.path.lexists(self.experiment_dir_path):
            self.experiment_dir_path = Dir.uniquify(self.experiment_dir_path)
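Taken together, the ctor expands the user path, validates it, falls back to a random title, and derives a collision-free directory name. A hedged usage sketch (the class name Experiment is assumed from context; the path is hypothetical):

experiment = Experiment("~/data/run1.log", store_title="Map vs one-hot")
# experiment_dir_path now ends in something like "map vs one-hot_06-15_14-30",
# with disallowed characters stripped and a unique suffix added on collision.
print(experiment.experiment_dir_path)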
Example 4
def save_entries(file_path, entry_generator):
    """
	Store the entries as a file. IDS entries in IDSE files, log entries as log files.
	returns: The file path in which the file was saved.
	"""

    entries_list = list(entry_generator)
    first_entry = entries_list[0]

    # Where to store?
    file_path_full = None
    # What to store?
    lines = None
    # How to convert?
    to_line = None

    # LogEntry objects: No extension, no header, call entry.get_log_string()
    if isinstance(first_entry, LogEntry):
        file_path_full = file_path
        lines = []
        to_line = lambda entry: entry.get_log_string()
    # IdsEntry objects: IDSE extension, IDSE header and run _ids_entry_to_idse_string(entry)
    elif isinstance(first_entry, IdsEntry):
        file_path_full = add_idse_extension(file_path)
        lines = [HEADER]
        to_line = _ids_entry_to_idse_string
    else:
        raise TypeError(
            "[IDSE DAO] Given elements are neither LogEntry nor IdsEntry objects!"
        )

    if os.path.lexists(file_path_full):
        _raise_file_exists(file_path_full)

    # Actual entry -> string conversion
    lines.extend([to_line(e) for e in entries_list])
    Dir.write_lines(file_path_full, lines)

    return file_path_full
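save_entries dispatches on the type of the first element and assumes the rest of the generator is homogeneous. A hedged usage sketch (output path hypothetical; log_entries as built in Examples 1 and 2):

saved_path = save_entries("out/used_entries", iter(log_entries))
# Unchanged path for LogEntry input; the IDSE extension is added for IdsEntry input.
print("Saved to %s" % saved_path)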
Example 5
def _sample(file_path, number_of_elements, limit_to):
	""" Sample <number_of_elements> from the given file. """

	print("Sampling...")

	target_file_path = "%s_%s-sample" % (file_path, number_of_elements)

	if not os.path.lexists(file_path):
		raise IOError("Input file doesn't exist")

	target_file_path = Dir.uniquify(target_file_path)

	line_generator = Dir.yield_lines(file_path)

	log_lines = None
	if limit_to is None:
		log_lines = ids_tools.reservoir_sample(line_generator, number_of_elements)
	else:
		log_lines = ids_tools.reservoir_sample_limit(line_generator, number_of_elements, limit_to)

	Dir.write_lines(target_file_path, log_lines)

	print("Done. Wrote to file:\n%s" % target_file_path)
Example 6
def yield_entries(file_path, limit=None):
    """
	Yield IdsEntry objects from the given file. First access on log files is costly!
	*limit: Optional maximum number of entries to retrieve.
	"""

    if not os.path.lexists(file_path):
        _raise_file_doesnt_exist(file_path)

    yielder = Dir.yield_lines(file_path, limit)

    first_line = next(yielder)
    file_type = _detect_type(first_line)

    if file_type == FileType.IDSE_FILE:
        return _yield_idse_lines(yielder)
    elif file_type == FileType.LOG_FILE:
        return _read_log_lines_then_yield(yielder, first_line)
    else:
        raise NotImplementedError("File type not implemented: %s" % file_type)
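_read_log_lines_then_yield is not shown here. Since the first line was already consumed for type detection, the log branch has to re-attach it before converting; a sketch of one way to do that (the converter _log_line_to_ids_entry is a hypothetical stand-in for whatever conversion the real helper performs):

import itertools


def _read_log_lines_then_yield(yielder, first_line):
    """ Sketch: re-attach the line consumed by type detection, then convert. """
    for line in itertools.chain([first_line], yielder):
        yield _log_line_to_ids_entry(line)  # hypothetical converter, not shown in the source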
Example 7
def _train(file_path):
    """ Train the classifier with the given file. """

    print("Using file \"{}\"".format(os.path.join(os.getcwd(), file_path)))

    saved_so_far = []

    if os.path.lexists(_HISTORY_FILE):
        saved_so_far = Dir.read_lines(_HISTORY_FILE)

    if file_path in saved_so_far:
        print("This file has already been used for training." +
              " If you think this is a mistake, rename it and run again.")
        return

    log_entry_generator = _yield_log_entries_from_file(file_path)
    _train_entries(log_entry_generator)

    with open(_HISTORY_FILE, 'a') as hist_file:
        hist_file.write(file_path + "\n")
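One caveat in the history check above: file_path is compared verbatim, so "data/a.log" and its absolute form would be treated as different files. A hedged sketch of a canonicalising variant (helper name hypothetical):

import os


def _was_used_before(file_path, history_lines):
    """ Sketch: compare canonical paths instead of raw strings. """
    canonical = os.path.realpath(file_path)
    return canonical in (os.path.realpath(line.strip()) for line in history_lines)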
Example 8
    def handle_all(experiment):
        """ Full flow for a one-fits-all classifier. """

        from ids.TEMP_IDS_CONVERTER import IdsConverter as TEMPCONVERTER
        converter = TEMPCONVERTER()
        log_entries = []

        for line in Dir.yield_lines(experiment.file_path, ITEM_LIMIT):
            log_entry = LogEntry.from_log_string(line)
            log_entries.append(log_entry)

        all_entries = converter.LOG_ENTRIES_TO_IDS_ENTRIES(log_entries,
                                                           binary=True)

        training_entries, scoring_entries = ids_tools.ids_entries_to_train_test(
            all_entries)
        X_train, _ = IdsConverter().ids_entries_to_X_y(training_entries)

        scoring_dict = {}
        for ids_entry in scoring_entries:
            scoring_dict.setdefault(ids_entry.app_id, []).append(ids_entry)

        # Classify with all entries: training_entries
        classifiers = [sk_svm.OneClassSVM(), sk_ens.IsolationForest()]
        for classifier in classifiers:
            classifier.fit(X_train)

        # Score for each app: scoring_dict
        for app_id, app_entries in util.seqr.yield_items_in_key_order(
                scoring_dict):
            X_test, y_true = IdsConverter().ids_entries_to_X_y(app_entries)
            y_preds = [clf.predict(X_test) for clf in classifiers]
            for clf, y_pred in zip(classifiers, y_preds):
                experiment.visualise_store("ALL", app_id, clf, y_true, y_pred)
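Both classifiers follow scikit-learn's outlier-detection convention: fit() on (presumed) normal data only, while predict() returns +1 for inliers and -1 for outliers. A minimal self-contained sketch with toy data (parameters chosen arbitrarily):

import numpy as np
from sklearn import svm

# Toy data: a tight cluster near the origin, plus one obvious outlier to score.
X_train = np.array([[0.0, 0.1], [0.1, 0.0], [0.05, 0.05], [0.0, 0.0]])
X_test = np.array([[0.04, 0.03], [5.0, 5.0]])

clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.5)
clf.fit(X_train)
print(clf.predict(X_test))  # array of +1 (inlier) / -1 (outlier)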
Example 9
def analyse(file_path, to_file, output_printer):
	""" Analyse the given log file. """

	# Check output file if requested #

	output_path = file_path + ".analysis"

	if to_file and os.path.lexists(output_path):
		raise IOError("Output file {} exists already! (Re)Move it and try again.".format(output_path))

	output_printer.prt("Analysing...")

	# Get file access #

	file_type = idse_dao.detect_type(file_path)
	if file_type == idse_dao.FileType.IDSE_FILE:
		print("Can't analyse IDSE files!")
		return
	elif file_type != idse_dao.FileType.LOG_FILE:
		raise NotImplementedError("File type \"%s\" not implemented!" % file_type)

	log_entry_generator = (LogEntry.from_log_string(line) for line in Dir.yield_lines(file_path))

	# Analysis #

	all_app_ids = ids_data.get_app_ids()
	all_classes = ids_data.get_labels()

	(
		total_entries, found_app_ids, entry_count_per_app_id, elements_per_class_per_app_id,
		found_classes, entry_count_per_class, app_ids_per_class, duplicate_elements_per_app_id,
		scorable_app_ids, dispersion_index, duplicate_index
	) = analyse_entries(log_entry_generator)

	# Output #

	printer = output_printer
	if to_file:
		printer = util.prtr.Storer()

	get_pl = lambda s, obj: s if len(obj) > 1 else ""
	total_line_name = "<total>"

	if not to_file:
		printer.prt("")

	printer.prt("Analysis {}: Found {:,} entries with {}/{} app id{} and {}/{} class{}".format(
		VERSION, total_entries,
		len(found_app_ids), len(all_app_ids), get_pl("s", found_app_ids),
		len(found_classes), len(all_classes), get_pl("es", found_classes))
	)

	# "Elements and classes per app ID" table
	per_app_id = []
	per_app_id.append(["App ID", "Elements", "El. %"] + all_classes)
	total_entries_assertion = 0
	for app_id in all_app_ids:
		total_entries_assertion += entry_count_per_app_id[app_id]

		line = [
			app_id,
			"{:,}".format(entry_count_per_app_id[app_id]),
			util.fmtr.format_percentage(entry_count_per_app_id[app_id] / float(total_entries), True, 2)
		]

		for a_class in all_classes:
			class_count_str = ""
			if a_class in elements_per_class_per_app_id[app_id]:
				class_count_str = "{:,}".format(elements_per_class_per_app_id[app_id][a_class])

			line.append(class_count_str)

		per_app_id.append(line)

	assert total_entries == total_entries_assertion

	empty_line = [""] * (3 + len(all_classes))
	per_app_id.append(empty_line)

	total_line = [
		total_line_name,
		"{:,}".format(total_entries),
		util.fmtr.format_percentage(1, True, 2)
	]
	for a_class in all_classes:
		total_line.append("{:,}".format(entry_count_per_class[a_class]))
	per_app_id.append(total_line)

	util.outp.print_table(per_app_id, headline="Elements and classes per app ID", printer=printer)

	# "per class" table
	per_class = []
	per_class.append([""] + all_classes)
	app_ids_line = ["App IDs"]
	percent_line = ["Percentage"]
	for a_class in all_classes:
		app_ids_line.append(len(app_ids_per_class[a_class]))
		percent_line.append(
			util.fmtr.format_percentage(entry_count_per_class[a_class] / float(total_entries), False, 2))

	per_class.append(app_ids_line)
	per_class.append(percent_line)

	util.outp.print_table(per_class, headline="Metrics per class", printer=printer)

	# "Duplicates per app ID" table
	duplicates = []
	duplicates.append(["App ID", "All", "Unique", "Duplicates", "Duplicate %"])
	total_number_of_duplicates = 0
	total_entries_assertion = 0
	for app_id in all_app_ids:
		result = duplicate_elements_per_app_id[app_id]
		unique_count = result["uniq"]
		duplicate_count = result["dupe"]
		all_count = unique_count + duplicate_count

		total_number_of_duplicates += duplicate_count
		total_entries_assertion += all_count

		duplicate_percent = 0
		if all_count > 0:
			duplicate_percent = float(duplicate_count) / all_count
		duplicate_percent_str = util.fmtr.format_percentage(duplicate_percent, True, 3)

		new_line = [app_id]
		new_line.extend(["{:,}".format(x) for x in [all_count, unique_count, duplicate_count]])
		new_line.append(duplicate_percent_str)
		duplicates.append(new_line)

	assert total_entries == total_entries_assertion

	# Don't output table if there are no duplicates
	if total_number_of_duplicates == 0:
		printer.prt("\nDuplicate analysis: No duplicates found!")
	else:
		empty_line = [""] * 5
		duplicates.append(empty_line)

		total_duplicate_percent = float(total_number_of_duplicates) / total_entries
		total_line = [total_line_name]
		total_line.extend([
			"{:,}".format(x) for x in
			[total_entries, total_entries - total_number_of_duplicates, total_number_of_duplicates]
		])
		total_line.append(util.fmtr.format_percentage(total_duplicate_percent, True, 3))
		duplicates.append(total_line)

		util.outp.print_table(duplicates, headline="Duplicates per app ID", printer=printer)

	printer.prt("\nScores for %s scorable app ids: Dispersion index = %s | Duplicate index = %s"
		% (len(scorable_app_ids), round(dispersion_index, 3), round(duplicate_index, 3))
	)
	printer.prt("Scorable app ids: %s" % scorable_app_ids)

	if to_file:
		with open(output_path, "w") as output_file:
			for line in printer.get_messages():
				output_file.write(line + "\n")

		output_printer.prt("Successfully saved analysis to \"{}\".".format(output_path))

	# TODO: Check label harmony - are all / some / none of the entries labelled?
	# TODO: For each app id, check whether the classes hold roughly the same number of entries.
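util.outp.print_table's implementation is not shown; the tables above are built as lists of equal-length rows with the header row first. A minimal sketch of a compatible column-aligned printer (assuming equal-length rows and a printer exposing prt() as in the examples):

def print_table(rows, headline=None, printer=None):
    """ Sketch: pad each column to its widest cell and print row by row. """
    cells = [[str(cell) for cell in row] for row in rows]
    widths = [max(len(row[col]) for row in cells) for col in range(len(cells[0]))]
    if headline is not None:
        printer.prt(headline)
    for row in cells:
        printer.prt(" | ".join(text.ljust(width) for text, width in zip(row, widths)))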
Example 10
    def store_experiment(self):
        """ Store the results saved in this class in our experiment directory. """

        self.end_time = time.time()
        self.storer_printer.prt("Storing experiment results...")

        Dir.ensure_folder_exists(self.experiment_dir_path)

        entry_file_path = os.path.join(self.experiment_dir_path,
                                       "used_entries")
        result_file_path = os.path.join(self.experiment_dir_path, "result")
        stdout_file_path = os.path.join(self.experiment_dir_path, "stdout")
        classifiers_file_path = os.path.join(self.experiment_dir_path,
                                             "classifiers")
        file_paths = [
            entry_file_path, result_file_path, stdout_file_path,
            classifiers_file_path
        ]
        other_result_files_paths = []
        for file_name, _ in self.other_result_files:
            oth_res_path_creation = os.path.join(self.experiment_dir_path,
                                                 file_name)
            oth_res_path_creation = Dir.uniquify(oth_res_path_creation)
            other_result_files_paths.append(oth_res_path_creation)

        if any(os.path.lexists(x)
               for x in file_paths + other_result_files_paths):
            raise IOError("One of the files exists: %s" %
                          (file_paths + other_result_files_paths))

        self.storer_printer.prt("Data verified. Storing utilised entries...")

        # Create new file with my entries
        saved_path = idse_dao.save_entries(entry_file_path, self.entries)

        self.storer_printer.prt("Done. Analysing file...")

        # Analyse that file
        log_file_analysis.analyse(saved_path,
                                  to_file=True,
                                  output_printer=util.prtr.Storer())

        self.storer_printer.prt("Done. Saving classifiers...")

        # Save trained classifiers
        classifier_lines = self.create_classifier_lines()
        Dir.write_lines(classifiers_file_path, classifier_lines)

        self.storer_printer.prt("Done. Saving result digest...")

        # Save the result
        result_lines = self.create_result_lines()
        Dir.write_lines(result_file_path, result_lines)

        for oth_res_path, (oth_res_name, oth_res_lines) in zip(
                other_result_files_paths, self.other_result_files):
            self.storer_printer.prt("Saving others: %s..." % oth_res_name)
            Dir.write_lines(oth_res_path, oth_res_lines)

        self.storer_printer.prt("Done!")
        self.storer_printer.prt("Experiment stored in: %s" %
                                self.experiment_dir_path)

        # Save the stdout (tee replacement)
        stdout_lines = self.storer_printer.get_messages()
        Dir.write_lines(stdout_file_path, stdout_lines)
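Dir.uniquify, used here and in Examples 3 and 5, is not shown. A sketch of one common approach (the numeric-suffix scheme is an assumption, not the confirmed implementation):

import os


def uniquify(path):
    """ Sketch: append _1, _2, ... until the path is unused. """
    if not os.path.lexists(path):
        return path
    counter = 1
    while os.path.lexists("%s_%s" % (path, counter)):
        counter += 1
    return "%s_%s" % (path, counter)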
Example 11
def detect_type(file_path):
    """ Detect the file type of the file. """

    first_line = next(Dir.yield_lines(file_path))
    return _detect_type(first_line)
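_detect_type itself is not shown. Given that IDSE files start with the HEADER written in Example 4, a plausible sketch (the exact comparison rule is an assumption):

def _detect_type(first_line):
    """ Sketch: an IDSE file starts with the IDSE header; a log file doesn't. """
    if first_line.strip() == HEADER.strip():
        return FileType.IDSE_FILE
    return FileType.LOG_FILE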
Example 12
def _yield_log_entries_from_file(file_path):
    for line in Dir.yield_lines(file_path):
        yield LogEntry.from_log_string(line)
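Unlike Example 2, this variant is lazy: entries are only parsed as they are consumed. A hedged usage sketch that bounds it with itertools.islice (path hypothetical):

import itertools

first_ten = list(itertools.islice(
    _yield_log_entries_from_file("data/acc.log"), 10))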
Example 13
def prexit(msg):
    """ Print the given message, then exit. """
    print(msg)
    exit()


### Main program ###

p = argparse.ArgumentParser()
p.add_argument("file_path", metavar="PATH/FILE", help="Log file")
args = p.parse_args()

orig_path = os.path.expanduser(args.file_path)

if not os.path.lexists(orig_path):
    prexit("File doesn't exist")

tmp_path = orig_path + "_bak"

if os.path.lexists(tmp_path):
    prexit("%s exists" % tmp_path)

os.rename(orig_path, tmp_path)

with open(orig_path, "w") as output_file:
    for line in Dir.yield_lines(tmp_path):
        processed_string = proc_log_string(line)
        output_file.write("%s\n" % processed_string)

print("Done.")
print("Wrote to: %s" % orig_path)
print("Old file: %s" % tmp_path)