def process_as_vector(ctx, trees, representatives):
    """
    Process the given tree files against the prototype representatives and
    output the per-configuration results as a vector.

    Trees and representatives may be passed explicitly; when both are empty
    and use_input is set, they are loaded from the input file instead.

    :param ctx: Click context
    :param trees: Paths of tree files to process
    :param representatives: Paths of prototype files
    """
    if len(trees) == 0 and len(representatives) == 0 and ctx.obj.get(
            "use_input", False):
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()
        with open(file_path, "r") as input_file:
            # expecting dictionary containing lists for trees and representatives
            data = json.load(input_file).get("data")
            trees = data.get("trees", [])
            representatives = data.get("representatives", [])
    results = _init_results()
    results["files"] = trees
    results["prototypes"] = representatives
    # tree paths are additionally sliced by the configured start/maximum window
    tree_paths = _get_input_files(trees,
                                  minimum=ctx.obj["start"],
                                  maxlen=ctx.obj["maximum"])
    prototype_paths = _get_input_files(representatives)
    # build prototypes
    prototypes = _initialise_prototypes(prototype_paths)

    def path_generator():
        # yield each tree path together with the number of prototypes it is
        # going to be compared against
        for path in tree_paths:
            yield (path, len(prototypes))

    results["results"] = _process_configurations(
        prototypes=prototypes,
        configurations=ctx.obj["configurations"],
        event_generator=path_generator)
    output_results(ctx=ctx,
                   results=results,
                   version=determine_version(os.path.dirname(assess.__file__)),
                   source="%s (%s)" % (__file__, "process_as_vector"),
                   file_type=ctx.obj.get("file_type", None))
def index_valid_trees(ctx, paths, pcount):
    """
    Walk the given paths and read all trees found within. Every tree that can
    be read successfully is collected into the result list, which can then be
    used for further processing.

    :param ctx: Click context
    :param paths: The paths to scan for valid tree data
    :param pcount: Number of worker processes to use
    """
    filenames = [
        filename for path in paths
        for filename in _relevant_files_for_context(ctx, path)
    ]
    if pcount > 1:
        results = list(
            do_multicore(count=pcount, target=_valid_tree, data=filenames))
    else:
        # keep only trees that could actually be read
        results = [
            tree for tree in (_valid_tree(filename) for filename in filenames)
            if tree is not None
        ]
    output_results(ctx=ctx,
                   results=results,
                   version=determine_version(
                       os.path.dirname(assess_workflows.__file__)),
                   source="%s (%s)" % (__file__, "index_valid_trees"))
def transform_matrix_to_adjacency_list(ctx, symmetric):
    """
    Convert the normalized distance matrix from the input file into an
    adjacency-list style mapping {file: {other_file: distance}}.

    :param ctx: Click context
    :param symmetric: If True, also store the mirrored entry for each pair
    """
    if ctx.obj.get("use_input", False):
        ctx.obj["json"] = True
        results = {}
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()
        with open(file_path, "r") as input_file:
            input_data = json.load(input_file).get("data", None)
            files = input_data["files"]
            for result_idx, result in enumerate(input_data["results"][0]):
                decorator = result["decorator"]["normalized_matrix"]
                for row_idx, row in enumerate(decorator):
                    # row[0] holds the actual matrix row values
                    for col_idx, col in enumerate(row[0]):
                        if col_idx == row_idx:
                            # skip self-distances on the diagonal
                            continue
                        results.setdefault(
                            files[result_idx][row_idx],
                            {})[files[result_idx][col_idx]] = col
                        if symmetric:
                            results.setdefault(
                                files[result_idx][col_idx],
                                {})[files[result_idx][row_idx]] = col
        output_results(ctx=ctx,
                       results=results,
                       version=determine_version(
                           os.path.dirname(assess_workflows.__file__)),
                       source="%s (%s)" %
                       (__file__, "transform_matrix_to_adjacency_list"))
def transform_matrix_to_sql(ctx):
    """
    Convert the normalized distance matrix from the input file into a single
    SQL INSERT statement for table object_distances.

    Only entries strictly below the diagonal are emitted, because everything
    on and above the diagonal is 0 in the input matrix and can be skipped.

    :param ctx: Click context
    """
    if ctx.obj.get("use_input", False):
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()
        with open(file_path, "r") as input_file:
            input_data = json.load(input_file).get("data", None)
            # contains list of lists; each list is a row within the matrix,
            # everything above and including the diagonal is 0
            matrix = input_data["results"][0]["decorator"]["normalized_matrix"]
            value_tuples = [
                "(%d,%d,%s)" % (row_index, column_index,
                                matrix[row_index][column_index])
                for row_index in range(len(matrix))
                for column_index in range(row_index)
            ]
            result = ("INSERT INTO object_distances (a, b, d) VALUES\n" +
                      ",\n".join(value_tuples))
        output_results(ctx=ctx,
                       results=result + ";",
                       version=determine_version(
                           os.path.dirname(assess_workflows.__file__)),
                       source="%s (%s)" % (__file__,
                                           "transform_matrix_to_sql"))
def analyse_diamonds(ctx, pcount):
    """
    Method returns output file that follows the following format:

    {
        node_count: {
            p_value: {
                "raw": [[diamond levels], ...],
                "identities": [identity_count, ...],
                "diamonds": [diamond_count, ...],
                "files": [file_path, ...]
            }
        }
    }

    :param ctx: Click context
    :param pcount: Number of worker processes to use
    :return:
    """
    results = MulticoreResult()
    ctx.obj["json"] = True
    if ctx.obj.get("use_input", False):
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()
        # signatures of the first configuration are applied to every tree
        signature_builders = ctx.obj.get("configurations",
                                         [{}])[0].get("signatures", [])
        with open(file_path, "r") as input_file:
            analysis_files = json.load(input_file).get("data", None)
            if pcount > 1:
                # flatten {node_count: [tree_path, ...]} into one task
                # description per tree for the worker pool
                data = [{
                    "node_count": node_count,
                    "filepath": tree_path[0],
                    "signature_builders": signature_builders
                } for node_count, tree_paths in analysis_files.items()
                        for tree_path in tree_paths]
                multicore_results = do_multicore(count=pcount,
                                                 target=_analyse_diamonds,
                                                 data=data)
                for result in multicore_results:
                    results += result
            else:
                for node_count, tree_paths in analysis_files.items():
                    for tree_path in tree_paths:
                        results += _analyse_diamonds({
                            "node_count": node_count,
                            "filepath": tree_path[0],
                            "signature_builders": signature_builders
                        })
    output_results(ctx=ctx,
                   results=results,
                   version=determine_version(
                       os.path.dirname(assess_workflows.__file__)),
                   source="%s (%s)" % (__file__, "analyse_diamonds"))
def perform_clustering(ctx, eta, epsilon):
    """
    Build tree indexes for all samples from the input file and cluster them
    with the given eta/epsilon parameters; clusters, noise and cluster scores
    are written to the results.

    :param ctx: Click context
    :param eta: Minimum number of core neighbours for clustering
    :param epsilon: Cluster distance used for clustering
    """
    results = {}
    if ctx.obj.get("use_input", False):
        # only the first configured signature/distance pair is used
        configuration = ctx.obj.get("configurations", None)[0]
        signature = configuration.get("signatures", [None])[0]
        distance = configuration.get("distances", [None])[0]
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()
        tree_builder = CSVTreeBuilder()
        clustering = Clustering(distance=distance,
                                cluster_distance=epsilon,
                                core_neighbours=eta)
        with open(file_path, "r") as input_file:
            input_data = json.load(input_file).get("data", None)
            for sample in input_data.get("samples", []):
                # sample[0] is the file path of the sampled tree
                tree = tree_builder.build(sample[0])
                # convert tree to index; support flags depend on which event
                # types the distance declares as supported
                tree_index = tree.to_index(
                    signature=signature,
                    start_support=distance.supported.get(
                        ProcessStartEvent, False),
                    exit_support=distance.supported.get(
                        ProcessExitEvent, False),
                    traffic_support=distance.supported.get(
                        TrafficEvent, False))
                clustering[sample[0]] = tree_index
        print("---> performed clustering with eta %s and epsilon %s" %
              (eta, epsilon))
        results.setdefault(
            "meta", {})["algorithm"] = clustering.clusterer.__class__.__name__
        results.setdefault("meta", {})["eta"] = eta
        results.setdefault("meta", {})["epsilon"] = epsilon
        for cluster in clustering:
            results.setdefault("clusters", []).append(
                [node.key for node in cluster])  # TODO: determine CR
        for noise in clustering.clusterer.noise:
            results.setdefault("noise", []).append(noise.key)
        # score the clustering; ValueError (e.g. too few clusters) yields None
        for score in [
                silhouette_score, calinski_harabasz_score, davies_bouldin_score
        ]:
            try:
                the_score = score(clustering.clusterer.clusters,
                                  clustering.clusterer.graph)
            except ValueError:
                the_score = None
            results.setdefault("scores", {})[score.__name__] = the_score
    output_results(ctx=ctx,
                   results=results,
                   version=determine_version(
                       os.path.dirname(assess_workflows.__file__)),
                   source="%s (%s)" % (__file__, "perform_clustering"))
def perform_precalculated_clustering(ctx, eta, epsilon):
    """
    Cluster a precalculated distance graph (loaded from a csv input file) for
    every combination of the given eta and epsilon values and record clusters,
    noise and durations per combination.

    :param ctx: Click context
    :param eta: Iterable of eta values to try
    :param epsilon: Iterable of epsilon values to try
    """
    results = {}
    if ctx.obj.get("use_input", False):
        configuration = ctx.obj.get("configurations", None)[0]
        distance = configuration.get("distances", [None])[0]
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path(
            file_type="csv")  # expecting csv file
        graph = _create_graph(ctx, file_path)
        # try every parameter combination on the same precalculated graph
        for single_eta in eta:
            for single_epsilon in epsilon:
                start = time.time()
                clustering = DenGraphIO(base_graph=graph,
                                        cluster_distance=single_epsilon,
                                        core_neighbours=single_eta)
                end = time.time()
                cluster_distance = ClusterDistance(distance=distance())
                clustering.graph.distance = cluster_distance
                print(
                    "---> performed clustering with eta %s and epsilon %s in %s"
                    % (single_eta, single_epsilon, end - start))
                # one result entry per parameter combination
                results.setdefault("results", []).append({})
                current_result = results["results"][-1]
                current_result.setdefault(
                    "meta", {})["algorithm"] = clustering.__class__.__name__
                current_result.setdefault("meta", {})["eta"] = single_eta
                current_result.setdefault("meta",
                                          {})["epsilon"] = single_epsilon
                current_result["duration"] = end - start
                for cluster_idx, cluster in enumerate(clustering):
                    current_result.setdefault("clusters", []).append(
                        [node.key for node in cluster])  # TODO: determine CR
                    print("[cluster %s] %s" % (cluster_idx, len(cluster)))
                print("[noise] %s" % len(clustering.noise))
                for noise in clustering.noise:
                    current_result.setdefault("noise", []).append(noise.key)
                # scoring is currently disabled (empty list); previous variants
                # kept for reference:
                # for score in [silhouette_score, calinski_harabasz_score, davies_bouldin_score]:
                # for score in [silhouette_score]:
                for score in []:
                    try:
                        the_score = score(clustering.clusters,
                                          clustering.graph)
                    except ValueError:
                        the_score = None
                    current_result.setdefault(
                        "scores", {})[score.__name__] = the_score
                    print("Got a %s of %s" % (score.__name__, the_score))
    output_results(
        ctx=ctx,
        results=results,
        version=determine_version(os.path.dirname(assess_workflows.__file__)),
        source="%s (%s)" % (__file__, "perform_precalculated_clustering"))
def index_data_by_number_of_payloads(ctx, paths):
    """
    Group all *-process.csv files found one directory level below each given
    path by their payload line count.

    :param ctx: Click context
    :param paths: Base paths to scan for process csv files
    """
    results = {}
    for base_path in paths:
        for csv_file in glob.glob("%s/*/*-process.csv" % base_path):
            payload_count = _payload_line_count(csv_file)
            results.setdefault(payload_count, []).append(csv_file)
    output_results(
        ctx=ctx,
        results=results,
        version=determine_version(os.path.dirname(assess_workflows.__file__)),
        source="%s (%s)" % (__file__, "index_data_by_number_of_payloads"))
def full_statistics(ctx, pcount):
    """
    Method prepares full statistics about a dataset. The output is as follows:

    {
        <filename>: {
            "node_count": <int>,                  # number of nodes in tree
            "complete_node_count": <int>,         # number of nodes in tree w attributes
            "nodes_with_attribute_count": <int>,  # number of nodes that contain attributes
            "alphabet_count": <int>,              # alphabet count
            "duration": <int>,                    # duration of tree
            "fanout": [<int>, ...],               # fanout of nodes
            "complete_fanout": [<int>, ...],      # fanout of nodes w attributes
            "depth": [<int>, ...],                # depth in tree for leaves
            "complete_depth": [<int>, ...],       # depth in tree for leaves w attributes
            "attribute_event_count": [<int>, ...] # events for attributes per node
        }
    }

    :param ctx: Click context
    :param pcount: Number of worker processes to use
    :return:
    """
    results = MulticoreResult()
    ctx.obj["json"] = True
    if ctx.obj.get("use_input", False):
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()
        with open(file_path, "r") as input_file:
            analysis_files = json.load(input_file).get("data", None)
            # input maps node_count -> path or list of paths; flatten into one
            # task description per file
            data = []
            for node_count, tree_paths in analysis_files.items():
                for tree_path in tree_paths:
                    if isinstance(tree_path, list):
                        for path in tree_path:
                            data.append({"filepath": path})
                    else:
                        data.append({"filepath": tree_path})
            if pcount > 1:
                multicore_result = do_multicore(count=pcount,
                                                target=_full_statistics,
                                                data=data)
                for result in multicore_result:
                    results += result
            else:
                for elem in data:
                    results += _full_statistics(elem)
    output_results(ctx=ctx,
                   results=results,
                   version=determine_version(
                       os.path.dirname(assess_workflows.__file__)),
                   source="%s (%s)" % (__file__, "full_statistics"))
def validate_representatives(ctx, eta, epsilon, threshold):
    """
    Method performs a clustering and from the resulting clusters calculates
    the cluster representatives to measure our current representation.
    For validation of the cluster representatives we evaluate the distance of
    each clustered object to the calculated cluster representative. Whenever
    the distance is bigger than the given epsilon for clustering, we have a
    bad representation.

    :param ctx: Click context
    :param eta: Minimum number of core neighbours for clustering
    :param epsilon: Cluster distance used for clustering
    :param threshold: Threshold handed to the ClusterDistance
    :return:
    """
    if ctx.obj.get("use_input", False):
        configuration = ctx.obj.get("configurations", None)[0]
        distance_cls = configuration.get("distances", [None])[0]
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path(
            file_type="csv")  # expecting csv file
        graph = _create_graph(ctx, file_path)
        clustering = DenGraphIO(base_graph=graph,
                                cluster_distance=epsilon,
                                core_neighbours=eta)
        cluster_distance = ClusterDistance(distance=distance_cls(),
                                           threshold=threshold)
        clustering.graph.distance = cluster_distance
        results = {
            "meta": {
                "eta": eta,
                "epsilon": epsilon,
                "threshold": threshold
            },
            "clusters": {}
        }
        # calculate CRs from clusters
        for cluster_index, cluster in enumerate(clustering):
            cluster_representative = cluster_distance.mean(list(cluster))
            for tree_object in cluster:
                # calculate distance to cluster representative
                distance = cluster_distance(cluster_representative,
                                            tree_object)
                results["clusters"].setdefault(cluster_index, {}).setdefault(
                    "tree", []).append(tree_object.key)
                results["clusters"].setdefault(cluster_index, {}).setdefault(
                    "distance", []).append(distance)
        output_results(ctx=ctx,
                       results=results,
                       version=determine_version(
                           os.path.dirname(assess_workflows.__file__)),
                       source="%s (%s)" %
                       (__file__, "validate_representatives"))
def aggregate_samples(ctx, skip_key):
    """
    Method aggregates nested dictionaries into a flat one. If it already is a
    flat dictionary, the data is kept and written as-is.

    :param ctx: Click context
    :param skip_key: If True, flat list values are collected under "samples"
                     instead of their original key
    :return:
    """
    results = {}
    if ctx.obj.get("use_input", False):
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()
        with open(file_path, "r") as input_file:
            input_data = json.load(input_file).get("data", None)
            for key, values in input_data.items():
                if isinstance(values, list):
                    # already flat: keep the data
                    results.setdefault(key if not skip_key else "samples",
                                       []).extend(values)
                else:
                    try:
                        if len(values[0]) > 1:
                            # flattening data
                            results.setdefault(key, []).append([
                                element for value in values[0]
                                for element in value
                            ])
                        else:
                            # data can be kept
                            results.setdefault(key, []).append(values[0])
                    except KeyError:
                        # values is a nested dict: walk it breadth-first and
                        # collect the leaf values
                        to_check = [values]
                        while to_check:
                            current_item = to_check.pop(0)
                            try:
                                while current_item:
                                    _, value = current_item.popitem()
                                    to_check.append(value)
                            except KeyError:
                                # NOTE(review): popitem inside the guarded loop
                                # should not raise KeyError; confirm whether
                                # this branch is reachable
                                results.setdefault(key,
                                                   []).append(current_item)
                            except AttributeError:
                                # current_item is not a dict -> treat as leaf
                                results.setdefault(key,
                                                   []).extend(current_item)
    output_results(ctx=ctx,
                   results=results,
                   version=determine_version(
                       os.path.dirname(assess_workflows.__file__)),
                   source="%s (%s)" % (__file__, "aggregate_samples"))
def batch_process_from_pkl(ctx, pcount, reverse):
    """
    Process distorted trees stored in a pickle file against their original
    trees for every configured configuration, optionally on several worker
    processes.

    :param ctx: Click context
    :param pcount: Number of worker processes to use
    :param reverse: Passed through to the per-row processing
    """
    results = _init_results()
    results["distance"] = []
    results["prototypes"] = []
    if ctx.obj.get("use_input", False):
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path(file_type="pkl")
        # bug fix: pickle files must be opened in binary mode; text mode ("r")
        # breaks pickle.load on Python 3
        with open(file_path, "rb") as input_file:
            data = []
            # HEADER
            # ######
            # results are split into a header and data files
            # see data_generation_cli.generate_perturbated_tree
            tree_metadata = pickle.load(input_file)
            # materialise the keys: a dict_keys view is not serialisable and
            # would go stale if the dict changed
            results["files"] = list(tree_metadata.keys())
            for key, pkl_path in tree_metadata.items():
                # tree is stored in "tree"
                # distorted trees in "perturbated_tree"
                data.append({
                    "data_pkl_path": pkl_path,
                    "data_pkl_key": key,
                    "configurations": ctx.obj["configurations"],
                    "reverse": reverse
                })
            if pcount > 1:
                result_list = (do_multicore(
                    count=pcount,
                    data=data,
                    target=_process_configurations_for_row))
                for result in result_list:
                    results["results"].append(result['results'])
                    results["distance"].append(result['precalculated_costs'])
                    results["prototypes"].append(result["prototypes"])
            else:
                for elem in data:
                    result = _process_configurations_for_row(elem)
                    results["results"].append(result["results"])
                    results["distance"].append(result["precalculated_costs"])
                    results["prototypes"].append(result["prototypes"])
    output_results(ctx=ctx,
                   results=results,
                   version=determine_version(os.path.dirname(assess.__file__)),
                   source="%s (%s)" % (__file__, "batch_process_from_pkl"))
def index_tree_statistics(ctx, paths, pcount):
    """
    Collect tree statistics for all relevant files below the given paths,
    optionally distributed over several worker processes.

    :param ctx: Click context
    :param paths: Paths to scan for relevant files
    :param pcount: Number of worker processes to use
    """
    results = MulticoreResult()
    filenames = [
        filename for path in paths
        for filename in _relevant_files_for_context(ctx, path)
    ]
    if pcount > 1:
        partial_results = do_multicore(count=pcount,
                                       target=_tree_statistics,
                                       data=filenames)
    else:
        partial_results = (_tree_statistics(filename)
                           for filename in filenames)
    for partial in partial_results:
        results += partial
    output_results(ctx=ctx,
                   results=results,
                   version=determine_version(
                       os.path.dirname(assess_workflows.__file__)),
                   source="%s (%s)" % (__file__, "index_tree_statistics"))
def index_process_names(ctx, paths, pcount):
    """
    Collect the set of process names occurring in all relevant files below
    the given paths, optionally distributed over several worker processes.

    :param ctx: Click context
    :param paths: Paths to scan for relevant files
    :param pcount: Number of worker processes to use
    """
    filenames = []
    result_set = set()
    for path in paths:
        filenames.extend(_relevant_files_for_context(ctx, path))
    if pcount > 1:
        result_list = do_multicore(count=pcount,
                                   target=_process_names,
                                   data=filenames)
        for result in result_list:
            # bug fix: set.union returns a new set and leaves result_set
            # unchanged; update() mutates the set in place
            result_set.update(result)
    else:
        for filename in filenames:
            result_set.update(_process_names(filename))
    output_results(ctx=ctx,
                   results={"process_names": [name for name in result_set]},
                   version=determine_version(
                       os.path.dirname(assess_workflows.__file__)),
                   source="%s (%s)" % (__file__, "index_process_names"))
def transform_mapping_to_sql(ctx, paths):
    """
    Read csv mapping files and emit one SQL update statement per row that
    assigns the payload_id (from column 1) to the payload_result row with the
    matching id (from column 0).

    :param ctx: Click context
    :param paths: csv files to transform
    """
    if ctx.obj.get("use_input", False) or len(paths) > 0:
        update_cmd = "update payload_result set payload_id='%s' where id=%s;\n"
        payload_pattern = re.compile(r"([\d-]+)")
        id_pattern = re.compile(r"(\d+)")
        statements = []
        for path in paths:
            print("starting with %s" % path)
            with open(path, "r") as input_file:
                reader = csv.reader(input_file, quotechar="'")
                for row in reader:
                    statements.append(update_cmd %
                                      (payload_pattern.match(row[1]).group(),
                                       id_pattern.match(row[0]).group()))
        result = "".join(statements)
        output_results(ctx=ctx,
                       results=result + ";",
                       version=determine_version(
                           os.path.dirname(assess_workflows.__file__)),
                       source="%s (%s)" % (__file__,
                                           "transform_mapping_to_sql"))
def subset_data(ctx, include_key):
    """
    Filter the (up to three levels nested) input dictionary, keeping only the
    entries for which the given predicate returns True.

    Example Usage of this method: include_key="'lambda key, value: int(key) > 25'"

    :param ctx: Click context
    :param include_key: Source of a predicate `lambda key, value: bool`
    :return:
    """
    results = {}
    if ctx.obj.get("use_input", False):
        # SECURITY NOTE(review): include_key is eval'd verbatim -- only pass
        # trusted expressions here
        include_key = eval(include_key)
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()
        with open(file_path, "r") as input_file:
            input_data = json.load(input_file).get("data", None)
            for key, value in input_data.items():
                if isinstance(value, dict):
                    for inner_key, inner_value in value.items():
                        if isinstance(inner_value, dict):
                            # third nesting level: filter innermost entries
                            for inner_inner_key, inner_inner_value in \
                                    inner_value.items():
                                if include_key(inner_inner_key,
                                               inner_inner_value):
                                    results.setdefault(key, {}).setdefault(
                                        inner_key, {}
                                    )[inner_inner_key] = inner_inner_value
                        else:
                            if include_key(inner_key, inner_value):
                                results.setdefault(
                                    key, {})[inner_key] = inner_value
                else:
                    if include_key(key, value):
                        results[key] = value
    output_results(ctx=ctx,
                   results=results,
                   version=determine_version(
                       os.path.dirname(assess_workflows.__file__)),
                   source="%s (%s)" % (__file__, "subset_data"))
def pick_samples(ctx, seed, repeat, count, minimum, skip_key):
    """
    Pick random samples from the input data, `repeat` times `count` elements
    per key. When sampling fails because there are fewer elements than
    `count`, the whole value list is kept if it has at least `minimum`
    elements.

    :param ctx: Click context
    :param seed: Optional seed for the random generator
    :param repeat: Number of sampling rounds per key
    :param count: Number of elements per sampling round
    :param minimum: Minimum list length to keep an unsampled list
    :param skip_key: If True, values of all keys are pooled before sampling
    """
    results = {}
    if seed is not None:
        random.seed(seed)
    if ctx.obj.get("use_input", False):
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()
        with open(file_path, "r") as input_file:
            input_data = json.load(input_file).get("data", None)
            working_data = input_data
            try:
                if skip_key:
                    # pool all values into one collection
                    # NOTE(review): this builds a *set*, so the following
                    # .items() raises AttributeError and sampling happens in
                    # the except branch below; random.sample on a set is
                    # deprecated/removed in newer Python -- confirm intent
                    working_data = {
                        value
                        for values in input_data.values() for value in values
                    }
                for key, values in working_data.items():
                    try:
                        for _ in range(repeat):
                            results.setdefault(key, []).append(
                                random.sample(values, count))
                    except ValueError:
                        # fewer elements than requested: keep everything if it
                        # satisfies the minimum length
                        if minimum == 0 or minimum <= len(values):
                            results.setdefault(key, []).append(values)
                        else:
                            continue
            except AttributeError:
                # working_data has no .items(): sample from the pooled data
                key = "samples"
                for _ in range(repeat):
                    results.setdefault(key, []).append(
                        random.sample(working_data, count))
    output_results(ctx=ctx,
                   results=results,
                   version=determine_version(
                       os.path.dirname(assess_workflows.__file__)),
                   source="%s (%s)" % (__file__, "pick_samples"))
def index_data_by_number_of_events(ctx, paths):
    """
    Group all relevant files below the given paths by their total number of
    events: two events per process line (start and finish) plus one event per
    line of the accompanying traffic file, if one exists.

    :param ctx: Click context
    :param paths: Paths to scan for relevant files
    """
    results = {}
    relevant_files = [
        filename for path in paths
        for filename in _relevant_files_for_context(ctx, path)
    ]
    for filename in relevant_files:
        db_id = os.path.basename(filename).split("-")[0]
        # access process file: each line marks start and finishing of a process
        process_events = _line_count(filename=filename) * 2
        # access traffic file (if existent)
        traffic_file = os.path.join(os.path.dirname(filename),
                                    "%s-traffic.csv" % db_id)
        traffic_events = _line_count(filename=traffic_file)
        results.setdefault(process_events + traffic_events,
                           []).append(filename)
    output_results(
        ctx=ctx,
        results=results,
        version=determine_version(os.path.dirname(assess_workflows.__file__)),
        source="%s (%s)" % (__file__, "index_data_by_number_of_events"))
def analyse_diamond_perturbations(ctx, pcount):
    """
    Analyse diamond perturbations for every tree referenced by the input
    file, optionally distributed over several worker processes.

    :param ctx: Click context
    :param pcount: Number of worker processes to use
    """
    results = MulticoreResult()
    ctx.obj["json"] = True
    if ctx.obj.get("use_input", False):
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()
        # signatures of the first configuration are applied to every tree
        signature_builders = ctx.obj.get("configurations",
                                         [{}])[0].get("signatures", [])
        with open(file_path, "r") as input_file:
            analysis_files = json.load(input_file).get("data", None)
            if pcount > 1:
                # combine data: one task description per tree path
                data = [{
                    "filepath": path[0],
                    "signature_builders": signature_builders
                } for paths in analysis_files.values() for path in paths]
                multicore_results = do_multicore(
                    count=pcount,
                    target=_analyse_diamond_perturbation,
                    data=data)
                for result in multicore_results:
                    results += result
            else:
                for tree_paths in analysis_files.values():
                    for tree_path in tree_paths:
                        results += _analyse_diamond_perturbation({
                            "filepath": tree_path[0],
                            "signature_builders": signature_builders
                        })
    # NOTE(review): source label is "analyse_diamond_perturbation" (singular)
    # while the function is analyse_diamond_perturbations -- confirm before
    # renaming, downstream tooling may rely on the current label
    output_results(
        ctx=ctx,
        results=results,
        version=determine_version(os.path.dirname(assess_workflows.__file__)),
        source="%s (%s)" % (__file__, "analyse_diamond_perturbation"))
def analyse_duration(ctx, pcount):
    """
    Method prepares duration data for further analysis.

    :param ctx: Click context
    :param pcount: Number of worker processes to use
    :return:
    """
    results = MulticoreResult()
    ctx.obj["json"] = True
    if ctx.obj.get("use_input", False):
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()
        with open(file_path, "r") as input_file:
            analysis_files = json.load(input_file).get("data", None)
            # input maps node_count -> path or list of paths; flatten into one
            # task description per file
            data = []
            for node_count, tree_paths in analysis_files.items():
                for tree_path in tree_paths:
                    if isinstance(tree_path, list):
                        for path in tree_path:
                            data.append({"filepath": path})
                    else:
                        data.append({"filepath": tree_path})
            if pcount > 1:
                multicore_result = do_multicore(count=pcount,
                                                target=_analyse_duration,
                                                data=data)
                for result in multicore_result:
                    results += result
            else:
                for elem in data:
                    results += _analyse_duration(elem)
    output_results(ctx=ctx,
                   results=results,
                   version=determine_version(
                       os.path.dirname(assess_workflows.__file__)),
                   source="%s (%s)" % (__file__, "analyse_duration"))
def index_valid_hdf_trees(ctx, trees, representatives, pcount):
    """
    Validate hdf trees and representatives, write each valid tree to a pkl
    file and index the written paths by category ("trees" or
    "representatives").

    :param ctx: Click context
    :param trees: Tree files to validate
    :param representatives: Representative files to validate
    :param pcount: Number of worker processes to use
    """
    structure = ctx.obj.get("structure", None)
    results = {}
    # tag each input file with its category so results can be grouped later
    paths = [(key, value) for key, values in {
        "trees": trees,
        "representatives": representatives
    }.items() for value in values if values]
    if pcount > 1:
        # NOTE(review): here the multicore result is unpacked directly as
        # (category, tree, name) triples, while the sequential branch unpacks
        # one _valid_hdf_tree() result per filename -- confirm do_multicore
        # flattens the per-file results accordingly
        trees = do_multicore(count=pcount, target=_valid_hdf_tree, data=paths)
        for category, tree, name in trees:
            results.setdefault(category, []).append(
                _write_tree_to_pkl(structure, tree, name))
    else:
        for filename in paths:
            trees = _valid_hdf_tree(filename)
            for category, tree, name in trees:
                results.setdefault(category, []).append(
                    _write_tree_to_pkl(structure, tree, name))
    output_results(ctx=ctx,
                   results=results,
                   version=determine_version(
                       os.path.dirname(assess_workflows.__file__)),
                   source="%s (%s)" % (__file__, "index_valid_hdf_trees"))
def index_data_by_activity(ctx):
    """
    Query the gnm database for payload results of selected activities and
    final states, and index the corresponding process csv file paths by
    activity, status and task monitor id.

    :param ctx: Click context
    """
    results = {}
    with SQLCommand(providerName="PostgresDBProvider",
                    connectionString="dbname=gnm user=gnm") as sql_command:
        fields = [
            "payload_id", "activity", "task_monitor_id", "status_name",
            "workernode.name", "job.run"
        ]
        # NOTE(review): there is no space between `and` and `(activity=...`
        # in the assembled SQL; Postgres parses `and(` fine, but confirm the
        # query does what is intended
        sql_results = sql_command.execute(
            "select %s from payload_result "
            "inner join workernode on payload_result.workernode_id=workernode.id "
            "inner join payload on payload_result.payload_id=payload.id "
            "inner join job on job.id=payload.job_id "
            "where payload_id!=%%s and"
            "(activity=%%s or activity=%%s or activity=%%s or activity=%%s) and "
            "(status_name=%%s or status_name=%%s or status_name=%%s or status_name=%%s)"
            % ",".join(fields), [
                "", "reprocessing", "production", "analysis",
                "analysis-crab3", "SUCCEEDED", "FAILED", "DONE", "ABORTED"
            ])
        for sql_result in sql_results:
            result = dict(zip(fields, sql_result))
            current_result = results.setdefault(result["activity"], {})
            # build <base>/<workernode>/<run>/<payload_id>-process.csv
            current_result.setdefault(result["status_name"], {}).setdefault(
                result["task_monitor_id"], []).append(
                    os.path.join(
                        "/home/fq8360/data/gnm/payloads",
                        os.path.join(
                            os.path.join(result["workernode.name"],
                                         result["job.run"]),
                            "%s-process.csv" % result["payload_id"])))
    if ctx.obj.get("save", False):
        output_results(ctx=ctx,
                       results=results,
                       version=determine_version(
                           os.path.dirname(assess_workflows.__file__)),
                       source="%s (%s)" % (__file__, "index_data_by_activity"))
def transform_matrix_to_csv(ctx, key, has_ensemble):
    """
    Convert the decorator matrix selected by `key` into csv text: one header
    line with the file names followed by one line per matrix row, where
    entries up to and including the diagonal are filled with 0.

    has_ensemble means that for different identities values are encoded.

    :param ctx: Click context
    :param key: Decorator key selecting the matrix to transform
    :param has_ensemble: If True, take values from the first ensemble entry
    :return:
    """
    if ctx.obj.get("use_input", False):
        results = ""
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()
        with open(file_path, "r") as input_file:
            input_data = json.load(input_file).get("data", None)
            files = input_data["files"]
            for result_idx, result in enumerate(input_data["results"][0]):
                decorator = result["decorator"][key]
                maximum_index = len(decorator)
                # header line with the file names of this result
                results += ",".join(files[result_idx])
                results += "\n"
                for row_index in range(0, maximum_index):
                    # pad with zeros up to and including the diagonal
                    row = [0 for _ in range(row_index + 1)]
                    for col_index in range(row_index + 1, maximum_index):
                        if has_ensemble:
                            row.append(decorator[col_index][0][row_index])
                        else:
                            row.append(decorator[col_index][row_index])
                    results += "%s\n" % ",".join([str(item) for item in row])
        output_results(ctx=ctx,
                       results=results,
                       file_type="csv",
                       version=determine_version(
                           os.path.dirname(assess_workflows.__file__)),
                       source="%s (%s)" %
                       (__file__, "transform_matrix_to_csv"))
def index_data_by_number_of_nodes(ctx, paths):
    """
    Group all relevant files below the given paths by their number of nodes
    (line count). When use_input is set, additional paths are read from the
    input file.

    :param ctx: Click context
    :param paths: Paths to scan for relevant files
    """
    results = {}
    if ctx.obj.get("use_input", False):
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()
        with open(file_path, "r") as input_file:
            paths = list(paths)
            paths.extend(json.load(input_file)["data"])
    filenames = []
    for path in paths:
        filenames.extend(_relevant_files_for_context(ctx, path))
    # bug fix: the original branched on the stale loop variable `path` left
    # over from the loop above (not the current `filename`), so the grouping
    # depended on whichever path happened to be iterated last and could append
    # the same path once per filename; count each relevant file directly
    for filename in filenames:
        count = _line_count(filename)
        results.setdefault(count, []).append(filename)
    output_results(
        ctx=ctx,
        results=results,
        version=determine_version(os.path.dirname(assess_workflows.__file__)),
        source="%s (%s)" % (__file__, "index_data_by_number_of_nodes"))
def analyse_metric(ctx):
    """
    Method analyses the mean relative deviation for given distance functions
    to determine if those might be metrics, pseudo-metrics, etc.

    Diagonal entries are checked against 0 (identity of indiscernibles) and
    off-diagonal pairs are checked for symmetry; the relative deviations are
    exported as LaTeX \\def macros.

    :param ctx: Click context
    """
    results = ""
    latex_results = ""
    if ctx.obj.get("use_input", False):
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()
        data = {}
        with open(file_path, "r") as input_file:
            analysis_data = json.load(input_file).get("data", None)
            for result_idx, result in enumerate(
                    analysis_data.get("results", [])):
                for result_entry in result:
                    algorithm = result_entry.get("algorithm", None)
                    decorator_data = result_entry.get("decorator",
                                                      {}).get("matrix", [])
                    tree_sizes = result_entry.get("decorator", {}).get(
                        "data", [])["prototypes"]["original"][0]
                    diagonal_issues = other_issues = 0
                    diagonal_values = []
                    other_values = []
                    # only visit the upper triangle (incl. diagonal); the
                    # mirrored value is fetched explicitly for comparison
                    for row_idx, row_data in enumerate(decorator_data):
                        for col_idx in range(row_idx, len(decorator_data)):
                            if row_idx == col_idx:
                                value = row_data[0][col_idx]
                                # we got the diagonal, so check
                                if value is None:
                                    continue
                                diagonal_values.append((
                                    value,
                                    tree_sizes[row_idx] * 2,
                                    value,
                                    tree_sizes[col_idx] * 2,
                                ))
                                if value != 0:
                                    # non-zero self distance
                                    diagonal_issues += 1
                            else:
                                # we got each other value, so check symmetry
                                left_value = row_data[0][col_idx]
                                right_value = decorator_data[col_idx][0][
                                    row_idx]
                                if left_value is None or right_value is None:
                                    continue
                                other_values.append((
                                    left_value,
                                    tree_sizes[row_idx] * 2,
                                    right_value,
                                    tree_sizes[col_idx] * 2,
                                ))
                                if left_value != right_value:
                                    other_issues += 1
                    # accumulate values per algorithm over all results
                    current_algorithm = data.setdefault(algorithm, {})
                    try:
                        current_algorithm["diagonal"].extend(diagonal_values)
                        current_algorithm["other"].extend(other_values)
                    except KeyError:
                        current_algorithm["diagonal"] = diagonal_values
                        current_algorithm["other"] = other_values
        for key, values in data.items():
            # macro names are derived from the algorithm description string
            statistics_algorithm = re.search("cache_statistics=(\w+)",
                                             key).group(1)
            # map weight 0.0-0.9 to letters a-j for the macro suffix
            statistics_variant = int(
                float(re.search("weight=(\d\.\d)", key).group(1)) * 10)
            diagonal_mean, diagonal_error = \
                uncorrelated_relative_deviation_and_standard_error(
                    values["diagonal"], 0)
            other_mean, other_error = \
                uncorrelated_relative_deviation_and_standard_error(
                    values["other"])
            # save results
            latex_results += "\\def\%srelativediagonalmean%s{%s}\n" % (
                statistics_algorithm, chr(statistics_variant + 97),
                diagonal_mean)
            latex_results += "\\def\%srelativediagonalssd%s{%s}\n" % (
                statistics_algorithm, chr(statistics_variant + 97),
                diagonal_error)
            latex_results += "\\def\%srelativemean%s{%s}\n" % (
                statistics_algorithm, chr(statistics_variant + 97),
                other_mean)
            latex_results += "\\def\%srelativessd%s{%s}\n" % (
                statistics_algorithm, chr(statistics_variant + 97),
                other_error)
    output_results(ctx=ctx,
                   results=latex_results,
                   version=determine_version(
                       os.path.dirname(assess_workflows.__file__)),
                   source="%s (%s)" % (__file__, "analyse_metric"),
                   file_type="tex",
                   comment_sign="%")
def analyse_compression(ctx, pcount):
    """
    Method prepares data for further compression analysis. Thus, it collects
    information on

    * number of nodes in original tree
    * height of tree as an optional information
    * size of the alphabet (optimised by excluding id numbers in names)
    * number of unique identities generated
    * statistics on the trees fanout

    The following output format can be expected

    <number of nodes>: {
        "file": [<string>, ...],
        "alphabet_count": [<int>, ...],
        "tree_height": [<int>, ...],
        "identity_count": {
            <Signature>: [<int>, ...]
        },
        "fanout": {
            "min": [<int>, ...],
            "max": [<int>, ...],
            "mean": [<float>, ...],
            "std": [<float>, ...],
            "full": [[<int>, ...], ...]
        }
    }

    :param ctx: Click context
    :param pcount: Number of worker processes to use
    :return:
    """
    results = MulticoreResult()
    ctx.obj["json"] = True
    if ctx.obj.get("use_input", False):
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()
        # signatures of the first configuration are applied to every tree
        signature_builders = ctx.obj.get("configurations",
                                         [{}])[0].get("signatures", [])
        with open(file_path, "r") as input_file:
            analysis_files = json.load(input_file).get("data", None)
            # one task description per file, keeping the node_count bucket
            data = []
            for node_count, tree_paths in analysis_files.items():
                for tree_path in tree_paths:
                    for path in tree_path:
                        data.append({
                            "node_count": node_count,
                            "filepath": path,
                            "signature_builders": signature_builders
                        })
            if pcount > 1:
                multicore_results = do_multicore(count=pcount,
                                                 target=_analyse_compression,
                                                 data=data)
                for result in multicore_results:
                    results += result
            else:
                for elem in data:
                    results += _analyse_compression(elem)
    output_results(ctx=ctx,
                   results=results,
                   version=determine_version(
                       os.path.dirname(assess_workflows.__file__)),
                   source="%s (%s)" % (__file__, "analyse_compression"))
def squish_index_into_ranges(ctx, range_width, maximum_chain):
    """
    Merge neighbouring index keys whose relative difference lies within
    range_width into a single bucket keyed by their mean. Chains of merged
    keys longer than maximum_chain are split into smaller chains first.

    :param ctx: Click context
    :param range_width: Relative width within which adjacent keys are merged
    :param maximum_chain: Maximum number of keys merged into one bucket
    """
    results = {}
    if ctx.obj.get("use_input", False):

        def probability_function(one, two):
            # <= 1 means the two keys overlap within range_width
            return abs(one - two) / (max(one, two) * range_width)

        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()
        with open(file_path, "r") as input_file:
            input_data = json.load(input_file)["data"]
            # number of nodes, events, tmes
            keys = [int(key) for key in input_data.keys()]
            keys.sort()
            # indizes of numbers overlapping within probability_function
            probabilities = [
                index for index, probability in enumerate(
                    probability_function(one, two)
                    for one, two in zip(keys, keys[1:])) if probability <= 1
            ]
            # pick ranges from sequences of overlapping numbers
            while probabilities:
                index = probabilities[0]
                # check all indexes that belong to a chain
                key_chain = [keys[index]]
                merged = {keys[index]: input_data.pop(str(keys[index]))}
                while index in probabilities:
                    probabilities.pop(0)
                    index += 1
                    try:
                        merged[keys[index]] = input_data.pop(str(keys[index]))
                        key_chain.append(keys[index])
                    except KeyError:
                        # TODO: is this still needed?
                        continue
                # split chain if it is too long
                key_collection = [key_chain]
                while key_collection:
                    current_key = key_collection.pop()
                    if len(current_key) > maximum_chain:
                        if len(current_key) % maximum_chain == 0:
                            split_point = maximum_chain
                        else:
                            # distribute keys evenly over the minimum number
                            # of chains not exceeding maximum_chain
                            split_point = int(
                                len(current_key) /
                                (len(current_key) // maximum_chain + 1))
                        while current_key:
                            key_collection.append(current_key[:split_point])
                            current_key = current_key[split_point:]
                    else:
                        # merge this chain into one bucket keyed by the mean
                        mean = sum(current_key) / len(current_key)
                        merged_files = [merged.pop(key) for key in current_key]
                        input_data[str(mean)] = [
                            value for values in merged_files
                            for value in values
                        ]
            results = input_data
    output_results(ctx=ctx,
                   results=results,
                   version=determine_version(
                       os.path.dirname(assess_workflows.__file__)),
                   source="%s (%s)" % (__file__, "squish_index_into_ranges"))
def batch_process_as_vector(ctx, pcount):
    """
    Batch-process trees against prototype representatives as a vector.

    Input data is either a {"trees": [...], "representatives": [...]} object
    (each work item pairs one tree with all prototypes, or all trees with one
    prototype, whichever side is larger) or a mapping key -> list of value
    lists, where each value list is [file, prototype...] (a single-element
    list serves as both file and prototype).

    :param ctx: Click context carrying structure and configuration
    :param pcount: number of processes; > 1 enables multicore processing and
        merges the per-process decorator data into a single result
    """
    results = []
    if ctx.obj.get("use_input", False):
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()
        with open(file_path, "r") as input_file:
            input_data = json.load(input_file).get("data")
        data = []
        # initialise so the multicore branch below cannot hit a NameError
        # when the dict-based input format (no "trees" key) is used
        trees = []
        prototypes = []
        if "trees" in input_data and "representatives" in input_data:
            trees = input_data.get("trees", [])
            prototypes = input_data.get("representatives", [])
            if len(trees) >= len(prototypes):
                # one work item per tree, each against all prototypes
                for tree in trees:
                    data.append({
                        "configurations": ctx.obj["configurations"],
                        "files": [tree],
                        "prototypes": prototypes
                    })
            else:
                # one work item per prototype, each against all trees
                for prototype in prototypes:
                    data.append({
                        "configurations": ctx.obj["configurations"],
                        "files": trees,
                        "prototypes": [prototype]
                    })
        else:
            for key, values in input_data.items():
                for value in values:
                    if len(value) == 1:
                        # element is file and prototype at the same time
                        value.append(value[0])
                    data.append({
                        "configurations": ctx.obj["configurations"],
                        "files": value[:1],
                        "prototypes": value[1:],
                        "key": key
                    })
        if pcount > 1:
            final_decorators = []
            row_idx = col_idx = -1
            result_list = do_multicore(pcount, _batch_process_as_vector, data)
            for result_entry in result_list:
                # each work item fills either one row (tree-major) or one
                # column (prototype-major) of the result matrix
                if len(trees) >= len(prototypes):
                    row_idx += 1
                    if col_idx < 0:
                        col_idx = 0
                else:
                    col_idx += 1
                    if row_idx < 0:
                        row_idx = 0
                current_results = result_entry.get("results", [])
                for decorator_key in current_results[0].get("decorator", {}):
                    for index, current_result in enumerate(current_results):
                        try:
                            # if decorator already exists, we only need to add
                            # current data; KeyError is included to match
                            # process_as_matrix: the per-index dict may exist
                            # without this decorator_key yet
                            decorator = final_decorators[index][decorator_key]
                            current_decorator = type(decorator)()
                            current_decorator._data = current_result.get(
                                "decorator", {})[decorator_key]
                            current_decorator.row_idx = [row_idx]
                            current_decorator.col_idx = [col_idx]
                            decorator += current_decorator
                        except (IndexError, KeyError):
                            # if decorator does not exist, we load it and will
                            # later add data
                            decorator = Decorator.from_name(decorator_key)
                            decorator._data = current_result.get(
                                "decorator", {})[decorator_key]
                            decorator.row_idx = [row_idx]
                            decorator.col_idx = [col_idx]
                            try:
                                final_decorators[index].setdefault(
                                    decorator_key, decorator)
                            except IndexError:
                                final_decorators.append(
                                    {decorator_key: decorator})
            # reuse the first entry as the merged result skeleton
            finals = result_list[0]
            finals["files"] = trees
            finals["prototypes"] = prototypes
            for index, final in enumerate(finals.get("results", [])):
                for value in final_decorators[index].values():
                    described = value.descriptive_data()
                    final.get("decorator", {})[list(
                        described.keys())[0]] = list(described.values())[0]
            results.append(finals)
        else:
            for elem in data:
                results.append(_batch_process_as_vector(elem))
    output_results(ctx=ctx,
                   results=results,
                   version=determine_version(os.path.dirname(assess.__file__)),
                   source="%s (%s)" % (__file__, "batch_process_as_vector"),
                   file_type=ctx.obj.get("file_type", None))
def perform_classification(ctx, eta, epsilon):
    """
    Method performs a classification. Before the actual classification can
    be tested, a clustering is applied. Those clusters are following used
    for classification.

    :param ctx: Click context carrying structure and configuration
    :param eta: core_neighbours parameter for DenGraphIO clustering
    :param epsilon: cluster_distance parameter for DenGraphIO clustering
    :return:
    """
    if ctx.obj.get("use_input", False):
        configuration = ctx.obj.get("configurations", None)[0]
        distance_cls = configuration.get("distances", [None])[0]
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path(
            file_type="csv")  # expecting csv file
        graph = _create_graph(ctx, file_path)
        clustering = DenGraphIO(base_graph=graph,
                                cluster_distance=epsilon,
                                core_neighbours=eta)
        cluster_distance = ClusterDistance(distance=distance_cls(),
                                           threshold=0)
        clustering.graph.distance = cluster_distance
        # calculate CRs (cluster representatives) from clusters: every core
        # node of every cluster becomes a prototype
        prototype_caches = []
        cluster_names = []
        for cluster in clustering:
            for core in cluster.core_nodes:
                cluster_names.append(core.key)
                prototype_caches.append(core)
        # abusing the mean to condense the collected cores into prototype
        # caches keyed by cluster name
        prototype_caches = cluster_distance.mean(prototype_caches,
                                                 prototype=cluster_names)
        results = {"files": [], "prototypes": cluster_names[:], "results": []}
        decorator_def = configuration.get("decorator", None)
        # one classification run per (algorithm, signature, streamer) combo
        for algorithm_def in configuration.get("algorithms", []):
            for signature_def in configuration.get(
                    "event_streamer", [GNMCSVEventStreamer]) if False else configuration.get("signatures", []):
                for event_streamer in configuration.get(
                        "event_streamer", [GNMCSVEventStreamer]):
                    signature = signature_def()
                    algorithm = algorithm_def(signature=signature)
                    algorithm.cluster_representatives(
                        signature_prototypes=prototype_caches,
                        prototypes=cluster_names)
                    decorator = decorator_def()
                    decorator.wrap_algorithm(algorithm=algorithm)
                    # starting a new tree not to mix former results with
                    # current; every graph node is streamed as its own tree
                    for node in clustering.graph:
                        tree = event_streamer(csv_path=node.key)
                        results["files"].append(node.key)
                        algorithm.start_tree()
                        for event in tree.event_iter():
                            try:
                                algorithm.add_event(event)
                            except EventNotSupportedException:
                                pass
                        algorithm.finish_tree()
                    # NOTE(review): `tree` here is the last streamed tree from
                    # the loop above — presumably intended as a representative
                    # label; confirm. If the graph is empty, `tree` is unbound.
                    if decorator:
                        results["results"].append({
                            "algorithm": "%s" % algorithm,
                            "signature": "%s" % signature,
                            "event_streamer": "%s" % tree
                            if tree is not None else event_streamer(
                                csv_path=None),
                            "decorator": decorator.descriptive_data()
                        })
        output_results(ctx=ctx,
                       results=results,
                       version=determine_version(
                           os.path.dirname(assess_workflows.__file__)),
                       source="%s (%s)" % (__file__, "perform_classification"))
def process_as_matrix(ctx, trees, skip_upper, skip_diagonal, pcount):
    """
    Compare every tree with every other tree, producing a distance matrix.

    Trees are taken from the arguments or, if empty, from the current input
    file. Input may be a flat list of paths or a list of lists (nested);
    each inner list is processed as its own matrix.

    :param ctx: Click context carrying structure and configuration
    :param trees: tree paths to process; if empty, input file is used
    :param skip_upper: skip the upper triangle of the matrix
    :param skip_diagonal: additionally skip the diagonal (with skip_upper)
    :param pcount: number of processes; > 1 splits the matrix into blocks
        that are processed in parallel and merged afterwards
    """
    if len(trees) == 0 and ctx.obj.get("use_input", False):
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()
        with open(file_path, "r") as input_file:
            # can be list of lists or flat list
            data = json.load(input_file).get("data")
            try:
                # NOTE(review): on Python 3 dict views are not subscriptable,
                # so for plain dict input this raises TypeError and falls
                # through to the "trees" key below — confirm intended.
                trees = data.values()[0]
            except AttributeError:
                trees = data
            except TypeError:
                # object is dictionary with trees
                trees = data.get("trees", [])
    results = _init_results()
    results["files"] = results["prototypes"] = trees[:]
    # if we have a flat list, check, otherwise, just take it
    if type(trees[0]) == list:
        tree_paths = trees
        nested = True
    else:
        tree_paths = _get_input_files(trees,
                                      minimum=ctx.obj["start"],
                                      maxlen=ctx.obj["maximum"])
        nested = False
    if pcount > 1:
        to_process = []
        if nested:
            to_process = tree_paths
        else:
            to_process.append(tree_paths)
        while to_process:
            data = []
            single_tree_paths = to_process.pop(0)
            # prepare blocks of data: split the matrix into roughly square
            # sub-blocks so each worker gets a comparable amount of work
            factor = multicore_factor(len(single_tree_paths))
            block_size = len(single_tree_paths) / float(factor)
            assert block_size > 1, \
                "Blocksize is too small for proper parallelisation: %s" % \
                block_size
            index_value = int(math.ceil(len(single_tree_paths) / block_size))
            for row_idx in range(index_value):
                for col_idx in range(index_value):
                    if skip_upper and col_idx > row_idx:
                        continue
                    row_trees = single_tree_paths[
                        int(row_idx * block_size):min(int(
                            (row_idx + 1) * block_size),
                            len(single_tree_paths))]
                    col_trees = single_tree_paths[
                        int(col_idx * block_size):min(int(
                            (col_idx + 1) * block_size),
                            len(single_tree_paths))]
                    data.append({
                        "tree_paths": row_trees,
                        "prototype_paths": col_trees,
                        "configurations": ctx.obj["configurations"]
                    })
            result_list = do_multicore(count=pcount,
                                       target=_process_as_matrix,
                                       data=data)
            final_decorators = []
            row_idx = 0
            col_idx = -1
            for result_index, result_entry in enumerate(result_list):
                # calculate the exact position within matrix to help
                # decorators updating their results
                col_idx += 1
                if col_idx >= ((row_idx + 1) if skip_upper else index_value):
                    row_idx += 1
                    col_idx = 0
                current_results = result_entry.get("results", [])
                # each of the results has the same configuration of
                # decorators, so we can get one exemplary list of decorators
                # to process all results
                for decorator_key in current_results[0].get("decorator", {}):
                    for index, current_result in enumerate(current_results):
                        try:
                            # if decorator already exists, we only need to
                            # add current data
                            decorator = final_decorators[index][decorator_key]
                            current_decorator = type(decorator)()
                            current_decorator._data = current_result.get(
                                "decorator", {})[decorator_key]
                            current_decorator.row_idx = [row_idx]
                            current_decorator.col_idx = [col_idx]
                            decorator += current_decorator
                        except (IndexError, KeyError):
                            # if decorator does not exist, we load it and
                            # will later add data
                            decorator = Decorator.from_name(decorator_key)
                            decorator._data = current_result.get(
                                "decorator", {})[decorator_key]
                            decorator.row_idx = [row_idx]
                            decorator.col_idx = [col_idx]
                            try:
                                final_decorators[index].setdefault(
                                    decorator_key, decorator)
                            except IndexError:
                                final_decorators.append(
                                    {decorator_key: decorator})
            # format updated data into the first result entry
            finals = result_list[0]
            for index, final in enumerate(finals.get("results", [])):
                for value in final_decorators[index].values():
                    described = value.descriptive_data()
                    final.get("decorator", {})[list(
                        described.keys())[0]] = list(described.values())[0]
            results.setdefault("results", []).append(finals["results"])
    else:
        to_process = []
        if nested:
            to_process = tree_paths
        else:
            to_process.append(tree_paths)
        while to_process:
            single_tree_paths = to_process.pop(0)
            # build prototypes
            prototypes = _initialise_prototypes(single_tree_paths)

            def path_generator():
                # yields (path, maxlen) pairs; maxlen restricts how many
                # prototypes each tree is compared against so the upper
                # triangle and/or diagonal can be skipped
                for tree_index, tree_path in enumerate(single_tree_paths):
                    maxlen = len(single_tree_paths)
                    if skip_upper and skip_diagonal:
                        maxlen = tree_index
                    elif skip_upper:
                        maxlen = tree_index + 1
                    yield (tree_path, maxlen)

            results.setdefault("results", []).append(
                _process_configurations(
                    prototypes=prototypes,
                    configurations=ctx.obj["configurations"],
                    event_generator=path_generator))
    # fix: wrap in determine_version as every other output_results call does;
    # previously the raw directory path was emitted as the version
    output_results(ctx=ctx,
                   results=results,
                   version=determine_version(os.path.dirname(assess.__file__)),
                   source="%s (%s)" % (__file__, "process_as_matrix"),
                   file_type=ctx.obj.get("file_type", None))