Example no. 1
def process_as_vector(ctx, trees, representatives):
    if len(trees) == 0 and len(representatives) == 0 and ctx.obj.get(
            "use_input", False):
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()
        with open(file_path, "r") as input_file:
            # expecting dictionary containing lists for trees and representatives
            data = json.load(input_file).get("data")
            trees = data.get("trees", [])
            representatives = data.get("representatives", [])
    results = _init_results()
    results["files"] = trees
    results["prototypes"] = representatives
    tree_paths = _get_input_files(trees,
                                  minimum=ctx.obj["start"],
                                  maxlen=ctx.obj["maximum"])
    prototype_paths = _get_input_files(representatives)

    # build prototypes
    prototypes = _initialise_prototypes(prototype_paths)

    def path_generator():
        for path in tree_paths:
            yield (path, len(prototypes))

    results["results"] = _process_configurations(
        prototypes=prototypes,
        configurations=ctx.obj["configurations"],
        event_generator=path_generator)
    output_results(ctx=ctx,
                   results=results,
                   version=determine_version(os.path.dirname(assess.__file__)),
                   source="%s (%s)" % (__file__, "process_as_vector"),
                   file_type=ctx.obj.get("file_type", None))
def index_valid_trees(ctx, paths, pcount):
    """
    Method walks the given paths and reads all trees that are found within. For each tree that
    can successfully be read, it is appended to the results list. This list can be used for
    further processing.

    :param ctx: Click context
    :param paths: The paths to scan for valid tree data
    """
    results = []
    filenames = []
    for path in paths:
        filenames.extend(_relevant_files_for_context(ctx, path))
    if pcount > 1:
        results.extend(
            do_multicore(count=pcount, target=_valid_tree, data=filenames))
    else:
        for filename in filenames:
            result = _valid_tree(filename)
            if result is not None:
                results.append(result)

    output_results(ctx=ctx,
                   results=results,
                   version=determine_version(
                       os.path.dirname(assess_workflows.__file__)),
                   source="%s (%s)" % (__file__, "index_valid_trees"))
def transform_matrix_to_adjacency_list(ctx, symmetric):
    if ctx.obj.get("use_input", False):
        ctx.obj["json"] = True
        results = {}
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()

        with open(file_path, "r") as input_file:
            input_data = json.load(input_file).get("data", None)
            files = input_data["files"]
            for result_idx, result in enumerate(input_data["results"][0]):
                decorator = result["decorator"]["normalized_matrix"]
                for row_idx, row in enumerate(decorator):
                    for col_idx, col in enumerate(row[0]):
                        if col_idx == row_idx:
                            continue
                        results.setdefault(
                            files[result_idx][row_idx],
                            {})[files[result_idx][col_idx]] = col
                        if symmetric:
                            results.setdefault(
                                files[result_idx][col_idx],
                                {})[files[result_idx][row_idx]] = col
        output_results(ctx=ctx,
                       results=results,
                       version=determine_version(
                           os.path.dirname(assess_workflows.__file__)),
                       source="%s (%s)" %
                       (__file__, "transform_matrix_to_adjacency_list"))
def transform_matrix_to_sql(ctx):
    if ctx.obj.get("use_input", False):
        result = "INSERT INTO object_distances (a, b, d) VALUES\n"
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()

        with open(file_path, "r") as input_file:
            input_data = json.load(input_file).get("data", None)

            # data contains a list of lists; each inner list is one row of the matrix.
            # Everything on and above the diagonal is 0, so those entries can be skipped here.
            data = input_data["results"][0]["decorator"]["normalized_matrix"]
            for row_index in range(0, len(data)):
                for column_index in range(0, row_index):
                    if len(result) > 46:  # 46 == length of the INSERT INTO ... VALUES header, so a tuple was already appended
                        result += ",\n"
                    result += "(%d,%d,%s)" % (row_index, column_index,
                                              data[row_index][column_index])
        output_results(ctx=ctx,
                       results=result + ";",
                       version=determine_version(
                           os.path.dirname(assess_workflows.__file__)),
                       source="%s (%s)" %
                       (__file__, "transform_matrix_to_sql"))
Example no. 5
def analyse_diamonds(ctx, pcount):
    """
    Method writes an output file that follows this format:

    {
        node_count: {
            p_value: {
                "raw": [[diamond levels], ...],
                "identities": [identity_count, ...],
                "diamonds": [diamond_count, ...],
                "files": [file_path, ...]
            }
        }
    }

    :param ctx: Click context
    :param pcount: Number of processes to use for parallel processing
    :return:
    """
    results = MulticoreResult()
    ctx.obj["json"] = True
    if ctx.obj.get("use_input", False):
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()
        signature_builders = ctx.obj.get("configurations",
                                         [{}])[0].get("signatures", [])

        with open(file_path, "r") as input_file:
            analysis_files = json.load(input_file).get("data", None)
            if pcount > 1:
                data = [{
                    "node_count": node_count,
                    "filepath": tree_path[0],
                    "signature_builders": signature_builders
                } for node_count, tree_paths in analysis_files.items()
                        for tree_path in tree_paths]
                multicore_results = do_multicore(count=pcount,
                                                 target=_analyse_diamonds,
                                                 data=data)
                for result in multicore_results:
                    results += result
            else:
                for node_count, tree_paths in analysis_files.items():
                    for tree_path in tree_paths:
                        results += _analyse_diamonds({
                            "node_count": node_count,
                            "filepath": tree_path[0],
                            "signature_builders": signature_builders
                        })

    output_results(ctx=ctx,
                   results=results,
                   version=determine_version(
                       os.path.dirname(assess_workflows.__file__)),
                   source="%s (%s)" % (__file__, "analyse_diamonds"))
def perform_clustering(ctx, eta, epsilon):
    results = {}

    if ctx.obj.get("use_input", False):
        configuration = ctx.obj.get("configurations", None)[0]
        signature = configuration.get("signatures", [None])[0]
        distance = configuration.get("distances", [None])[0]
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()

        tree_builder = CSVTreeBuilder()
        clustering = Clustering(distance=distance,
                                cluster_distance=epsilon,
                                core_neighbours=eta)
        with open(file_path, "r") as input_file:
            input_data = json.load(input_file).get("data", None)

            for sample in input_data.get("samples", []):
                tree = tree_builder.build(sample[0])
                # convert tree to index
                tree_index = tree.to_index(
                    signature=signature,
                    start_support=distance.supported.get(
                        ProcessStartEvent, False),
                    exit_support=distance.supported.get(
                        ProcessExitEvent, False),
                    traffic_support=distance.supported.get(
                        TrafficEvent, False))
                clustering[sample[0]] = tree_index
        print("---> performed clustering with eta %s and epsilon %s" %
              (eta, epsilon))
        results.setdefault(
            "meta", {})["algorithm"] = clustering.clusterer.__class__.__name__
        results.setdefault("meta", {})["eta"] = eta
        results.setdefault("meta", {})["epsilon"] = epsilon
        for cluster in clustering:
            results.setdefault("clusters",
                               []).append([node.key for node in cluster
                                           ])  # TODO: determine CR
        for noise in clustering.clusterer.noise:
            results.setdefault("noise", []).append(noise.key)
        for score in [
                silhouette_score, calinski_harabasz_score, davies_bouldin_score
        ]:
            try:
                the_score = score(clustering.clusterer.clusters,
                                  clustering.clusterer.graph)
            except ValueError:
                the_score = None
            results.setdefault("scores", {})[score.__name__] = the_score

    output_results(ctx=ctx,
                   results=results,
                   version=determine_version(
                       os.path.dirname(assess_workflows.__file__)),
                   source="%s (%s)" % (__file__, "perform_clustering"))
def perform_precalculated_clustering(ctx, eta, epsilon):
    results = {}

    if ctx.obj.get("use_input", False):
        configuration = ctx.obj.get("configurations", None)[0]
        distance = configuration.get("distances", [None])[0]
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path(
            file_type="csv")  # expecting csv file

        graph = _create_graph(ctx, file_path)
        for single_eta in eta:
            for single_epsilon in epsilon:
                start = time.time()
                clustering = DenGraphIO(base_graph=graph,
                                        cluster_distance=single_epsilon,
                                        core_neighbours=single_eta)
                end = time.time()
                cluster_distance = ClusterDistance(distance=distance())
                clustering.graph.distance = cluster_distance
                print(
                    "---> performed clustering with eta %s and epsilon %s in %s"
                    % (single_eta, single_epsilon, end - start))
                results.setdefault("results", []).append({})
                current_result = results["results"][-1]
                current_result.setdefault(
                    "meta", {})["algorithm"] = clustering.__class__.__name__
                current_result.setdefault("meta", {})["eta"] = single_eta
                current_result.setdefault("meta",
                                          {})["epsilon"] = single_epsilon
                current_result["duration"] = end - start
                for cluster_idx, cluster in enumerate(clustering):
                    current_result.setdefault("clusters", []).append(
                        [node.key for node in cluster])  # TODO: determine CR
                    print("[cluster %s] %s" % (cluster_idx, len(cluster)))
                print("[noise] %s" % len(clustering.noise))
                for noise in clustering.noise:
                    current_result.setdefault("noise", []).append(noise.key)
                # for score in [silhouette_score, calinski_harabasz_score, davies_bouldin_score]:
                # for score in [silhouette_score]:
                for score in []:
                    try:
                        the_score = score(clustering.clusters,
                                          clustering.graph)
                    except ValueError:
                        the_score = None
                    current_result.setdefault("scores",
                                              {})[score.__name__] = the_score
                    print("Got a %s of %s" % (score.__name__, the_score))

    output_results(
        ctx=ctx,
        results=results,
        version=determine_version(os.path.dirname(assess_workflows.__file__)),
        source="%s (%s)" % (__file__, "perform_precalculated_clustering"))
def index_data_by_number_of_payloads(ctx, paths):
    results = {}
    for path in paths:
        for filename in glob.glob("%s/*/*-process.csv" % path):
            count = _payload_line_count(filename)
            results.setdefault(count, []).append(filename)
    output_results(
        ctx=ctx,
        results=results,
        version=determine_version(os.path.dirname(assess_workflows.__file__)),
        source="%s (%s)" % (__file__, "index_data_by_number_of_payloads"))
Example no. 9
def full_statistics(ctx, pcount):
    """
    Method prepares full statistics about a dataset. The output is as follows:

    {
        <filename>: {
            "node_count": <int>,  # number of nodes in tree
            "complete_node_count": <int>,  # number of nodes in tree w attributes
            "nodes_with_attribute_count": <int>,  # number of nodes that contain attributes
            "alphabet_count": <int>,  # alphabet count
            "duration": <int>,  # duration of tree
            "fanout": [<int>, ...],  # fanout of nodes
            "complete_fanout": [<int>, ...]  # fanout of nodes w attributes
            "depth": [<int>, ...],  # depth in tree for leaves
            "complete_depth": [<int>, ...],  # depth in tree for leaves w attributes
            "attribute_event_count": [<int>, ...]  # events for attributes per node
        }
    }

    :param ctx: Click context
    :param pcount: Number of processes to use for parallel processing
    :return:
    """
    results = MulticoreResult()
    ctx.obj["json"] = True
    if ctx.obj.get("use_input", False):
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()

        with open(file_path, "r") as input_file:
            analysis_files = json.load(input_file).get("data", None)
            data = []
            for node_count, tree_paths in analysis_files.items():
                for tree_path in tree_paths:
                    if isinstance(tree_path, list):
                        for path in tree_path:
                            data.append({"filepath": path})
                    else:
                        data.append({"filepath": tree_path})
            if pcount > 1:
                multicore_result = do_multicore(count=pcount,
                                                target=_full_statistics,
                                                data=data)
                for result in multicore_result:
                    results += result
            else:
                for elem in data:
                    results += _full_statistics(elem)
    output_results(ctx=ctx,
                   results=results,
                   version=determine_version(
                       os.path.dirname(assess_workflows.__file__)),
                   source="%s (%s)" % (__file__, "full_statistics"))
def validate_representatives(ctx, eta, epsilon, threshold):
    """
    Method performs a clustering and, from the resulting clusters, calculates the cluster
    representatives to measure our current representation. To validate the cluster
    representatives we evaluate the distance of each clustered object to the calculated cluster
    representative. Whenever the distance is bigger than the given epsilon for clustering, we
    have a bad representation.

    :param ctx: Click context
    :param eta: Number of core neighbours used for clustering
    :param epsilon: Cluster distance used for clustering
    :param threshold: Threshold passed to the ClusterDistance
    :return:
    """
    if ctx.obj.get("use_input", False):
        configuration = ctx.obj.get("configurations", None)[0]
        distance_cls = configuration.get("distances", [None])[0]
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path(
            file_type="csv")  # expecting csv file

        graph = _create_graph(ctx, file_path)
        clustering = DenGraphIO(base_graph=graph,
                                cluster_distance=epsilon,
                                core_neighbours=eta)
        cluster_distance = ClusterDistance(distance=distance_cls(),
                                           threshold=threshold)
        clustering.graph.distance = cluster_distance

        results = {
            "meta": {
                "eta": eta,
                "epsilon": epsilon,
                "threshold": threshold
            },
            "clusters": {}
        }
        # calculate CRs from clusters
        for cluster_index, cluster in enumerate(clustering):
            cluster_representative = cluster_distance.mean(list(cluster))
            for tree_object in cluster:
                # calculate distance to cluster representative
                distance = cluster_distance(cluster_representative,
                                            tree_object)
                results["clusters"].setdefault(cluster_index, {}).setdefault(
                    "tree", []).append(tree_object.key)
                results["clusters"].setdefault(cluster_index, {}).setdefault(
                    "distance", []).append(distance)

        output_results(ctx=ctx,
                       results=results,
                       version=determine_version(
                           os.path.dirname(assess_workflows.__file__)),
                       source="%s (%s)" %
                       (__file__, "validate_representatives"))
def aggregate_samples(ctx, skip_key):
    """
    Method aggregates nested dictionaries into a flat one. If it already is a flat dictionary,
    then the data is kept and written as-is.

    :param ctx: Click context
    :param skip_key: If set, list values are collected under the single key "samples"
    :return:
    """
    results = {}

    if ctx.obj.get("use_input", False):
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()

        with open(file_path, "r") as input_file:
            input_data = json.load(input_file).get("data", None)

            for key, values in input_data.items():
                if isinstance(values, list):
                    results.setdefault(key if not skip_key else "samples",
                                       []).extend(values)
                else:
                    try:
                        if len(values[0]) > 1:
                            # flattening data
                            results.setdefault(key, []).append([
                                element for value in values[0]
                                for element in value
                            ])
                        else:
                            # data can be kept
                            results.setdefault(key, []).append(values[0])
                    except KeyError:
                        to_check = [values]
                        while to_check:
                            current_item = to_check.pop(0)
                            try:
                                while current_item:
                                    _, value = current_item.popitem()
                                    to_check.append(value)
                            except KeyError:
                                results.setdefault(key,
                                                   []).append(current_item)
                            except AttributeError:
                                results.setdefault(key,
                                                   []).extend(current_item)

    output_results(ctx=ctx,
                   results=results,
                   version=determine_version(
                       os.path.dirname(assess_workflows.__file__)),
                   source="%s (%s)" % (__file__, "aggregate_samples"))
Example no. 12
def batch_process_from_pkl(ctx, pcount, reverse):
    results = _init_results()
    results["distance"] = []
    results["prototypes"] = []
    if ctx.obj.get("use_input", False):
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path(file_type="pkl")
        with open(file_path, "r") as input_file:
            data = []
            # HEADER
            # ######
            # results are split into a header and data files
            # see data_generation_cli.generate_perturbated_tree
            tree_metadata = pickle.load(input_file)
            results["files"] = tree_metadata.keys()
            for key, pkl_path in tree_metadata.items():
                # tree is stored in "tree"
                # distorted trees in "perturbated_tree"
                data.append({
                    "data_pkl_path": pkl_path,
                    "data_pkl_key": key,
                    "configurations": ctx.obj["configurations"],
                    "reverse": reverse
                })
            if pcount > 1:
                result_list = (do_multicore(
                    count=pcount,
                    data=data,
                    target=_process_configurations_for_row))
                for result in result_list:
                    results["results"].append(result['results'])
                    results["distance"].append(result['precalculated_costs'])
                    results["prototypes"].append(result["prototypes"])
            else:
                for elem in data:
                    result = _process_configurations_for_row(elem)
                    results["results"].append(result["results"])
                    results["distance"].append(result["precalculated_costs"])
                    results["prototypes"].append(result["prototypes"])
    output_results(ctx=ctx,
                   results=results,
                   version=determine_version(os.path.dirname(assess.__file__)),
                   source="%s (%s)" % (__file__, "batch_process_from_pkl"))
def index_tree_statistics(ctx, paths, pcount):
    filenames = []
    results = MulticoreResult()
    for path in paths:
        filenames.extend(_relevant_files_for_context(ctx, path))
    if pcount > 1:
        result_list = do_multicore(count=pcount,
                                   target=_tree_statistics,
                                   data=filenames)
        for result in result_list:
            results += result
    else:
        for filename in filenames:
            results += _tree_statistics(filename)

    output_results(ctx=ctx,
                   results=results,
                   version=determine_version(
                       os.path.dirname(assess_workflows.__file__)),
                   source="%s (%s)" % (__file__, "index_tree_statistics"))
def index_process_names(ctx, paths, pcount):
    filenames = []
    result_set = set()
    for path in paths:
        filenames.extend(_relevant_files_for_context(ctx, path))
    if pcount > 1:
        result_list = do_multicore(count=pcount,
                                   target=_process_names,
                                   data=filenames)
        for result in result_list:
            result_set.update(result)  # set.union would return a new set instead of updating
    else:
        for filename in filenames:
            result_set.update(_process_names(filename))

    output_results(ctx=ctx,
                   results={"process_names": [name for name in result_set]},
                   version=determine_version(
                       os.path.dirname(assess_workflows.__file__)),
                   source="%s (%s)" % (__file__, "index_process_names"))
def transform_mapping_to_sql(ctx, paths):
    if ctx.obj.get("use_input", False) or len(paths) > 0:
        result = ""
        update_cmd = "update payload_result set payload_id='%s' where id=%s;\n"

        for path in paths:
            print("starting with %s" % path)
            with open(path, "r") as input_file:
                reader = csv.reader(input_file, quotechar="'")
                for row in reader:
                    result += update_cmd % (
                        re.match(r"([\d-]+)", row[1]).group(),
                        re.match(r"(\d+)", row[0]).group())

        output_results(ctx=ctx,
                       results=result + ";",
                       version=determine_version(
                           os.path.dirname(assess_workflows.__file__)),
                       source="%s (%s)" %
                       (__file__, "transform_mapping_to_sql"))
def subset_data(ctx, include_key):
    """
    Example Usage of this method:
        include_key="'lambda key, value: int(key) > 25'"

    :param ctx: Click context
    :param include_key: String containing a lambda expression that decides which keys to include
    :return:
    """
    results = {}
    if ctx.obj.get("use_input", False):
        include_key = eval(include_key)  # evaluate the lambda expression passed as a string
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()
        with open(file_path, "r") as input_file:
            input_data = json.load(input_file).get("data", None)
            for key, value in input_data.items():
                if isinstance(value, dict):
                    for inner_key, inner_value in value.items():
                        if isinstance(inner_value, dict):
                            for inner_inner_key, inner_inner_value in inner_value.items(
                            ):
                                if include_key(inner_inner_key,
                                               inner_inner_value):
                                    results.setdefault(key, {}).setdefault(
                                        inner_key, {}
                                    )[inner_inner_key] = inner_inner_value
                        else:
                            if include_key(inner_key, inner_value):
                                results.setdefault(key,
                                                   {})[inner_key] = inner_value
                else:
                    if include_key(key, value):
                        results[key] = value

    output_results(ctx=ctx,
                   results=results,
                   version=determine_version(
                       os.path.dirname(assess_workflows.__file__)),
                   source="%s (%s)" % (__file__, "subset_data"))
def pick_samples(ctx, seed, repeat, count, minimum, skip_key):
    results = {}

    if seed is not None:
        random.seed(seed)
    if ctx.obj.get("use_input", False):
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()

        with open(file_path, "r") as input_file:
            input_data = json.load(input_file).get("data", None)
            working_data = input_data
            try:
                if skip_key:
                    # flatten all values into a single pool; the AttributeError
                    # handler below then samples from it under the key "samples"
                    working_data = {
                        value
                        for values in input_data.values() for value in values
                    }
                for key, values in working_data.items():
                    try:
                        for _ in range(repeat):
                            results.setdefault(key, []).append(
                                random.sample(values, count))
                    except ValueError:
                        if minimum == 0 or minimum <= len(values):
                            results.setdefault(key, []).append(values)
                        else:
                            continue
            except AttributeError:
                key = "samples"
                for _ in range(repeat):
                    results.setdefault(key, []).append(
                        random.sample(working_data, count))

    output_results(ctx=ctx,
                   results=results,
                   version=determine_version(
                       os.path.dirname(assess_workflows.__file__)),
                   source="%s (%s)" % (__file__, "pick_samples"))
def index_data_by_number_of_events(ctx, paths):
    results = {}
    filenames = []
    for path in paths:
        filenames.extend(_relevant_files_for_context(ctx, path))
    for filename in filenames:
        basename = os.path.basename(filename)
        db_id = basename.split("-")[0]
        # access process file
        count = _line_count(
            filename=filename) * 2  # start and finishing of a process
        # access traffic file (if existent)
        traffic_count = _line_count(
            filename=os.path.join(os.path.dirname(filename), "%s-traffic.csv" %
                                  db_id))
        results.setdefault(count + traffic_count, []).append(filename)

    output_results(
        ctx=ctx,
        results=results,
        version=determine_version(os.path.dirname(assess_workflows.__file__)),
        source="%s (%s)" % (__file__, "index_data_by_number_of_events"))
Example no. 19
def analyse_diamond_perturbations(ctx, pcount):
    results = MulticoreResult()
    ctx.obj["json"] = True
    if ctx.obj.get("use_input", False):
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()
        signature_builders = ctx.obj.get("configurations",
                                         [{}])[0].get("signatures", [])

        with open(file_path, "r") as input_file:
            analysis_files = json.load(input_file).get("data", None)
            if pcount > 1:
                # combine data
                data = [{
                    "filepath": path[0],
                    "signature_builders": signature_builders
                } for paths in analysis_files.values() for path in paths]
                multicore_results = do_multicore(
                    count=pcount,
                    target=_analyse_diamond_perturbation,
                    data=data)
                for result in multicore_results:
                    results += result
            else:
                for tree_paths in analysis_files.values():
                    for tree_path in tree_paths:
                        results += _analyse_diamond_perturbation({
                            "filepath": tree_path[0],
                            "signature_builders": signature_builders
                        })

    output_results(
        ctx=ctx,
        results=results,
        version=determine_version(os.path.dirname(assess_workflows.__file__)),
        source="%s (%s)" % (__file__, "analyse_diamond_perturbation"))
Example no. 20
def analyse_duration(ctx, pcount):
    """
    Method prepares duration data for further analysis.

    :param ctx: Click context
    :param pcount: Number of processes to use for parallel processing
    :return:
    """
    results = MulticoreResult()
    ctx.obj["json"] = True
    if ctx.obj.get("use_input", False):
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()

        with open(file_path, "r") as input_file:
            analysis_files = json.load(input_file).get("data", None)
            data = []
            for node_count, tree_paths in analysis_files.items():
                for tree_path in tree_paths:
                    if isinstance(tree_path, list):
                        for path in tree_path:
                            data.append({"filepath": path})
                    else:
                        data.append({"filepath": tree_path})
            if pcount > 1:
                multicore_result = do_multicore(count=pcount,
                                                target=_analyse_duration,
                                                data=data)
                for result in multicore_result:
                    results += result
            else:
                for elem in data:
                    results += _analyse_duration(elem)
    output_results(ctx=ctx,
                   results=results,
                   version=determine_version(
                       os.path.dirname(assess_workflows.__file__)),
                   source="%s (%s)" % (__file__, "analyse_duration"))
def index_valid_hdf_trees(ctx, trees, representatives, pcount):
    structure = ctx.obj.get("structure", None)
    results = {}
    paths = [(key, value) for key, values in {
        "trees": trees,
        "representatives": representatives
    }.items() for value in values if values]
    if pcount > 1:
        trees = do_multicore(count=pcount, target=_valid_hdf_tree, data=paths)
        for category, tree, name in trees:
            results.setdefault(category, []).append(
                _write_tree_to_pkl(structure, tree, name))
    else:
        for filename in paths:
            trees = _valid_hdf_tree(filename)
            for category, tree, name in trees:
                results.setdefault(category, []).append(
                    _write_tree_to_pkl(structure, tree, name))
    output_results(ctx=ctx,
                   results=results,
                   version=determine_version(
                       os.path.dirname(assess_workflows.__file__)),
                   source="%s (%s)" % (__file__, "index_valid_hdf_trees"))
def index_data_by_activity(ctx):
    results = {}
    with SQLCommand(providerName="PostgresDBProvider",
                    connectionString="dbname=gnm user=gnm") as sql_command:
        fields = [
            "payload_id", "activity", "task_monitor_id", "status_name",
            "workernode.name", "job.run"
        ]
        sql_results = sql_command.execute(
            "select %s from payload_result "
            "inner join workernode on payload_result.workernode_id=workernode.id "
            "inner join payload on payload_result.payload_id=payload.id "
            "inner join job on job.id=payload.job_id "
            "where payload_id!=%%s and"
            "(activity=%%s or activity=%%s or activity=%%s or activity=%%s) and "
            "(status_name=%%s or status_name=%%s or status_name=%%s or status_name=%%s)"
            % ",".join(fields), [
                "", "reprocessing", "production", "analysis", "analysis-crab3",
                "SUCCEEDED", "FAILED", "DONE", "ABORTED"
            ])
        for sql_result in sql_results:
            result = dict(zip(fields, sql_result))
            current_result = results.setdefault(result["activity"], {})
            current_result.setdefault(result["status_name"], {}).setdefault(
                result["task_monitor_id"], []).append(
                    os.path.join(
                        "/home/fq8360/data/gnm/payloads",
                        os.path.join(
                            os.path.join(result["workernode.name"],
                                         result["job.run"]),
                            "%s-process.csv" % result["payload_id"])))
    if ctx.obj.get("save", False):
        output_results(ctx=ctx,
                       results=results,
                       version=determine_version(
                           os.path.dirname(assess_workflows.__file__)),
                       source="%s (%s)" % (__file__, "index_data_by_activity"))
def transform_matrix_to_csv(ctx, key, has_ensemble):
    """
    has_ensemble means that for different identities values are encoded.

    :param ctx:
    :param key:
    :param has_ensemble:
    :return:
    """
    if ctx.obj.get("use_input", False):
        results = ""
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()

        with open(file_path, "r") as input_file:
            input_data = json.load(input_file).get("data", None)
            files = input_data["files"]
            for result_idx, result in enumerate(input_data["results"][0]):
                decorator = result["decorator"][key]
                maximum_index = len(decorator)
                results += ",".join(files[result_idx])
                results += "\n"
                for row_index in range(0, maximum_index):
                    row = [0 for _ in range(row_index + 1)]
                    for col_index in range(row_index + 1, maximum_index):
                        if has_ensemble:
                            row.append(decorator[col_index][0][row_index])
                        else:
                            row.append(decorator[col_index][row_index])
                    results += "%s\n" % ",".join([str(item) for item in row])
        output_results(ctx=ctx,
                       results=results,
                       file_type="csv",
                       version=determine_version(
                           os.path.dirname(assess_workflows.__file__)),
                       source="%s (%s)" %
                       (__file__, "transform_matrix_to_csv"))
def index_data_by_number_of_nodes(ctx, paths):
    results = {}
    if ctx.obj.get("use_input", False):
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()
        with open(file_path, "r") as input_file:
            paths = list(paths)
            paths.extend(json.load(input_file)["data"])
    filenames = []
    for path in paths:
        filenames.extend(_relevant_files_for_context(ctx, path))
    for filename in filenames:
        count = _line_count(filename)
        results.setdefault(count, []).append(filename)

    output_results(
        ctx=ctx,
        results=results,
        version=determine_version(os.path.dirname(assess_workflows.__file__)),
        source="%s (%s)" % (__file__, "index_data_by_number_of_nodes"))
Example no. 25
def analyse_metric(ctx):
    """
    Method analyses the mean relative deviation for given distance functions to determine if those
    might be metrics, pseudo-metrics, etc.

    :param ctx: Click context
    """
    results = ""
    latex_results = ""
    if ctx.obj.get("use_input", False):
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()
        data = {}

        with open(file_path, "r") as input_file:
            analysis_data = json.load(input_file).get("data", None)

            for result_idx, result in enumerate(
                    analysis_data.get("results", [])):
                for result_entry in result:
                    algorithm = result_entry.get("algorithm", None)
                    decorator_data = result_entry.get("decorator",
                                                      {}).get("matrix", [])
                    tree_sizes = result_entry.get("decorator", {}).get(
                        "data", [])["prototypes"]["original"][0]
                    diagonal_issues = other_issues = 0
                    diagonal_values = []
                    other_values = []
                    for row_idx, row_data in enumerate(decorator_data):
                        for col_idx in range(row_idx, len(decorator_data)):
                            if row_idx == col_idx:
                                value = row_data[0][col_idx]
                                # we got the diagonal, so check
                                if value is None:
                                    continue
                                diagonal_values.append((
                                    value,
                                    tree_sizes[row_idx] * 2,
                                    value,
                                    tree_sizes[col_idx] * 2,
                                ))
                                if value != 0:
                                    diagonal_issues += 1
                            else:
                                # we got each other value, so check
                                left_value = row_data[0][col_idx]
                                right_value = decorator_data[col_idx][0][
                                    row_idx]
                                if left_value is None or right_value is None:
                                    continue
                                other_values.append((
                                    left_value,
                                    tree_sizes[row_idx] * 2,
                                    right_value,
                                    tree_sizes[col_idx] * 2,
                                ))
                                if left_value != right_value:
                                    other_issues += 1
                    current_algorithm = data.setdefault(algorithm, {})
                    try:
                        current_algorithm["diagonal"].extend(diagonal_values)
                        current_algorithm["other"].extend(other_values)
                    except KeyError:
                        current_algorithm["diagonal"] = diagonal_values
                        current_algorithm["other"] = other_values
            for key, values in data.items():
                statistics_algorithm = re.search(r"cache_statistics=(\w+)",
                                                 key).group(1)
                statistics_variant = int(
                    float(re.search(r"weight=(\d\.\d)", key).group(1)) * 10)
                diagonal_mean, diagonal_error = uncorrelated_relative_deviation_and_standard_error(
                    values["diagonal"], 0)
                other_mean, other_error = uncorrelated_relative_deviation_and_standard_error(
                    values["other"])
                # save results
                latex_results += "\\def\\%srelativediagonalmean%s{%s}\n" % (
                    statistics_algorithm, chr(statistics_variant + 97),
                    diagonal_mean)
                latex_results += "\\def\\%srelativediagonalssd%s{%s}\n" % (
                    statistics_algorithm, chr(statistics_variant + 97),
                    diagonal_error)
                latex_results += "\\def\\%srelativemean%s{%s}\n" % (
                    statistics_algorithm, chr(statistics_variant + 97),
                    other_mean)
                latex_results += "\\def\\%srelativessd%s{%s}\n" % (
                    statistics_algorithm, chr(statistics_variant + 97),
                    other_error)

    output_results(ctx=ctx,
                   results=latex_results,
                   version=determine_version(
                       os.path.dirname(assess_workflows.__file__)),
                   source="%s (%s)" % (__file__, "analyse_metric"),
                   file_type="tex",
                   comment_sign="%")
Example no. 26
def analyse_compression(ctx, pcount):
    """
    Method prepares data for further compression analysis. To this end, it collects information on
    * number of nodes in original tree
    * height of tree as an optional information
    * size of the alphabet (optimised by excluding id numbers in names)
    * number of unique identities generated
    * statistics on the tree's fanout

    The following output format can be expected

    <number of nodes>: {
        "file": [<string>, ...],
        "alphabet_count": [<int>, ...],
        "tree_height": [<int>, ...],
        "identity_count": {
            <Signature>: [<int>, ...]
        },
        "fanout": {
            "min": [<int>, ...],
            "max": [<int>, ...],
            "mean": [<float>, ...],
            "std": [<float>, ...],
            "full": [[<int>, ...], ...]
        }
    }

    :param ctx: Click context
    :param pcount: Number of processes to use for parallel processing
    :return:
    """
    results = MulticoreResult()
    ctx.obj["json"] = True
    if ctx.obj.get("use_input", False):
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()
        signature_builders = ctx.obj.get("configurations",
                                         [{}])[0].get("signatures", [])

        with open(file_path, "r") as input_file:
            analysis_files = json.load(input_file).get("data", None)
            data = []
            for node_count, tree_paths in analysis_files.items():
                for tree_path in tree_paths:
                    for path in tree_path:
                        data.append({
                            "node_count": node_count,
                            "filepath": path,
                            "signature_builders": signature_builders
                        })
            if pcount > 1:
                multicore_results = do_multicore(count=pcount,
                                                 target=_analyse_compression,
                                                 data=data)
                for result in multicore_results:
                    results += result
            else:
                for elem in data:
                    results += _analyse_compression(elem)

    output_results(ctx=ctx,
                   results=results,
                   version=determine_version(
                       os.path.dirname(assess_workflows.__file__)),
                   source="%s (%s)" % (__file__, "analyse_compression"))
def squish_index_into_ranges(ctx, range_width, maximum_chain):
    results = {}
    if ctx.obj.get("use_input", False):

        def probability_function(one, two):
            return abs(one - two) / (max(one, two) * range_width)

        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()
        with open(file_path, "r") as input_file:
            input_data = json.load(input_file)["data"]
            # keys are numbers of nodes, events, or times
            keys = [int(key) for key in input_data.keys()]
            keys.sort()
            # indices of neighbouring keys that overlap according to probability_function
            probabilities = [
                index for index, probability in enumerate(
                    probability_function(one, two)
                    for one, two in zip(keys, keys[1:])) if probability <= 1
            ]
            # pick ranges from sequences of overlapping numbers
            while probabilities:
                index = probabilities[0]
                # check all indexes that belong to a chain
                key_chain = [keys[index]]
                merged = {keys[index]: input_data.pop(str(keys[index]))}
                while index in probabilities:
                    probabilities.pop(0)
                    index += 1
                    try:
                        merged[keys[index]] = input_data.pop(str(keys[index]))
                        key_chain.append(keys[index])
                    except KeyError:  # TODO: is this still needed?
                        continue
                # split chain if it is too long
                key_collection = [key_chain]
                while key_collection:
                    current_key = key_collection.pop()
                    if len(current_key) > maximum_chain:
                        if len(current_key) % maximum_chain == 0:
                            split_point = maximum_chain
                        else:
                            split_point = int(
                                len(current_key) /
                                (len(current_key) // maximum_chain + 1))
                        while current_key:
                            key_collection.append(current_key[:split_point])
                            current_key = current_key[split_point:]
                    else:
                        mean = sum(current_key) / len(current_key)
                        merged_files = [merged.pop(key) for key in current_key]
                        input_data[str(mean)] = [
                            value for values in merged_files
                            for value in values
                        ]
            results = input_data

    output_results(ctx=ctx,
                   results=results,
                   version=determine_version(
                       os.path.dirname(assess_workflows.__file__)),
                   source="%s (%s)" % (__file__, "squish_index_into_ranges"))
Example no. 28
def batch_process_as_vector(ctx, pcount):
    results = []

    if ctx.obj.get("use_input", False):
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()
        with open(file_path, "r") as input_file:
            input_data = json.load(input_file).get("data")
            data = []
            if "trees" in input_data and "representatives" in input_data:
                trees = input_data.get("trees", [])
                prototypes = input_data.get("representatives", [])
                if len(trees) >= len(prototypes):
                    for tree in trees:
                        data.append({
                            "configurations": ctx.obj["configurations"],
                            "files": [tree],
                            "prototypes": prototypes
                        })
                else:
                    for prototype in prototypes:
                        data.append({
                            "configurations": ctx.obj["configurations"],
                            "files": trees,
                            "prototypes": [prototype]
                        })
            else:
                for key, values in input_data.items():
                    for value in values:
                        if len(value) == 1:
                            # element is file and prototype at the same time
                            value.append(value[0])
                        data.append({
                            "configurations": ctx.obj["configurations"],
                            "files": value[:1],
                            "prototypes": value[1:],
                            "key": key
                        })
            if pcount > 1:
                final_decorators = []
                row_idx = col_idx = -1
                result_list = do_multicore(pcount, _batch_process_as_vector,
                                           data)
                for result_entry in result_list:
                    if len(trees) >= len(prototypes):
                        row_idx += 1
                        if col_idx < 0:
                            col_idx = 0
                    else:
                        col_idx += 1
                        if row_idx < 0:
                            row_idx = 0
                    current_results = result_entry.get("results", [])
                    for decorator_key in current_results[0].get(
                            "decorator", {}):
                        for index, current_result in enumerate(
                                current_results):
                            try:
                                # if decorator already exists, we only need to add current data
                                decorator = final_decorators[index][
                                    decorator_key]
                                current_decorator = type(decorator)()
                                current_decorator._data = current_result.get(
                                    "decorator", {})[decorator_key]
                                current_decorator.row_idx = [row_idx]
                                current_decorator.col_idx = [col_idx]
                                decorator += current_decorator
                            except IndexError:
                                # if decorator does not exist, we load it and will later add data
                                decorator = Decorator.from_name(decorator_key)
                                decorator._data = current_result.get(
                                    "decorator", {})[decorator_key]
                                decorator.row_idx = [row_idx]
                                decorator.col_idx = [col_idx]
                                try:
                                    final_decorators[index].setdefault(
                                        decorator_key, decorator)
                                except IndexError:
                                    final_decorators.append(
                                        {decorator_key: decorator})
                finals = result_list[0]
                finals["files"] = trees
                finals["prototypes"] = prototypes
                for index, final in enumerate(finals.get("results", [])):
                    for value in final_decorators[index].values():
                        data = value.descriptive_data()
                        final.get("decorator", {})[list(
                            data.keys())[0]] = list(data.values())[0]
                results.append(finals)
            else:
                for elem in data:
                    results.append(_batch_process_as_vector(elem))

    output_results(ctx=ctx,
                   results=results,
                   version=determine_version(os.path.dirname(assess.__file__)),
                   source="%s (%s)" % (__file__, "batch_process_as_vector"),
                   file_type=ctx.obj.get("file_type", None))
def perform_classification(ctx, eta, epsilon):
    """
    Method performs a classification. Before the actual classification can be tested, a clustering
    is applied. The resulting clusters are then used for classification.

    :param ctx: Click context
    :param eta: Number of core neighbours used for clustering
    :param epsilon: Cluster distance used for clustering
    :return:
    """
    if ctx.obj.get("use_input", False):
        configuration = ctx.obj.get("configurations", None)[0]
        distance_cls = configuration.get("distances", [None])[0]
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path(
            file_type="csv")  # expecting csv file

        graph = _create_graph(ctx, file_path)
        clustering = DenGraphIO(base_graph=graph,
                                cluster_distance=epsilon,
                                core_neighbours=eta)
        cluster_distance = ClusterDistance(distance=distance_cls(),
                                           threshold=0)
        clustering.graph.distance = cluster_distance
        # calculate CRs from clusters
        prototype_caches = []
        cluster_names = []
        for cluster in clustering:
            for core in cluster.core_nodes:
                cluster_names.append(core.key)
                prototype_caches.append(core)
        # abusing the mean
        prototype_caches = cluster_distance.mean(prototype_caches,
                                                 prototype=cluster_names)

        results = {"files": [], "prototypes": cluster_names[:], "results": []}
        decorator_def = configuration.get("decorator", None)
        for algorithm_def in configuration.get("algorithms", []):
            for signature_def in configuration.get("signatures", []):
                for event_streamer in configuration.get(
                        "event_streamer", [GNMCSVEventStreamer]):
                    signature = signature_def()
                    algorithm = algorithm_def(signature=signature)
                    algorithm.cluster_representatives(
                        signature_prototypes=prototype_caches,
                        prototypes=cluster_names)
                    decorator = decorator_def()
                    decorator.wrap_algorithm(algorithm=algorithm)
                    # start a new tree so former results are not mixed with current ones
                    for node in clustering.graph:
                        tree = event_streamer(csv_path=node.key)
                        results["files"].append(node.key)
                        algorithm.start_tree()
                        for event in tree.event_iter():
                            try:
                                algorithm.add_event(event)
                            except EventNotSupportedException:
                                pass
                        algorithm.finish_tree()
                    if decorator:
                        results["results"].append({
                            "algorithm":
                            "%s" % algorithm,
                            "signature":
                            "%s" % signature,
                            "event_streamer":
                            "%s" %
                            tree if tree is not None else event_streamer(
                                csv_path=None),
                            "decorator":
                            decorator.descriptive_data()
                        })
        output_results(ctx=ctx,
                       results=results,
                       version=determine_version(
                           os.path.dirname(assess_workflows.__file__)),
                       source="%s (%s)" % (__file__, "perform_classification"))
Example no. 30
def process_as_matrix(ctx, trees, skip_upper, skip_diagonal, pcount):
    if len(trees) == 0 and ctx.obj.get("use_input", False):
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()
        with open(file_path, "r") as input_file:
            # can be list of lists or flat list
            data = json.load(input_file).get("data")
            try:
                trees = data.values()[0]
            except AttributeError:
                trees = data
            except TypeError:
                # object is dictionary with trees
                trees = data.get("trees", [])
    results = _init_results()
    results["files"] = results["prototypes"] = trees[:]

    # a nested list of lists is taken as-is; a flat list is expanded to input files first
    if isinstance(trees[0], list):
        tree_paths = trees
        nested = True
    else:
        tree_paths = _get_input_files(trees,
                                      minimum=ctx.obj["start"],
                                      maxlen=ctx.obj["maximum"])
        nested = False

    if pcount > 1:
        to_process = []
        if nested:
            to_process = tree_paths
        else:
            to_process.append(tree_paths)
        while to_process:
            data = []
            single_tree_paths = to_process.pop(0)
            # prepare blocks of data
            factor = multicore_factor(len(single_tree_paths))
            block_size = len(single_tree_paths) / float(factor)
            assert block_size > 1, "Blocksize is too small for proper parallelisation: %s" % block_size
            index_value = int(math.ceil(len(single_tree_paths) / block_size))
            for row_idx in range(index_value):
                for col_idx in range(index_value):
                    if skip_upper and col_idx > row_idx:
                        continue
                    row_trees = single_tree_paths[
                        int(row_idx *
                            block_size):min(int(
                                (row_idx + 1) *
                                block_size), len(single_tree_paths))]
                    col_trees = single_tree_paths[
                        int(col_idx *
                            block_size):min(int(
                                (col_idx + 1) *
                                block_size), len(single_tree_paths))]
                    data.append({
                        "tree_paths": row_trees,
                        "prototype_paths": col_trees,
                        "configurations": ctx.obj["configurations"]
                    })
            result_list = do_multicore(count=pcount,
                                       target=_process_as_matrix,
                                       data=data)
            final_decorators = []
            row_idx = 0
            col_idx = -1
            for result_index, result_entry in enumerate(result_list):
                # calculate the exact position within matrix to help decorators updating their results
                col_idx += 1
                if col_idx >= ((row_idx + 1) if skip_upper else index_value):
                    row_idx += 1
                    col_idx = 0
                current_results = result_entry.get("results", [])
                # each of the results has the same configuration of decorators, so we can get one
                # exemplary list of decorators to process all results
                for decorator_key in current_results[0].get("decorator", {}):
                    for index, current_result in enumerate(current_results):
                        try:
                            # if decorator already exists, we only need to add current data
                            decorator = final_decorators[index][decorator_key]
                            current_decorator = type(decorator)()
                            current_decorator._data = current_result.get(
                                "decorator", {})[decorator_key]
                            current_decorator.row_idx = [row_idx]
                            current_decorator.col_idx = [col_idx]
                            decorator += current_decorator
                        except (IndexError, KeyError):
                            # if decorator does not exist, we load it and will later add data
                            decorator = Decorator.from_name(decorator_key)
                            decorator._data = current_result.get(
                                "decorator", {})[decorator_key]
                            decorator.row_idx = [row_idx]
                            decorator.col_idx = [col_idx]
                            try:
                                final_decorators[index].setdefault(
                                    decorator_key, decorator)
                            except IndexError:
                                final_decorators.append(
                                    {decorator_key: decorator})
            # format updated data
            finals = result_list[0]
            for index, final in enumerate(finals.get("results", [])):
                for value in final_decorators[index].values():
                    data = value.descriptive_data()
                    final.get("decorator", {})[list(data.keys())[0]] = list(
                        data.values())[0]
            results.setdefault("results", []).append(finals["results"])
    else:
        to_process = []
        if nested:
            to_process = tree_paths
        else:
            to_process.append(tree_paths)
        while to_process:
            single_tree_paths = to_process.pop(0)
            # build prototypes
            prototypes = _initialise_prototypes(single_tree_paths)

            def path_generator():
                for tree_index, tree_path in enumerate(single_tree_paths):
                    maxlen = len(single_tree_paths)
                    if skip_upper and skip_diagonal:
                        maxlen = tree_index
                    elif skip_upper:
                        maxlen = tree_index + 1
                    yield (tree_path, maxlen)

            results.setdefault("results", []).append(
                _process_configurations(
                    prototypes=prototypes,
                    configurations=ctx.obj["configurations"],
                    event_generator=path_generator))

    output_results(ctx=ctx,
                   results=results,
                   version=determine_version(os.path.dirname(assess.__file__)),
                   source="%s (%s)" % (__file__, "process_as_matrix"),
                   file_type=ctx.obj.get("file_type", None))