def _tree_statistics(filename):
    """
    Count process and traffic events per timestamp for a single tree.

    Generates the following structure:

    <filename>: {
        "process": {<tme>: <number of start and exit events>},
        "traffic": {<tme>: <number of traffic events>},
        "traffic_count": {<tme>: <sum of in_cnt + out_cnt>}
    }

    The entry for the file is only created once the tree yields an event.

    :param filename: Path of the tree to load
    :return: MulticoreResult; empty if the tree could not be built
    """
    result = MulticoreResult()
    builder = CSVTreeBuilder()
    try:
        tree = builder.build(filename)
    except (DataNotInCacheException, TreeInvalidatedException):
        return result
    if tree is None:
        return result

    for event in tree.event_iter():
        counters = result.setdefault(filename, {
            "process": {},
            "traffic": {},
            "traffic_count": {}
        })
        if isinstance(event, (ProcessStartEvent, ProcessExitEvent)):
            processes = counters["process"]
            processes[event.tme] = processes.get(event.tme, 0) + 1
        elif isinstance(event, TrafficEvent):
            traffic = counters["traffic"]
            traffic[event.tme] = traffic.get(event.tme, 0) + 1
            volume = counters["traffic_count"]
            volume[event.tme] = volume.get(
                event.tme, 0) + (event.in_cnt + event.out_cnt)
    return result
Example #2
0
def real_tree(path=None, absolute=False):
    """
    Build a tree from a CSV file, by default from the bundled test data.

    :param path: Path to the CSV file; defaults to
        "data/c01-007-102/1/1-process.csv"
    :param absolute: If true, the path is used as given; otherwise it is
        resolved relative to the directory of the assess_tests package
    :return: Whatever CSVTreeBuilder.build returns for the resolved path
    """
    target = "data/c01-007-102/1/1-process.csv" if path is None else path
    builder = CSVTreeBuilder()
    if not absolute:
        target = os.path.join(os.path.dirname(assess_tests.__file__), target)
    return builder.build(target)
def _valid_tree(filename):
    """
    Check whether a tree can be built from the given file.

    :param filename: Path of the tree to load
    :return: The filename if building yields a truthy tree, otherwise None
        (also when the data is not cached or the tree was invalidated)
    """
    builder = CSVTreeBuilder()
    try:
        return filename if builder.build(filename) else None
    except (DataNotInCacheException, TreeInvalidatedException):
        return None
def perform_clustering(ctx, eta, epsilon):
    """
    Cluster the trees listed in the workflow input file and output the result.

    When ctx.obj["use_input"] is set, the first configuration's first
    signature and distance are used to index every sample from the input
    file and add it to a Clustering. The collected results have the
    following structure:

    {
        "meta": {"algorithm": <clusterer class name>, "eta": ..., "epsilon": ...},
        "clusters": [[<node key>, ...], ...],   # only if clusters exist
        "noise": [<node key>, ...],             # only if noise exists
        "scores": {<score function name>: <value or None>}
    }

    Without use_input an empty result is passed to output_results.

    :param ctx: Context whose obj dict holds use_input, configurations and
        structure
    :param eta: Passed to Clustering as core_neighbours
    :param epsilon: Passed to Clustering as cluster_distance
    """
    results = {}

    if ctx.obj.get("use_input", False):
        config = ctx.obj.get("configurations", None)[0]
        signature = config.get("signatures", [None])[0]
        distance = config.get("distances", [None])[0]
        structure = ctx.obj.get("structure", None)
        input_path = structure.input_file_path()

        builder = CSVTreeBuilder()
        clustering = Clustering(distance=distance,
                                cluster_distance=epsilon,
                                core_neighbours=eta)
        with open(input_path, "r") as handle:
            payload = json.load(handle).get("data", None)

            for sample in payload.get("samples", []):
                sample_path = sample[0]
                tree = builder.build(sample_path)
                # the index of the tree is what actually gets clustered
                clustering[sample_path] = tree.to_index(
                    signature=signature,
                    start_support=distance.supported.get(
                        ProcessStartEvent, False),
                    exit_support=distance.supported.get(
                        ProcessExitEvent, False),
                    traffic_support=distance.supported.get(
                        TrafficEvent, False))
        print("---> performed clustering with eta %s and epsilon %s" %
              (eta, epsilon))

        clusterer_name = clustering.clusterer.__class__.__name__
        meta = results.setdefault("meta", {})
        meta["algorithm"] = clusterer_name
        meta["eta"] = eta
        meta["epsilon"] = epsilon

        for cluster in clustering:
            keys = [node.key for node in cluster]  # TODO: determine CR
            results.setdefault("clusters", []).append(keys)
        for noise in clustering.clusterer.noise:
            results.setdefault("noise", []).append(noise.key)

        score_functions = (silhouette_score, calinski_harabasz_score,
                           davies_bouldin_score)
        for score in score_functions:
            # a score that cannot be computed is recorded as None
            try:
                value = score(clustering.clusterer.clusters,
                              clustering.clusterer.graph)
            except ValueError:
                value = None
            results.setdefault("scores", {})[score.__name__] = value

    version = determine_version(os.path.dirname(assess_workflows.__file__))
    output_results(ctx=ctx,
                   results=results,
                   version=version,
                   source="%s (%s)" % (__file__, "perform_clustering"))
def _data_by_tme(filename):
    """
    Group a tree file by the timestamp of the first node it yields.

    Generates the following structure:

    <tme>: [<filename>]

    :param filename: Path of the tree to load
    :return: MulticoreResult; empty when the tree cannot be built, is None
        or does not yield any node
    """
    results = MulticoreResult()
    tree_builder = CSVTreeBuilder()
    try:
        tree = tree_builder.build(filename)
    except (DataNotInCacheException, TreeInvalidatedException):
        return results
    if tree is None:
        return results
    # A tree without nodes has no timestamp to report. A bare next() would
    # leak StopIteration to the caller, so ask for a default instead.
    first_node = next(tree.node_iter(), None)
    if first_node is not None:
        results.setdefault(first_node.tme, []).append(filename)
    return results
Example #6
0
 def test_simple_clustering(self):
     """With only a distance configured, both trees are reported as noise."""
     clustering = Clustering(distance=StartExitDistance())
     # build both trees first, then convert each of them to an index
     builder = CSVTreeBuilder()
     trees = [
         builder.build(path)
         for path in (self.file_path_one, self.file_path_two)
     ]
     indexes = [
         tree.to_index(signature=ParentChildByNameTopologySignature())
         for tree in trees
     ]
     # both indexes are stored under the same key
     for index in indexes:
         clustering[1] = index
     self.assertEqual(0, len(clustering.clusterer.clusters))
     self.assertEqual(2, len(clustering.clusterer.noise))
def _data_by_uid(filename):
    """
    Group a tree file by the uids that occur on its nodes.

    Generates the following structure:

    <uid>: [<filename>]

    The file is listed once per distinct uid, no matter how many nodes
    carry that uid.

    :param filename: Path of the tree to load
    :return: MulticoreResult; empty when the tree cannot be built or is None
    """
    results = MulticoreResult()
    builder = CSVTreeBuilder()
    try:
        tree = builder.build(filename)
    except (DataNotInCacheException, TreeInvalidatedException):
        return results
    if tree is None:
        return results

    seen = set()
    for node in tree.node_iter():
        uid = node.uid
        if uid in seen:
            continue
        seen.add(uid)
        results.setdefault(uid, []).append(filename)
    return results
def _create_graph(ctx, file_path):
    """
    Load a graph from a CSV file of precalculated distance values.

    The first configuration in ctx.obj["configurations"] provides the
    signature builder, distance builder and statistics class used to turn
    each node header (a tree path) into a tree index.

    :param ctx: Context whose obj dict holds the configurations
    :param file_path: Path of the CSV file containing the distances
    :return: Graph as returned by graph_io.csv_graph_reader
    """
    config = ctx.obj.get("configurations", None)[0]
    signature_builder = config.get("signatures", [None])[0]
    distance_builder = config.get("distances", [None])[0]
    statistics_cls = config.get("statistics", [None])[0]
    tree_builder = CSVTreeBuilder()
    distance = distance_builder()

    def header_to_cache(tree_path):
        # build the tree behind a header and key its index by the path
        index = tree_builder.build(tree_path).to_index(
            signature=signature_builder(),
            start_support=distance.supported.get(ProcessStartEvent, False),
            exit_support=distance.supported.get(ProcessExitEvent, False),
            traffic_support=distance.supported.get(TrafficEvent, False),
            statistics_cls=statistics_cls)
        index.key = tree_path
        return index

    def relevant_lines(lines):
        # drop comment lines and empty lines before parsing
        for line in lines:
            if line[0] == '#' or line == '\n':
                continue
            yield line

    with open(file_path) as csv_file:
        return graph_io.csv_graph_reader(relevant_lines(csv_file),
                                         nodes_header=header_to_cache,
                                         symmetric=True)
Example #9
0
def _initialise_prototypes(prototype_paths):
    """
    Load the prototype trees for the given file paths.

    Paths ending in ".pkl" are unpickled, every other path is handed to a
    CSVTreeBuilder.

    :param prototype_paths: List of paths to prototypes
    :return: List of trees
    """
    builder = CSVTreeBuilder()

    def load(path):
        if not path.endswith(".pkl"):
            return builder.build(path)
        # NOTE: unpickling can execute arbitrary code, so only trusted
        # prototype files should be passed in here
        with open(path, "rb") as pkl_file:
            return pickle.load(pkl_file)

    return [load(path) for path in prototype_paths]
def _process_names(filename):
    """
    Collect the names of nodes whose first node.node entry contains "(".

    Nodes for which the lookup raises an IndexError are skipped.

    :param filename: Path of the tree to load
    :return: Set of node names; empty when the tree cannot be built or is None
    """
    names = set()
    builder = CSVTreeBuilder()
    try:
        tree = builder.build(filename)
    except (DataNotInCacheException, TreeInvalidatedException):
        return names
    if tree is None:
        return names

    for node in tree.node_iter():
        try:
            has_parenthesis = "(" in node.node[0]
            if has_parenthesis:
                names.add(node.name)
        except IndexError:
            continue
    return names
Example #11
0
def check_algorithms(paths=None, configurations=None):
    """
    Run every configured algorithm/signature combination on the given files.

    All files are built as prototypes. When the module-level options.skip is
    set and exactly two paths are given, only the first one becomes the
    prototype and only the second one is streamed as events.

    Generates the following structure:

    {
        "files": [<path>, ...],         # all paths as passed in
        "version": <stripped output of "git describe">,
        "results": [{
            "algorithm": <str of algorithm>,
            "signature": <str of signature>,
            "decorator": <decorator.descriptive_data()>
        }, ...]
    }

    :param paths: Paths of the CSV files to process; the list passed in is
        not modified
    :param configurations: Dicts with the keys "algorithms", "signatures"
        and "decorator"
    :return: dict as described above
    """
    # Work on a private copy: paths.pop(0) below must not change the list
    # that the caller handed in.
    paths = [] if paths is None else list(paths)
    if configurations is None:
        configurations = []
    results = {
        "files": paths[:],
        "version": subprocess.check_output(["git", "describe"]).strip(),
        "results": []
    }
    # fill general information
    tree_builder = CSVTreeBuilder()
    if options.skip and len(paths) == 2:
        # first file is the prototype only, the remaining one is streamed
        prototypes = [tree_builder.build(paths.pop(0))]
    else:
        prototypes = [tree_builder.build(path) for path in paths]
    for configuration in configurations:
        for algorithm in configuration["algorithms"]:
            for signature in configuration["signatures"]:
                signature_object = signature()
                alg = algorithm(signature=signature_object)
                alg.prototypes = prototypes
                # TODO: what if there is no decorator at all? Is it possible?
                decorator = configuration["decorator"]
                decorator.wrap_algorithm(alg)
                for index, path in enumerate(paths):
                    if options.no_upper:
                        # TODO: is it ok to ignore no_diagonal when no_upper
                        # is not given?
                        alg.start_tree(
                            maxlen=index + (0 if options.no_diagonal else 1))
                    else:
                        alg.start_tree()
                    for event in GNMCSVEventStreamer(csv_path=path):
                        alg.add_event(event=event)
                    alg.finish_tree()
                results["results"].append({
                    "algorithm": "%s" % alg,
                    "signature": "%s" % signature_object,
                    "decorator": decorator.descriptive_data()
                })
    return results
    def test_attribute_distance(self):
        """
        Replay two trees against both of them as prototypes, using a
        weight-0 StartExitDistance that claims support for all event types,
        and compare two entries of the unnormalised distance matrix.
        """
        def distance_builder(**kwargs):
            distance = StartExitDistance(weight=0, **kwargs)
            distance.supported = {
                ProcessStartEvent: True,
                ProcessExitEvent: True,
                TrafficEvent: True
            }
            return distance

        def replay(tree):
            # feed the tree to the algorithm, skipping unsupported events
            algorithm.start_tree()
            for event in tree.event_iter(supported=algorithm.supported):
                try:
                    algorithm.add_event(event)
                except EventNotSupportedException:
                    pass
            algorithm.finish_tree()

        builder = CSVTreeBuilder()
        data_dir = os.path.dirname(assess_tests.__file__)
        tree_1 = builder.build(
            os.path.join(data_dir, "data/c01-007-102/2/1129-2-process.csv"))
        tree_2 = builder.build(
            os.path.join(data_dir, "data/c01-007-102/2/1136-3-process.csv"))
        algorithm = IncrementalDistanceAlgorithm(
            signature=ParentChildByNameTopologySignature(),
            distance=distance_builder,
            cache_statistics=SplittedStatistics)
        algorithm.prototypes = [tree_1, tree_2]
        decorator = DistanceMatrixDecorator(normalized=False)
        decorator.wrap_algorithm(algorithm)

        replay(tree_1)
        replay(tree_2)

        data = decorator.data()
        print(decorator.data())
        difference = abs(data[0][0][1] - data[1][0][0])
        self.assertEqual(4, difference)
    def test_symmetry_optimisation(self):
        """A tree replayed against itself as only prototype has distance 0."""
        def start_exit_distance(**kwargs):
            return StartExitDistance(weight=0, **kwargs)

        csv_path = os.path.join(os.path.dirname(assess_tests.__file__),
                                "data/c01-007-102/2/1129-2-process.csv")
        tree = CSVTreeBuilder().build(csv_path)
        algorithm = IncrementalDistanceAlgorithm(
            signature=ParentChildByNameTopologySignature(),
            distance=start_exit_distance,
            cache_statistics=SplittedStatistics)
        algorithm.prototypes = [tree]
        decorator = DistanceMatrixDecorator(normalized=False)
        decorator.wrap_algorithm(algorithm)

        algorithm.start_tree()
        for event in tree.event_iter(supported=algorithm.supported):
            # events the algorithm does not support are skipped
            try:
                algorithm.add_event(event)
            except EventNotSupportedException:
                continue
        algorithm.finish_tree()

        matrix = decorator.data()
        self.assertEqual(0, matrix[0][0][0])
Example #14
0
def calculate_distance_matrix(paths=None, algorithm=None, signature=Signature):
    """
    Stream every file against all files as prototypes and print the result.

    The compression factors and the rows of the distance matrix are printed
    with two decimal places.

    :param paths: Paths of the CSV files; each one is used both as a
        prototype and as an event stream
    :param algorithm: Callable creating an algorithm from a signature keyword
    :param signature: Callable creating the signature for each algorithm
    :return: The distance_matrix of the DistanceMatrixDecorator
    """
    if paths is None:
        paths = []
    compression = CompressionFactorDecorator()
    matrix_decorator = DistanceMatrixDecorator(normalized=True)
    matrix_decorator.decorator = compression
    builder = CSVTreeBuilder()
    prototypes = [builder.build(path) for path in paths]
    for path in paths:
        current = algorithm(signature=signature())
        current.prototypes = prototypes
        matrix_decorator.wrap_algorithm(current)
        for event in GNMCSVEventStreamer(csv_path=path):
            current.add_event(event=event)

    factors = ", ".join("%.2f" % value
                        for value in compression.compression_factors())
    print("%s" % factors)
    print("----------------------")
    for row in matrix_decorator.distance_matrix:
        print(", ".join("%.2f" % value for value in row))
    return matrix_decorator.distance_matrix
Example #15
0
def _analyse_duration(kwargs):
    """
    Group a tree file by the duration of its root node.

    Generates the following structure:

    <duration>: [<file>, ...]

    The duration is the root's exit_tme minus its tme.

    :param kwargs: dict; the key filepath holds the path of the tree
    :return: MulticoreResult; empty when the tree cannot be built or is None
    """
    result = MulticoreResult()
    filepath = kwargs.get("filepath", None)
    builder = CSVTreeBuilder()
    try:
        tree = builder.build(filepath)
    except (DataNotInCacheException, TreeInvalidatedException):
        return result
    if tree is None:
        return result

    root = tree.root()
    result.setdefault(root.exit_tme - root.tme, []).append(filepath)
    return result
Example #16
0
def _analyse_compression(kwargs):
    """
    Collect alphabet, identity and fanout figures for a single tree.

    Generates the following structure:

    <number of nodes>: {                    # binning node count
        "identity_count": {
            <repr of signature>: [<int>, ...]
        },
        "file": [<string>, ...],
        "alphabet_count": [<int>, ...],
        "node_count": [<int>, ...],         # real node counts
        "fanout": {
            "min": [<int>, ...],
            "max": [<int>, ...],
            "mean": [<float>, ...],
            "std": [<float>, ...],
            "full": [[<int>, ...], ...]
        }
    }

    Only nodes that have children contribute to the fanout. A tree height is
    not written yet, see the TODO at the end of the function.

    NOTE: for a tree in which no node has children the fanout list is empty
    and min() raises a ValueError.

    :param kwargs: dict with the keys filepath (path of the tree),
        node_count (key used for binning) and signature_builders (callables
        creating the signatures to count identities for)
    :return: MulticoreResult; empty when the tree cannot be built or is None
    """
    filepath = kwargs.get("filepath", None)
    node_count = kwargs.get("node_count", None)
    signature_builders = kwargs.get("signature_builders", None)
    result = MulticoreResult()
    builder = CSVTreeBuilder()
    try:
        tree = builder.build(filepath)
    except (DataNotInCacheException, TreeInvalidatedException):
        return result
    if tree is None:
        return result

    # generic figures that do not depend on a signature
    alphabet = set()
    fanout = []
    for node in tree.node_iter():
        children = node.children_list()
        if len(children) > 0:
            fanout.append(len(children))
        alphabet.add(node.name)

    for signature_builder in signature_builders:
        signature = signature_builder()
        # one set of distinct identities per signature in the ensemble
        identity_sets = [set() for _ in range(signature.count)]
        for node in tree.node_iter():
            identities = signature.get_signature(node, node.parent())
            for position, identity in enumerate(identities):
                identity_sets[position].add(identity)
        # {node_count: "identity_count": {signature_1: [value, ...], signature_2: [value, ...]}}
        identity_counts = result.setdefault(node_count, {}).setdefault(
            "identity_count", {})
        for position, single_signature in enumerate(signature._signatures):
            identity_counts.setdefault(repr(single_signature), []).append(
                len(identity_sets[position]))

    entry = result.setdefault(node_count, {})
    entry.setdefault("file", []).append(filepath)
    entry.setdefault("alphabet_count", []).append(len(alphabet))
    entry.setdefault("node_count", []).append(tree.node_count())

    fanout_entry = entry.setdefault("fanout", {})
    fanout_entry.setdefault("min", []).append(min(fanout))
    fanout_entry.setdefault("max", []).append(max(fanout))
    mean_fanout = sum(fanout) / float(len(fanout))
    fanout_entry.setdefault("mean", []).append(mean_fanout)
    fanout_entry.setdefault("std", []).append(standard_deviation(fanout))
    fanout_entry.setdefault("full", []).append(fanout)
    # TODO: not supported by tree yet
    # result.setdefault(node_count, {}).setdefault("tree_height", []).append(tree.depth)
    return result
Example #17
0
def _analyse_diamonds(kwargs):
    """
    Count the diamonds of a single tree for every given ensemble signature.

    The ensemble signature is expected to provide a signature of length
    n - 1 at position 0 and one of length n at position 1. Nodes are grouped
    by their position 0 signature; for each group the distinct position 1
    signatures are collected. A group with more than one position 1
    signature is a diamond.

    Fields written to the output:

    * raw: levels and node counts of the individual diamonds of a tree
    * node_counts: number of nodes visited in the tree
    * identities: number of distinct position 0 signatures in the tree
    * diamonds: number of diamonds in the tree (independent from level)
    * diamond_nodes: number of nodes that make up the diamonds
    * files: files that were used

    The fields are indexed by the _height of the first signature of the
    ensemble, i.e. the p value:

    {
        node_count: {
            p_value: {
                "raw": {
                    "levels": [[diamond level, ...], ...],
                    "nodes": [[diamond nodes, ...], ...]
                }
                "node_counts": [node_count, ...],
                "identities": [identity_count, ...],
                "diamonds": [diamond_count, ...],
                "diamond_nodes": [diamond_node_count, ...],
                "files": [file_path, ...]
            }
        }
    }

    :param kwargs: dict containing keys node_count, filepath and signature_builders
    :return: MulticoreResult; empty when the tree cannot be built or is None
    """
    node_count = kwargs.get("node_count", None)
    filepath = kwargs.get("filepath", None)
    signature_builders = kwargs.get("signature_builders", None)
    result = MulticoreResult()
    builder = CSVTreeBuilder()
    try:
        tree = builder.build(filepath)
    except (DataNotInCacheException, TreeInvalidatedException):
        return result
    if tree is None:
        return result

    for signature_builder in signature_builders:
        signature = signature_builder()
        groups = {}
        visited = 0
        for node in tree.node_iter():
            visited += 1
            identities = signature.get_signature(node, node.parent())
            group = groups.setdefault(identities[0], {})
            group.setdefault("nodes", set()).add(node)
            group.setdefault("signatures", set()).add(identities[1])

        # a diamond is a group with more than one position 1 signature
        diamond_levels = []
        diamond_sizes = []
        for group in groups.values():
            variants = len(group.get("signatures", set()))
            if variants > 1:
                diamond_sizes.append(len(group.get("nodes", set())))
                diamond_levels.append(variants - 1)

        height = signature._signatures[0]._height
        entry = result.setdefault(node_count, {}).setdefault(height, {})
        raw = entry.setdefault("raw", {
            "levels": [],
            "nodes": []
        })
        raw["levels"].append(diamond_levels)
        raw["nodes"].append(diamond_sizes)
        entry.setdefault("node_counts", []).append(visited)
        entry.setdefault("identities", []).append(len(groups))
        entry.setdefault("diamonds", []).append(len(diamond_sizes))
        entry.setdefault("diamond_nodes", []).append(sum(diamond_sizes))
        entry.setdefault("files", []).append(filepath)
    return result
Example #18
0
def _analyse_diamond_perturbation(kwargs):
    """
    Collect figures on the diamonds of a single tree and their subtrees.

    Nodes are grouped by the signature at position 0 of the ensemble; a group
    with more than one distinct signature at position 1 is a diamond. For
    every diamond all of its nodes and their descendants are collected.

    Generates the following structure, where p_count is the _height of the
    first signature of the ensemble:

    {
        p_count: {
            diamond_count: {
                "profile_distortions": [],              # sum of node count * level
                "profile_distortions_signatures": [],   # sum of signature count * level
                "distance_errors": []                   # sum of node counts
                "distance_errors_signatures": []        # sum of signature counts
                "signature_counts": [],                 # nr of signatures in tree
                "node_counts": [],                      # nr of nodes in tree
                "raw": [{
                    diamond_key: {
                        "level": diamond_level,
                        "nested": nesting_level,
                        "nodes": node_count,
                        "signatures": signature_count
                    }, ...
                }, ...]
            }
        }
    }

    :param kwargs: dict with keys filepath and signature_builders
    :return: MulticoreResult; empty when the tree cannot be built or is None
    """
    filepath = kwargs.get("filepath", None)
    signature_builders = kwargs.get("signature_builders", None)
    tree_builder = CSVTreeBuilder()
    perturbation_results = MulticoreResult()
    try:
        tree = tree_builder.build(filepath)
    except (DataNotInCacheException, TreeInvalidatedException):
        return perturbation_results
    if tree is None:
        return perturbation_results

    for signature_builder in signature_builders:
        candidates = {}
        node_signatures = set()
        signature = signature_builder()
        node_count = 0
        for node in tree.node_iter():
            node_count += 1
            current_signature = signature.get_signature(node, node.parent())
            node_signatures.add(current_signature[0])
            candidate = candidates.setdefault(current_signature[0], {})
            candidate.setdefault("signatures",
                                 set()).add(current_signature[1])
            candidate.setdefault("nodes", set()).add(node)
        # only groups with more than one position 1 signature are diamonds
        diamonds = {
            key: diamond
            for key, diamond in candidates.items()
            if len(diamond.get("signatures", set())) > 1
        }
        diamond_perturbation = {}
        for diamond_key, diamond in diamonds.items():
            # found a diamond, that represents several diamond nodes
            entry = diamond_perturbation.setdefault(
                diamond_key, {
                    "nested": 0,
                    "nodes": set(),
                    "signatures": set()
                })
            entry["level"] = max(
                0,
                len(diamond.get("signatures", set())) - 1)
            for node in diamond.get("nodes"):
                to_check = set(node.children_list())
                entry["nodes"].add(node)
                # pass the parent node itself, like every other
                # get_signature call, not the bound parent method
                entry["signatures"].add(
                    signature.get_signature(node, node.parent())[0])
                # walk the complete subtree below the diamond node
                while to_check:
                    child = to_check.pop()
                    entry["nodes"].add(child)
                    child_signatures = signature.get_signature(
                        child, child.parent())
                    entry["signatures"].add(child_signatures[0])
                    to_check.update(child.children_list())
                    if child_signatures[0] in diamonds:
                        # diamond is a nested diamond, so initialise it here
                        # NOTE(review): this replaces an entry that may
                        # already have been filled - confirm this is intended
                        diamond_perturbation[child_signatures[0]] = {
                            "level": 1,
                            "nested": entry["nested"] + 1,
                            "nodes": set(),
                            "signatures": set()
                        }
        diamond_count = len(diamond_perturbation)
        perturbation_result = perturbation_results.setdefault(
            signature._signatures[0]._height,
            {}).setdefault(diamond_count, {})
        perturbation_result.setdefault("profile_distortions", []).append(
            sum(
                len(diamond.get("nodes", [])) * diamond["level"]
                for diamond in diamond_perturbation.values()))
        perturbation_result.setdefault(
            "profile_distortions_signatures", []).append(
                sum(
                    len(diamond.get("signatures", [])) * diamond["level"]
                    for diamond in diamond_perturbation.values()))
        perturbation_result.setdefault("distance_errors", []).append(
            sum(
                len(diamond.get("nodes", []))
                for diamond in diamond_perturbation.values()))
        perturbation_result.setdefault(
            "distance_errors_signatures", []).append(
                sum(
                    len(diamond.get("signatures", []))
                    for diamond in diamond_perturbation.values()))
        perturbation_result.setdefault("signature_counts",
                                       []).append(len(node_signatures))
        perturbation_result.setdefault("node_counts", []).append(node_count)
        perturbation_result.setdefault("raw", []).append({
            key: {
                "level": value["level"],
                "nested": value["nested"],
                "nodes": len(value["nodes"]),
                "signatures": len(value["signatures"])
            }
            for key, value in diamond_perturbation.items()
        })
    return perturbation_results
Example #19
0
def _full_statistics(kwargs):
    """
    Collect node, attribute, fanout and depth figures for a single tree.

    Generates the following structure:

    <filepath>: {
        "node_count": <int>,
        "complete_node_count": <int>,           # nodes plus attributes
        "nodes_with_attribute_count": <int>,    # nodes that have traffic
        "alphabet_count": <int>,                # distinct node names
        "duration": <root exit_tme - root tme>,
        "fanout": [<int>, ...],                 # per node with children
        "complete_fanout": [<int>, ...],        # fanout plus attributes
        "depth": [<int>, ...],                  # per leaf
        "complete_depth": [<int>, ...],         # leaves and their attributes
        "attribute_event_count": [<int>, ...]   # per node with traffic
    }

    An attribute is a distinct "<conn_cat>_in_rate" or "<conn_cat>_out_rate"
    with a rate above zero among the traffic of a node.

    :param kwargs: dict; the key filepath holds the path of the tree
    :return: MulticoreResult; empty when the tree cannot be built or is None
    """
    filepath = kwargs.get("filepath", None)
    result = MulticoreResult()
    builder = CSVTreeBuilder()
    try:
        tree = builder.build(filepath)
    except (DataNotInCacheException, TreeInvalidatedException):
        return result
    if tree is None:
        return result

    attribute_total = 0
    nodes_with_traffic = 0
    alphabet = set()
    fanout = []
    complete_fanout = []
    depth = []
    complete_depth = []
    attribute_events = []
    for node in tree.node_iter():
        attribute_count = 0
        if node.traffic:
            positive_rates = 0
            attributes = set()
            for traffic in node.traffic:
                if traffic.in_rate > 0:
                    positive_rates += 1
                    attributes.add("%s_in_rate" % traffic.conn_cat)
                if traffic.out_rate > 0:
                    positive_rates += 1
                    attributes.add("%s_out_rate" % traffic.conn_cat)
            attribute_count = len(attributes)
            attribute_total += attribute_count
            attribute_events.append(positive_rates)
            nodes_with_traffic += 1

        child_count = len(node.children_list())
        if child_count > 0:
            # inner node: attributes count as additional children
            fanout.append(child_count)
            complete_fanout.append(child_count + attribute_count)
        else:
            # leaf: every attribute sits one level below the leaf itself
            leaf_depth = node.depth()
            depth.append(leaf_depth)
            if attribute_count > 0:
                complete_depth.extend([leaf_depth + 1] * attribute_count)
            else:
                complete_depth.append(leaf_depth)
        alphabet.add(node.name)

    total_nodes = tree.node_count()
    root = tree.root()
    summary = result.setdefault(filepath, {})
    summary["node_count"] = total_nodes
    summary["complete_node_count"] = total_nodes + attribute_total
    summary["nodes_with_attribute_count"] = nodes_with_traffic
    summary["alphabet_count"] = len(alphabet)
    summary["duration"] = root.exit_tme - root.tme
    summary["fanout"] = fanout
    summary["complete_fanout"] = complete_fanout
    summary["depth"] = depth
    summary["complete_depth"] = complete_depth
    summary["attribute_event_count"] = attribute_events
    return result
def _generate_perturbated_tree(kwargs):
    """
    Build the tree stored at ``filepath`` and generate perturbated variants.

    For every entry in ``probabilities`` a TED generator is configured and
    applied ``repeat`` times; each generated tree is collected under that
    probability. All settings are read from the ``kwargs`` dictionary.

    :param kwargs: Dictionary holding the settings listed below
    :param filepath: Path of the CSV file the tree is built from
    :param probabilities: List of probabilities to generate trees for
    :param repeat: Number of trees generated per probability (default 1)
    :param insert_probability: Probability to insert an item (default 0)
    :param delete_probability: Probability to delete an item (default 0)
    :param change_probability: Probability to change an item (default 0)
    :param move_probability: Probability to move an item (default 0)
    :param leaf_nodes_only: Only perturbate leaf nodes?
    :param internal_nodes_only: Only perturbate internal nodes? Takes
        precedence over ``leaf_nodes_only``
    :param attribute_nodes_only: Only delete attribute nodes? If set, the
        insert/delete/change/move probabilities, the node filters and
        ``cost`` are not used
    :param cost: Whether to attach the edit distance cost models
        (default True)
    :return: MulticoreResult mapping ``filepath`` to a dict with the keys
        ``tree`` and ``perturbated_tree`` (probability -> list of trees);
        left empty if no tree could be built
    """
    result = MulticoreResult()
    filepath = kwargs.get("filepath", None)
    probabilities = kwargs.get("probabilities", [])
    repeat = kwargs.get("repeat", 1)
    insert_probability = kwargs.get("insert_probability", 0)
    delete_probability = kwargs.get("delete_probability", 0)
    change_probability = kwargs.get("change_probability", 0)
    move_probability = kwargs.get("move_probability", 0)
    leaf_nodes_only = kwargs.get("leaf_nodes_only", False)
    internal_nodes_only = kwargs.get("internal_nodes_only", False)
    attribute_nodes_only = kwargs.get("attribute_nodes_only", False)
    cost = kwargs.get("cost", True)

    builder = CSVTreeBuilder()
    tree = builder.build(filepath)
    if tree is None:
        # nothing to perturbate
        return result

    file_result = result.setdefault(filepath, {})
    file_result["tree"] = tree
    perturbations = file_result.setdefault("perturbated_tree", {})

    for probability in probabilities:
        if attribute_nodes_only:
            # here the probability is handed to the delete operation itself,
            # the generator always applies the operation
            operations = RandomOperation(
                delete_probability=1,
                delete_operation=DeleteAttributeTreeEditOperation(
                    probability=probability))
            generator = TEDGenerator(
                costs=[],
                operation_generator=operations,
                probability=1,
                skip_node=skip_all_but_attribute_nodes)
        else:
            if cost:
                cost_models = [
                    TreeEditDistanceCost(),
                    FanoutWeightedTreeEditDistanceCost(),
                    SubtreeWeightedTreeEditDistanceCost(),
                    SubtreeHeightWeightedTreeEditDistanceCost(),
                    SubtreeWeightedTreeEditDistanceCostWithMove()
                ]
            else:
                cost_models = []
            operations = RandomOperation(
                insert_probability=insert_probability,
                delete_probability=delete_probability,
                edit_probability=change_probability,
                move_probability=move_probability)
            # select which kind of nodes is excluded from perturbation
            if internal_nodes_only:
                skip_function = skip_leaf
            elif leaf_nodes_only:
                skip_function = skip_inner_node
            else:
                skip_function = skip_no_node
            generator = TEDGenerator(
                costs=cost_models,
                operation_generator=operations,
                probability=probability,
                skip_node=skip_function)

        for _ in range(repeat):
            perturbations.setdefault(probability, []).append(
                generator.generate(tree))
            # rebuild the tree from file before the next run
            # NOTE(review): presumably because generate() modifies the tree
            # it is given - confirm against TEDGenerator
            tree = builder.build(filepath)
    return result
Example #21
0
def check_algorithms(tree_paths=None,
                     prototype_paths=None,
                     cluster_representatives_paths=None,
                     configurations=None):
    """
    Run all configured algorithm/signature combinations on the given trees.

    Prototypes are either read from the first cluster representatives file
    (JSON with a ``data`` entry) or, if no such file is given, built from
    ``prototype_paths``. If ``options.pcount`` is greater than 1 the work is
    handed to ``do_multicore``; otherwise every combination is processed
    sequentially by streaming the events of each tree into the algorithm.

    :param tree_paths: Paths of the CSV trees to process
    :param prototype_paths: Paths of the CSV trees used as prototypes
    :param cluster_representatives_paths: Paths of cluster representative
        files; only the first one is read
    :param configurations: List of dicts with the keys ``algorithms``,
        ``signatures``, ``decorator`` and optionally ``event_streamer``
        (defaults to ``[GNMCSVEventStreamer]``)
    :return: dict with the keys ``files``, ``prototypes``, ``version``
        (output of ``git describe``) and ``results`` (one entry per
        processed combination)
    :raises subprocess.CalledProcessError: if ``git describe`` fails
    """
    if tree_paths is None:
        tree_paths = []
    if prototype_paths is None:
        prototype_paths = []
    if cluster_representatives_paths is None:
        cluster_representatives_paths = []
    if configurations is None:
        configurations = []

    def _summarise(finished_decorator):
        # result entry for a decorator returned by the multicore workers
        return {
            "algorithm": "%s" % finished_decorator.algorithm,
            "signature": "%s" % finished_decorator.algorithm.signature,
            "decorator": finished_decorator.descriptive_data()
        }

    results = {
        "files": tree_paths[:],
        "prototypes": prototype_paths[:],
        "version": subprocess.check_output(["git", "describe"]).strip(),
        "results": []
    }
    tree_builder = CSVTreeBuilder()
    prototype_signature = None
    if cluster_representatives_paths:
        with open(cluster_representatives_paths[0], "r") as json_file:
            cluster_representatives = json.load(json_file)
        prototype_signature = \
            PrototypeSignatureCache.from_cluster_representatives(
                cluster_representatives["data"])
        # the cluster names act as prototypes
        prototypes = list(cluster_representatives["data"])
    else:
        prototypes = [tree_builder.build(path) for path in prototype_paths]

    if options.pcount > 1:
        for configuration in configurations:
            event_streamers = configuration.get("event_streamer",
                                                [GNMCSVEventStreamer])
            data = []
            for algorithm in configuration["algorithms"]:
                for signature in configuration["signatures"]:
                    for path in tree_paths:
                        data.append({
                            "algorithm":
                            algorithm,  # TODO: CR contains algorithm
                            "signature":
                            signature,  # TODO: CR contains signature
                            "decorator": configuration["decorator"],
                            "tree": path,
                            "prototypes": prototypes,
                            "prototype_signature": prototype_signature,
                            "event_streamers": event_streamers
                        })
            result_list = do_multicore(count=options.pcount,
                                       target=check_single_algorithm,
                                       data=data)
            # consecutive results of the same algorithm/signature pair are
            # merged into a single decorator before being reported
            decorator = None
            for result in result_list:
                if decorator is None:
                    decorator = result
                elif repr(decorator.algorithm) == repr(result.algorithm) and \
                        repr(decorator.algorithm.signature) == repr(
                            result.algorithm.signature):
                    decorator.update(result)
                else:
                    # we identified a new decorator, so save the last one
                    results["results"].append(_summarise(decorator))
                    decorator = result
            if decorator is not None:
                results["results"].append(_summarise(decorator))
    else:
        for configuration in configurations:
            event_streamers = configuration.get("event_streamer",
                                                [GNMCSVEventStreamer])
            for event_streamer in event_streamers:
                for algorithm in configuration["algorithms"]:
                    for signature in configuration["signatures"]:
                        signature_object = signature()
                        alg = algorithm(signature=signature_object)
                        if prototype_signature is not None:
                            alg.cluster_representatives(
                                signature_prototypes=prototype_signature,
                                prototypes=prototypes)
                        else:
                            alg.prototypes = prototypes
                        decorator = configuration["decorator"]()
                        decorator.wrap_algorithm(alg)
                        streamer = None
                        for path in tree_paths:
                            alg.start_tree()
                            streamer = event_streamer(csv_path=path)
                            for event in streamer:
                                alg.add_event(event=event)
                            alg.finish_tree()
                        if streamer is None:
                            # no tree was processed; create a streamer just
                            # to obtain its description
                            streamer = event_streamer(csv_path=None)
                        results["results"].append({
                            "algorithm": "%s" % alg,
                            "signature": "%s" % signature_object,
                            # always report the string form; the conditional
                            # previously bound looser than "%" and leaked a
                            # streamer object when there were no trees
                            "event_streamer": "%s" % streamer,
                            "decorator": decorator.descriptive_data()
                        })

    return results