def _tree_statistics(filename):
    """
    Collect per-timestamp event statistics for a single tree file.

    Output format:

    {
        <filename>: {
            "process": {<tme>: <count>},        # process start/exit events per tme
            "traffic": {<tme>: <count>},        # traffic events per tme
            "traffic_count": {<tme>: <count>}   # summed in_cnt + out_cnt per tme
        }
    }

    :param filename: Path of the CSV file to build the tree from
    :return: MulticoreResult keyed by filename (empty if the tree cannot be built)
    """
    result = MulticoreResult()
    tree_builder = CSVTreeBuilder()
    try:
        tree = tree_builder.build(filename)
    except (DataNotInCacheException, TreeInvalidatedException):
        # consistent with the sibling helpers: silently skip unbuildable trees
        tree = None
    if tree is not None:
        for event in tree.event_iter():
            # lazily created, so files without events produce no entry
            file_dict = result.setdefault(filename, {
                "process": {},
                "traffic": {},
                "traffic_count": {}
            })
            if isinstance(event, (ProcessStartEvent, ProcessExitEvent)):
                file_dict["process"][event.tme] = file_dict["process"].get(
                    event.tme, 0) + 1
            elif isinstance(event, TrafficEvent):
                file_dict["traffic"][event.tme] = file_dict["traffic"].get(
                    event.tme, 0) + 1
                file_dict["traffic_count"][
                    event.tme] = file_dict["traffic_count"].get(
                        event.tme, 0) + (event.in_cnt + event.out_cnt)
    return result
def _data_by_tme(filename):
    """
    Index a tree file by the tme of its first node.

    Output format: {<tme>: [<filename>]}

    :param filename: Path of the CSV file to build the tree from
    :return: MulticoreResult keyed by the first node's tme
    """
    results = MulticoreResult()
    tree_builder = CSVTreeBuilder()
    try:
        tree = tree_builder.build(filename)
    except (DataNotInCacheException, TreeInvalidatedException):
        pass
    else:
        if tree is not None:
            # next(..., None) avoids StopIteration for a tree without nodes
            node = next(tree.node_iter(), None)
            if node is not None:
                results.setdefault(node.tme, []).append(filename)
    return results
def _data_by_uid(filename):
    """
    Index a tree file by the distinct uids of its nodes.

    Output format: {<uid>: [<filename>]}

    :param filename: Path of the CSV file to build the tree from
    :return: MulticoreResult keyed by uid
    """
    results = MulticoreResult()
    tree_builder = CSVTreeBuilder()
    try:
        tree = tree_builder.build(filename)
    except (DataNotInCacheException, TreeInvalidatedException):
        pass
    else:
        if tree is not None:
            seen_uids = set()
            for node in tree.node_iter():
                uid = node.uid
                if uid in seen_uids:
                    continue
                # record the file once per distinct uid
                seen_uids.add(uid)
                results.setdefault(uid, []).append(filename)
    return results
Example #4
0
def analyse_diamonds(ctx, pcount):
    """
    Method returns output file that follows the following format:

    {
        node_count: {
            p_value: {
                "raw": [[diamond levels], ...],
                "identities": [identity_count, ...],
                "diamonds": [diamond_count, ...],
                "files": [file_path, ...]
            }
        }
    }

    :param ctx:
    :param pcount:
    :return:
    """
    results = MulticoreResult()
    ctx.obj["json"] = True
    if ctx.obj.get("use_input", False):
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()
        signature_builders = ctx.obj.get("configurations",
                                         [{}])[0].get("signatures", [])

        with open(file_path, "r") as input_file:
            analysis_files = json.load(input_file).get("data", None)
            # build the argument list once for both dispatch paths
            # (same pattern as full_statistics / analyse_duration)
            data = [{
                "node_count": node_count,
                "filepath": tree_path[0],
                "signature_builders": signature_builders
            } for node_count, tree_paths in analysis_files.items()
                    for tree_path in tree_paths]
            if pcount > 1:
                multicore_results = do_multicore(count=pcount,
                                                 target=_analyse_diamonds,
                                                 data=data)
                for result in multicore_results:
                    results += result
            else:
                for elem in data:
                    results += _analyse_diamonds(elem)

    output_results(ctx=ctx,
                   results=results,
                   version=determine_version(
                       os.path.dirname(assess_workflows.__file__)),
                   source="%s (%s)" % (__file__, "analyse_diamonds"))
Example #5
0
def full_statistics(ctx, pcount):
    """
    Method prepares full statistics about a dataset. The output is as follows:

    {
        <filename>: {
            "node_count": <int>,  # number of nodes in tree
            "complete_node_count": <int>,  # number of nodes in tree w attributes
            "nodes_with_attribute_count": <int>,  # number of nodes that contain attributes
            "alphabet_count": <int>,  # alphabet count
            "duration": <int>,  # duration of tree
            "fanout": [<int>, ...],  # fanout of nodes
            "complete_fanout": [<int>, ...]  # fanout of nodes w attributes
            "depth": [<int>, ...],  # depth in tree for leaves
            "complete_depth": [<int>, ...],  # depth in tree for leaves w attributes
            "attribute_event_count": [<int>, ...]  # events for attributes per node
        }
    }

    :param ctx:
    :param pcount:
    :return:
    """
    results = MulticoreResult()
    ctx.obj["json"] = True
    if ctx.obj.get("use_input", False):
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()

        with open(file_path, "r") as input_file:
            analysis_files = json.load(input_file).get("data", None)
            # flatten the input into one job description per file path;
            # tree_path entries may be plain paths or lists of paths
            data = [
                {"filepath": path}
                for _, tree_paths in analysis_files.items()
                for tree_path in tree_paths
                for path in (tree_path if isinstance(tree_path, list)
                             else [tree_path])
            ]
            if pcount > 1:
                for partial in do_multicore(count=pcount,
                                            target=_full_statistics,
                                            data=data):
                    results += partial
            else:
                for elem in data:
                    results += _full_statistics(elem)
    output_results(ctx=ctx,
                   results=results,
                   version=determine_version(
                       os.path.dirname(assess_workflows.__file__)),
                   source="%s (%s)" % (__file__, "full_statistics"))
Example #6
0
def _analyse_duration(kwargs):
    """
    Generates the following structure:

    <duration>: [<file>, ...]

    :param filepath: Path for tree to consider
    :param kwargs:
    :return:
    """
    outcome = MulticoreResult()
    path = kwargs.get("filepath", None)
    builder = CSVTreeBuilder()
    try:
        built_tree = builder.build(path)
    except (DataNotInCacheException, TreeInvalidatedException):
        # unbuildable tree: return an empty result
        return outcome
    if built_tree is None:
        return outcome
    root_node = built_tree.root()
    # duration is the span between root start and exit
    duration = root_node.exit_tme - root_node.tme
    outcome.setdefault(duration, []).append(path)
    return outcome
def index_tree_statistics(ctx, paths, pcount):
    """
    Aggregate per-file tree statistics (see _tree_statistics) for every
    relevant file found below the given paths.

    :param ctx:
    :param paths: Iterable of base paths to scan for relevant files
    :param pcount: Number of processes to use
    :return:
    """
    results = MulticoreResult()
    filenames = []
    for current_path in paths:
        filenames.extend(_relevant_files_for_context(ctx, current_path))
    if pcount > 1:
        for partial in do_multicore(count=pcount,
                                    target=_tree_statistics,
                                    data=filenames):
            results += partial
    else:
        for name in filenames:
            results += _tree_statistics(name)

    output_results(ctx=ctx,
                   results=results,
                   version=determine_version(
                       os.path.dirname(assess_workflows.__file__)),
                   source="%s (%s)" % (__file__, "index_tree_statistics"))
Example #8
0
def analyse_diamond_perturbations(ctx, pcount):
    """
    Analyse diamond perturbations (see _analyse_diamond_perturbation for the
    output format) for all trees referenced by the input file.

    :param ctx:
    :param pcount:
    :return:
    """
    results = MulticoreResult()
    ctx.obj["json"] = True
    if ctx.obj.get("use_input", False):
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()
        signature_builders = ctx.obj.get("configurations",
                                         [{}])[0].get("signatures", [])

        with open(file_path, "r") as input_file:
            analysis_files = json.load(input_file).get("data", None)
            # build the argument list once for both dispatch paths
            data = [{
                "filepath": path[0],
                "signature_builders": signature_builders
            } for paths in analysis_files.values() for path in paths]
            if pcount > 1:
                multicore_results = do_multicore(
                    count=pcount,
                    target=_analyse_diamond_perturbation,
                    data=data)
                for result in multicore_results:
                    results += result
            else:
                for elem in data:
                    results += _analyse_diamond_perturbation(elem)

    output_results(
        ctx=ctx,
        results=results,
        version=determine_version(os.path.dirname(assess_workflows.__file__)),
        # label the output with this command's name (was mislabelled with
        # the helper's name "analyse_diamond_perturbation")
        source="%s (%s)" % (__file__, "analyse_diamond_perturbations"))
Example #9
0
def analyse_duration(ctx, pcount):
    """
    Method prepares duration data for further analysis.

    :param ctx:
    :param pcount:
    :return:
    """
    results = MulticoreResult()
    ctx.obj["json"] = True
    if ctx.obj.get("use_input", False):
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()

        with open(file_path, "r") as input_file:
            analysis_files = json.load(input_file).get("data", None)
            # flatten the input into one job description per file path;
            # tree_path entries may be plain paths or lists of paths
            data = [
                {"filepath": path}
                for _, tree_paths in analysis_files.items()
                for tree_path in tree_paths
                for path in (tree_path if isinstance(tree_path, list)
                             else [tree_path])
            ]
            if pcount > 1:
                for partial in do_multicore(count=pcount,
                                            target=_analyse_duration,
                                            data=data):
                    results += partial
            else:
                for elem in data:
                    results += _analyse_duration(elem)
    output_results(ctx=ctx,
                   results=results,
                   version=determine_version(
                       os.path.dirname(assess_workflows.__file__)),
                   source="%s (%s)" % (__file__, "analyse_duration"))
Example #10
0
def _analyse_compression(kwargs):
    """
    Generates the following structure:

    <number of nodes>: {                    # binning node count
        "file": [<string>, ...],
        "node_count": [<int>, ...],         # real node counts
        "alphabet_count": [<int>, ...],
        "tree_height": [<int>, ...],
        "identity_count": {
            <Signature>: [<int>, ...]
        },
        "fanout": {
            "min": [<int>, ...],
            "max": [<int>, ...],
            "mean": [<float>, ...],
            "std": [<float>, ...],
            "full": [[<int>, ...], ...]
        }
    }

    :param filepath: Path for tree to consider
    :param node_count: Number of nodes within tree
    :param signature_builders: Signature builders to consider for generation of identities
    :param kwargs:
    :return:
    """
    filepath = kwargs.get("filepath", None)
    node_count = kwargs.get("node_count", None)
    signature_builders = kwargs.get("signature_builders", None)
    result = MulticoreResult()
    tree_builder = CSVTreeBuilder()
    try:
        tree = tree_builder.build(filepath)
    except (DataNotInCacheException, TreeInvalidatedException):
        pass
    else:
        if tree is not None:
            alphabet = set()
            fanout = []
            # prepare generic data first
            for node in tree.node_iter():
                if len(node.children_list()) > 0:
                    fanout.append(len(node.children_list()))
                alphabet.add(node.name)
            for signature_builder in signature_builders:
                signature = signature_builder()
                compression = [set() for _ in range(signature.count)]
                for node in tree.node_iter():
                    identities = signature.get_signature(node, node.parent())
                    for index, identity in enumerate(identities):
                        compression[index].add(identity)
                # write results
                # {node_count: "identity_count": {signature_1: [value, ...], signature_2: [value, ...]}}
                current = result.setdefault(node_count, {}).setdefault(
                    "identity_count", {})
                for index, single_signature in enumerate(
                        signature._signatures):
                    current.setdefault(repr(single_signature),
                                       []).append(len(compression[index]))
            result.setdefault(node_count, {}).setdefault("file",
                                                         []).append(filepath)
            result.setdefault(node_count,
                              {}).setdefault("alphabet_count",
                                             []).append(len(alphabet))
            result.setdefault(node_count,
                              {}).setdefault("node_count",
                                             []).append(tree.node_count())
            # a tree consisting of leaves only yields an empty fanout list;
            # min/max/mean would raise, so only record stats when available
            if fanout:
                current_fanout = result.setdefault(node_count,
                                                   {}).setdefault("fanout", {})
                current_fanout.setdefault("min", []).append(min(fanout))
                current_fanout.setdefault("max", []).append(max(fanout))
                current_fanout.setdefault("mean", []).append(
                    sum(fanout) / float(len(fanout)))
                current_fanout.setdefault("std",
                                          []).append(standard_deviation(fanout))
                current_fanout.setdefault("full", []).append(fanout)
            # TODO: not supported by tree yet
            # result.setdefault(node_count, {}).setdefault("tree_height", []).append(tree.depth)
    return result
Example #11
0
def _analyse_diamonds(kwargs):
    """
    Method expects an ensemble signature in configuration were signature at position 0 has length
    n - 1 whereas signature at position 1 has length n (criterium for diamonds). It then builds
    a dictionary for given signatures from position 0 and builds a collection from signatures at
    position 1. The number of signatures that are associated to the different keys is then relevant
    to determine the diamonds. When more than one signature is assigned, then we got a diamond.

    Method creates different fields in output file:

    * raw: contains the levels of the diamonds within a given tree
    * identities: number of identities for the whole tree
    * diamonds: number of diamonds within the tree (independent from level)
    * diamond_nodes: number of nodes that make up the diamonds
    * files: files that were used

    In addition, all of these fields are associated to a given signature_builder. It defines the
    actual height that is analysed. Meaning, the p value that is used to index the output file.

    {
        node_count: {
            p_value: {
                "raw": {
                    "levels": [[diamond level, ...], ...],
                    "nodes": [[diamond nodes, ...], ...]
                }
                "identities": [identity_count, ...],
                "diamonds": [diamond_count, ...],
                "diamond_nodes": [diamond_node_count, ...],
                "node_counts": [node_count, ...],
                "files": [file_path, ...]
            }
        }
    }

    :param kwargs: dict containing keys node_count, filepath and signature_builders
    :return:
    """
    node_count = kwargs.get("node_count", None)
    filepath = kwargs.get("filepath", None)
    signature_builders = kwargs.get("signature_builders", None)
    result = MulticoreResult()
    tree_builder = CSVTreeBuilder()
    try:
        tree = tree_builder.build(filepath)
    except (DataNotInCacheException, TreeInvalidatedException):
        # skip trees that cannot be built; an empty result is returned
        pass
    else:
        if tree is not None:
            for signature_builder in signature_builders:
                signature = signature_builder()
                # maps signature[0] -> {"nodes": set, "signatures": set of signature[1]}
                node_dict = {}
                current_node_count = 0
                for node in tree.node_iter():
                    current_node_count += 1
                    current_signatures = signature.get_signature(
                        node, node.parent())
                    current_node = node_dict.setdefault(
                        current_signatures[0], {})
                    current_node.setdefault("nodes", set()).add(node)
                    current_node.setdefault("signatures",
                                            set()).add(current_signatures[1])
                # a key with more than one position-1 signature is a diamond;
                # its level is the number of extra signatures beyond the first
                diamonds = {
                    signature: {
                        "nodes":
                        len(signature_values.get("nodes", set())),
                        "levels":
                        len(signature_values.get("signatures", set())) - 1
                    }
                    for signature, signature_values in node_dict.items()
                    if len(signature_values.get("signatures", set())) > 1
                }
                # index results by node_count and the p value (signature height)
                current_result = result.setdefault(node_count, {}).setdefault(
                    signature._signatures[0]._height, {})
                raw_result = current_result.setdefault("raw", {
                    "levels": [],
                    "nodes": []
                })
                raw_result["levels"].append([
                    diamond.get("levels", 0) for diamond in diamonds.values()
                ])
                raw_result["nodes"].append(
                    [diamond.get("nodes", 0) for diamond in diamonds.values()])
                current_result.setdefault("node_counts",
                                          []).append(current_node_count)
                current_result.setdefault("identities",
                                          []).append(len(node_dict))
                current_result.setdefault("diamonds", []).append(len(diamonds))
                current_result.setdefault("diamond_nodes", []).append(
                    sum([
                        diamond.get("nodes", 0)
                        for diamond in diamonds.values()
                    ]))
                current_result.setdefault("files", []).append(filepath)
    return result
Example #12
0
def _analyse_diamond_perturbation(kwargs):
    """
    {
        p_count: {
            diamond_count: {
                "profile_distortions": [],              # profile distortion based on frequency
                "profile_distortions_signatures": [],   # profile distortion based on set count
                "distance_errors": []                   # distance error based on frequency
                "distance_errors_signatures": []        # distance error based on set count
                "signature_counts": [],                 # nr of signatures in tree
                "node_counts": [],                      # nr of nodes in tree
                "raw": [{
                    "level": diamond_level,
                    "nested": nesting_level,
                    "nodes": node_count,
                    "signatures": signature_count
                }, ...]
            }
        }
    }

    :param kwargs: dict with keys filepath and signature_builders
    :return:
    """
    filepath = kwargs.get("filepath", None)
    signature_builders = kwargs.get("signature_builders", None)
    tree_builder = CSVTreeBuilder()
    perturbation_results = MulticoreResult()
    try:
        tree = tree_builder.build(filepath)
    except (DataNotInCacheException, TreeInvalidatedException):
        pass
    else:
        if tree is not None:
            for signature_builder in signature_builders:
                diamonds = {}
                node_signatures = set()
                signature = signature_builder()
                node_count = 0
                for node in tree.node_iter():
                    node_count += 1
                    current_signature = signature.get_signature(
                        node, node.parent())
                    node_signatures.add(current_signature[0])
                    diamond = diamonds.setdefault(current_signature[0], {})
                    diamond.setdefault("signatures",
                                       set()).add(current_signature[1])
                    diamond.setdefault("nodes", set()).add(node)
                # keep only keys with more than one position-1 signature
                diamonds = {
                    key: diamond
                    for key, diamond in diamonds.items()
                    if len(diamond.get("signatures", set())) > 1
                }
                diamond_perturbation = {}
                for diamond_key, diamond in diamonds.items():
                    # found a diamond, that represents several diamond nodes
                    result = diamond_perturbation.setdefault(
                        diamond_key, {
                            "nested": 0,
                            "nodes": set(),
                            "signatures": set()
                        })
                    result["level"] = max(
                        0,
                        len(diamond.get("signatures", set())) - 1)
                    for node in diamond.get("nodes"):
                        to_check = set(node.children_list())
                        result["nodes"].add(node)
                        # FIX: node.parent was passed as a bound method
                        # instead of being called (node.parent()), unlike
                        # every other get_signature call site
                        result["signatures"].add(
                            signature.get_signature(node, node.parent())[0])
                        # walk the whole subtree below the diamond node
                        while to_check:
                            child = to_check.pop()
                            result["nodes"].add(child)
                            child_signatures = signature.get_signature(
                                child, child.parent())
                            result["signatures"].add(child_signatures[0])
                            to_check.update(child.children_list())
                            if child_signatures[0] in diamonds:
                                # diamond is a nested diamond, so initialise it here
                                diamond_perturbation[child_signatures[0]] = {
                                    "level": 1,
                                    "nested": result["nested"] + 1,
                                    "nodes": set(),
                                    "signatures": set()
                                }
                diamond_count = len(diamond_perturbation)
                # index by p value (signature height) and diamond count
                perturbation_result = perturbation_results.setdefault(
                    signature._signatures[0]._height,
                    {}).setdefault(diamond_count, {})
                perturbation_result.setdefault(
                    "profile_distortions", []).append(
                        sum([
                            len(diamond.get("nodes", [])) * diamond["level"]
                            for diamond in diamond_perturbation.values()
                        ]))
                perturbation_result.setdefault(
                    "profile_distortions_signatures", []).append(
                        sum([
                            len(diamond.get("signatures", [])) *
                            diamond["level"]
                            for diamond in diamond_perturbation.values()
                        ]))
                perturbation_result.setdefault("distance_errors", []).append(
                    sum([
                        len(diamond.get("nodes", []))
                        for diamond in diamond_perturbation.values()
                    ]))
                perturbation_result.setdefault(
                    "distance_errors_signatures", []).append(
                        sum([
                            len(diamond.get("signatures", []))
                            for diamond in diamond_perturbation.values()
                        ]))
                perturbation_result.setdefault("signature_counts",
                                               []).append(len(node_signatures))
                perturbation_result.setdefault("node_counts",
                                               []).append(node_count)
                perturbation_result.setdefault("raw", []).append({
                    key: {
                        "level": value["level"],
                        "nested": value["nested"],
                        "nodes": len(value["nodes"]),
                        "signatures": len(value["signatures"])
                    }
                    for key, value in diamond_perturbation.items()
                })
    return perturbation_results
Example #13
0
def analyse_compression(ctx, pcount):
    """
    Method prepares data for further compression analysis. Thus, it collects information on
    * number of nodes in original tree
    * height of tree as an optional information
    * size of the alphabet (optimised by excluding id numbers in names)
    * number of unique identities generated
    * statistics on the trees fanout

    The following output format can be expected

    <number of nodes>: {
        "file": [<string>, ...],
        "alphabet_count": [<int>, ...],
        "tree_height": [<int>, ...],
        "identity_count": {
            <Signature>: [<int>, ...]
        },
        "fanout": {
            "min": [<int>, ...],
            "max": [<int>, ...],
            "mean": [<float>, ...],
            "std": [<float>, ...],
            "full": [[<int>, ...], ...]
        }
    }

    :param ctx:
    :param pcount:
    :return:
    """
    results = MulticoreResult()
    ctx.obj["json"] = True
    if ctx.obj.get("use_input", False):
        structure = ctx.obj.get("structure", None)
        file_path = structure.input_file_path()
        signature_builders = ctx.obj.get("configurations",
                                         [{}])[0].get("signatures", [])

        with open(file_path, "r") as input_file:
            analysis_files = json.load(input_file).get("data", None)
            # one job description per path inside each tree_path list
            data = [{
                "node_count": node_count,
                "filepath": path,
                "signature_builders": signature_builders
            } for node_count, tree_paths in analysis_files.items()
                    for tree_path in tree_paths for path in tree_path]
            if pcount > 1:
                for partial in do_multicore(count=pcount,
                                            target=_analyse_compression,
                                            data=data):
                    results += partial
            else:
                for elem in data:
                    results += _analyse_compression(elem)

    output_results(ctx=ctx,
                   results=results,
                   version=determine_version(
                       os.path.dirname(assess_workflows.__file__)),
                   source="%s (%s)" % (__file__, "analyse_compression"))
Example #14
0
def _full_statistics(kwargs):
    """
    Collect full per-file statistics for a single tree; see full_statistics
    for the exact output format (node counts, alphabet, duration, fanout,
    depth and attribute event counts keyed by filepath).

    :param filepath: Path for tree to consider
    :param kwargs:
    :return:
    """
    filepath = kwargs.get("filepath", None)
    result = MulticoreResult()
    tree_builder = CSVTreeBuilder()  # renamed from misspelled "tree_bilder"
    try:
        tree = tree_builder.build(filepath)
    except (DataNotInCacheException, TreeInvalidatedException):
        pass
    else:
        if tree is not None:
            attributes_on_nodes = 0
            nodes_with_attributes = 0
            alphabet = set()
            fanout = []
            complete_fanout = []
            depth = []
            complete_depth = []
            attribute_events = []
            for node in tree.node_iter():
                # check if node has traffic
                attribute_count = 0
                if node.traffic:
                    current_count = 0
                    available_attributes = set()
                    for traffic in node.traffic:
                        if traffic.in_rate > 0:
                            current_count += 1
                            available_attributes.add("%s_in_rate" %
                                                     traffic.conn_cat)
                        if traffic.out_rate > 0:
                            current_count += 1
                            available_attributes.add("%s_out_rate" %
                                                     traffic.conn_cat)
                    attribute_count = len(available_attributes)
                    attributes_on_nodes += attribute_count
                    attribute_events.append(current_count)
                    nodes_with_attributes += 1
                if len(node.children_list()) > 0:
                    # determine fanout
                    fanout.append(len(node.children_list()))
                    if attribute_count > 0:
                        complete_fanout.append(
                            len(node.children_list()) + attribute_count)
                    else:
                        complete_fanout.append(len(node.children_list()))
                else:
                    # node is a leaf, so determine depth in tree
                    current_depth = node.depth()
                    depth.append(current_depth)
                    if attribute_count > 0:
                        # attributes hang one level below the leaf
                        complete_depth.extend([
                            current_depth + 1 for _ in range(attribute_count)
                        ])
                    else:
                        complete_depth.append(current_depth)
                alphabet.add(node.name)
            current_result = result.setdefault(filepath, {})
            current_result["node_count"] = tree.node_count()
            current_result["complete_node_count"] = tree.node_count(
            ) + attributes_on_nodes
            current_result[
                "nodes_with_attribute_count"] = nodes_with_attributes
            current_result["alphabet_count"] = len(alphabet)
            current_result["duration"] = tree.root().exit_tme - tree.root().tme
            current_result["fanout"] = fanout
            current_result["complete_fanout"] = complete_fanout
            current_result["depth"] = depth
            current_result["complete_depth"] = complete_depth
            current_result["attribute_event_count"] = attribute_events
    return result
def _generate_perturbated_tree(kwargs):
    """
    Multicore worker: build the tree for a single file and generate randomly
    perturbed variants of it, one batch per requested probability.

    :param kwargs: Dict carrying the actual arguments:
    :param filepath: Path to consider
    :param probabilities: List of probabilities
    :param repeat: How often to repeat a single probability
    :param insert_probability: Probability to insert item
    :param delete_probability: Probability to delete item
    :param change_probability: Probability to change item
    :param move_probability: Probability to move item
    :param leaf_nodes_only: Only include leaf nodes?
    :param internal_nodes_only: Only include internal nodes?
    :param attribute_nodes_only: Only include attribute nodes?
    :param cost: True or False
    :return: MulticoreResult mapping filepath to {"tree": <original tree>,
        "perturbated_tree": {probability: [perturbed trees]}}; empty when the
        file could not be loaded
    """
    result = MulticoreResult()
    filepath = kwargs.get("filepath", None)
    probabilities = kwargs.get("probabilities", [])
    repeat = kwargs.get("repeat", 1)
    insert_probability = kwargs.get("insert_probability", 0)
    delete_probability = kwargs.get("delete_probability", 0)
    change_probability = kwargs.get("change_probability", 0)
    move_probability = kwargs.get("move_probability", 0)
    leaf_nodes_only = kwargs.get("leaf_nodes_only", False)
    internal_nodes_only = kwargs.get("internal_nodes_only", False)
    attribute_nodes_only = kwargs.get("attribute_nodes_only", False)
    cost = kwargs.get("cost", True)

    tree_builder = CSVTreeBuilder()
    try:
        tree = tree_builder.build(filepath)
    except (DataNotInCacheException, TreeInvalidatedException):
        # consistent with the other worker functions in this module:
        # skip files that are not cached or yield an invalid tree
        tree = None
    if tree is not None:
        result.setdefault(filepath, {})
        result[filepath]["tree"] = tree
        result[filepath].setdefault("perturbated_tree", {})
        for probability in probabilities:
            if attribute_nodes_only:
                # attribute-only variant: deletion is the only operation,
                # applied exclusively to attribute nodes
                ted_generator = TEDGenerator(
                    costs=[],
                    operation_generator=RandomOperation(
                        delete_probability=1,
                        delete_operation=DeleteAttributeTreeEditOperation(
                            probability=probability)),
                    probability=1,
                    skip_node=skip_all_but_attribute_nodes)
            else:
                ted_generator = TEDGenerator(
                    costs=[
                        TreeEditDistanceCost(),
                        FanoutWeightedTreeEditDistanceCost(),
                        SubtreeWeightedTreeEditDistanceCost(),
                        SubtreeHeightWeightedTreeEditDistanceCost(),
                        SubtreeWeightedTreeEditDistanceCostWithMove()
                    ] if cost else [],
                    operation_generator=RandomOperation(
                        insert_probability=insert_probability,
                        delete_probability=delete_probability,
                        edit_probability=change_probability,
                        move_probability=move_probability),
                    probability=probability,
                    skip_node=skip_leaf if internal_nodes_only else
                    (skip_inner_node if leaf_nodes_only else skip_no_node))
            for _ in range(repeat):
                perturbated_tree = ted_generator.generate(tree)
                result[filepath]["perturbated_tree"].setdefault(
                    probability, []).append(perturbated_tree)
                # reload tree: generate() mutates it in place, so each
                # repetition must start from a fresh copy
                tree = tree_builder.build(filepath)
    return result
def generate_perturbated_tree(ctx, seed, repeat, probabilities,
                              insert_probability, cost, delete_probability,
                              change_probability, move_probability, pcount,
                              leaf_nodes_only, internal_nodes_only,
                              attribute_nodes_only):
    """
    Generate perturbed variants for every sample tree referenced by the
    configured input file, optionally fanning out over *pcount* processes,
    and (when saving is enabled) persist the results one pickle per base
    tree plus a header pickle mapping tree name -> store path.

    :param ctx: Click-style context; reads ctx.obj["use_input"],
        ctx.obj["structure"] and ctx.obj["save"]
    :param seed: Optional seed for the random module (None leaves it unseeded)
    :param repeat: How often to repeat a single probability
    :param probabilities: List of perturbation probabilities
    :param insert_probability: Probability to insert item
    :param cost: Whether to attach tree edit distance cost functions
    :param delete_probability: Probability to delete item
    :param change_probability: Probability to change item
    :param move_probability: Probability to move item
    :param pcount: Number of worker processes; > 1 enables multicore
    :param leaf_nodes_only: Only include leaf nodes?
    :param internal_nodes_only: Only include internal nodes?
    :param attribute_nodes_only: Only include attribute nodes?
    """
    if seed is not None:
        random.seed(seed)
    results = MulticoreResult()
    if ctx.obj.get("use_input"):
        structure = ctx.obj.get("structure", None)
        with open(structure.input_file_path(), "r") as input_file:
            json_data = json.load(input_file)
            samples = json_data["data"]["samples"]
            # one kwargs dict per file; shared by the multicore and the
            # single-core path so both run identical work items
            data = [{
                "filepath": item,
                "repeat": repeat,
                "probabilities": probabilities,
                "insert_probability": insert_probability,
                "delete_probability": delete_probability,
                "change_probability": change_probability,
                "move_probability": move_probability,
                "leaf_nodes_only": leaf_nodes_only,
                "internal_nodes_only": internal_nodes_only,
                "attribute_nodes_only": attribute_nodes_only,
                "cost": cost
            } for sample in samples for item in sample]
            if pcount > 1:
                multicore_results = do_multicore(
                    count=pcount, target=_generate_perturbated_tree, data=data)
                for result in multicore_results:
                    results += result
            else:
                for entry in data:
                    results += _generate_perturbated_tree(entry)
        if ctx.obj.get("save"):
            # instead of storing all results as one, we split them per base tree
            # a header is used to map all individual stores
            results_header = {}
            for name, result in results.items():
                # sha1 requires bytes, name is a filepath string
                nick = '%s%02s%s' % (hashlib.sha1(
                    name.encode("utf-8")).hexdigest(),
                                     random.getrandbits(8),
                                     time.strftime('%H%M%S'))
                # pickle requires a binary-mode file handle
                with open(
                        structure.intermediate_file_path(file_type="pkl",
                                                         variant=nick),
                        "wb") as output_file:
                    results_header[name] = output_file.name
                    pickle.dump(MulticoreResult({name: result}), output_file)
            with open(structure.intermediate_file_path(file_type="pkl"),
                      "wb") as output_file:
                pickle.dump(results_header, output_file)