Example #1
def get_structure_stats(node):
    num_nodes = len(get_nodes_by_type(node, Node))
    sum_nodes = get_nodes_by_type(node, Sum)
    n_sum_nodes = len(sum_nodes)
    n_prod_nodes = len(get_nodes_by_type(node, Product))
    leaf_nodes = get_nodes_by_type(node, Leaf)
    n_leaf_nodes = len(leaf_nodes)
    edges = get_number_of_edges(node)
    layers = get_depth(node)
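    # parameter count: one weight per sum-node child plus each leaf's own parameters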
    params = 0
    for n in sum_nodes:
        params += len(n.children)
    for l in leaf_nodes:
        params += len(l.parameters)

    return """---Structure Statistics---
# nodes             %s
    # sum nodes     %s
    # prod nodes    %s
    # leaf nodes    %s
# params            %s
# edges             %s
# layers            %s""" % (
        num_nodes,
        n_sum_nodes,
        n_prod_nodes,
        n_leaf_nodes,
        params,
        edges,
        layers,
    )
Example #2
def get_deepdb_size(spn_ensemble):
    # single-table case: the ensemble contains exactly one SPN
    spn = spn_ensemble.spns[0].mspn
    size = 0
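    # count the scalars stored per node type: children, weights, scopes, and distribution parameters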
    nodes = get_nodes_by_type(spn, Product)
    for node in nodes:
        size += len(node.children) + len(node.scope)

    nodes = get_nodes_by_type(spn, Sum)
    for node in nodes:
        assert len(node.children) == len(node.weights) == len(
            node.cluster_centers)
        assert len(node.cluster_centers[0]) == len(node.scope)
        num_child = len(node.children)
        num_var = len(node.scope)
        size += 2 * num_child + num_var + num_var * num_child  # children, weights, scope, cluster_centers

    nodes = get_nodes_by_type(spn, Categorical)
    for node in nodes:
        assert len(node.scope) == 1
        size += 2 + len(node.p)  # scope, cardinality, p

    nodes = get_nodes_by_type(spn, IdentityNumericLeaf)
    for node in nodes:
        assert len(node.scope) == 1
        assert len(node.unique_vals) + 1 == len(node.prob_sum)
        size += 3 + len(node.unique_vals) + len(
            node.prob_sum
        )  # scope, cardinality, null_value_prob, unique_vals, prob_sum

    # assume 4 bytes to store every integer and float
    return size * 4 / 1024 / 1024  # MB
Example #3
def get_execution_layers(spn):
    all_nodes = set(get_nodes_by_type(spn, ntype=(Sum, Product)))
    next_filter_type = Product
    leaves = get_nodes_by_type(spn, Leaf)
    layers = [np.asarray([n.id for n in leaves])]
    layer_types = [Leaf]
    seen_nodes = set(leaves)
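    # peel off alternating Product/Sum layers: a node joins a layer once all of its children are already scheduled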
    while len(all_nodes) > 0:
        filtered_nodes = []
        new_all_nodes = set()

        filter_type = next_filter_type
        for n in all_nodes:
            if isinstance(n, filter_type) and set(n.children).issubset(seen_nodes):
                filtered_nodes.append(n)
            else:
                new_all_nodes.add(n)

        if filter_type == Product:
            next_filter_type = Sum
        else:
            next_filter_type = Product

        if len(filtered_nodes) == 0:
            continue

        assert all_nodes == new_all_nodes | set(filtered_nodes)

        layer_types.append(filter_type)
        all_nodes = new_all_nodes
        layers.append(np.asarray([n.id for n in filtered_nodes]))
        seen_nodes.update(filtered_nodes)

    return layers, layer_types
Example #4
def get_structure_stats_dict(node):
    node_types = dict(Counter([type(n) for n in get_nodes_by_type(node)]))
    num_nodes = len(get_nodes_by_type(node, Node))
    edges = get_number_of_edges(node)
    layers = get_number_of_layers(node)

    # dict.update() returns None, so build the result dict first and update it in place
    result = {
        'nodes': num_nodes,
        'edges': edges,
        'layers': layers
    }
    result.update(node_types)
    return result
Example #5
def run_experiment(exp, spn, test_data, test_type, exp_lambda):
    outprefix = path + "/spns/%s/" % (exp)

    results_file = "%stime_test_%s_ll_%s.txt" % (outprefix, test_type, OS_name)
    if os.path.isfile(results_file):
        return

    print(exp, test_data.shape, test_type)

    ll, test_time = exp_lambda()
    np.savetxt(results_file, ll, delimiter=";")

    import cpuinfo  # imported lazily so the dependency is only needed when results are computed

    machine = cpuinfo.get_cpu_info()["brand"]

    adds, muls = fpga_count_ops(spn)

    test_n = test_data.shape[0]

    results = OrderedDict()
    results["Experiment"] = exp
    results["OS"] = OS_name
    results["machine"] = machine
    results["test type"] = test_type
    results["expected adds"] = adds
    results["expected muls"] = muls
    results["input rows"] = test_n
    results["input cols"] = test_data.shape[1]
    results["spn nodes"] = len(get_nodes_by_type(spn, Node))
    results["spn sum nodes"] = len(get_nodes_by_type(spn, Sum))
    results["spn prod nodes"] = len(get_nodes_by_type(spn, Product))
    results["spn leaves"] = len(get_nodes_by_type(spn, Leaf))
    results["spn edges"] = get_number_of_edges(spn)
    results["spn layers"] = get_number_of_layers(spn)
    results["time per task"] = test_time
    results["time per instance"] = test_time / test_n
    results["avg ll"] = np.mean(ll, dtype=np.float128)

    results_file_name = "results.csv"

    # append to results.csv, writing the header row only when the file is first created
    write_header = not os.path.isfile(results_file_name)
    with open(results_file_name, "a") as results_csv:
        if write_header:
            results_csv.write(";".join(results.keys()))
            results_csv.write("\n")
        results_csv.write(";".join(map(str, results.values())))
        results_csv.write("\n")
Example #6
def get_structure_stats_dict(node):
    node_types = dict(Counter([type(n) for n in get_nodes_by_type(node)]))
    num_nodes = len(get_nodes_by_type(node, Node))
    edges = get_number_of_edges(node)
    layers = get_depth(node)

    result = {
        "nodes": num_nodes,
        "edges": edges,
        "layers": layers,
        "count_per_type": node_types
    }
    return result
Example #7
def get_structure_stats(node):
    num_nodes = len(get_nodes_by_type(node, Node))
    sum_nodes = len(get_nodes_by_type(node, Sum))
    prod_nodes = len(get_nodes_by_type(node, Product))
    leaf_nodes = len(get_nodes_by_type(node, Leaf))
    edges = get_number_of_edges(node)
    layers = get_number_of_layers(node)

    return """---Structure Statistics---
# nodes             %s
    # sum nodes     %s
    # prod nodes    %s
    # leaf nodes    %s
# edges             %s
# layers            %s""" % (num_nodes, sum_nodes, prod_nodes, leaf_nodes,
                             edges, layers)
Example #8
def evaluate_spn_statistics(spn_path, target_csv_path, build_time_path):
    csv_list = []

    # SPN learn times
    for filename in os.listdir(spn_path):
        logger.debug(f'Reading {filename}')
        if not filename.startswith("ensemble") or filename.endswith('.zip'):
            continue

        spn_ensemble = read_ensemble(os.path.join(spn_path, filename))
        for spn in spn_ensemble.spns:
            num_nodes = len(get_nodes_by_type(spn.mspn, Node))
            upper_bound = 200 * len(spn.column_names) - 1
            # assert num_nodes <= upper_bound, "Num of nodes upper bound is wrong"
            csv_list.append((filename, spn.learn_time, spn.full_sample_size,
                             spn.min_instances_slice, spn.rdc_threshold,
                             len(spn.relationship_set), len(spn.table_set),
                             " - ".join([table for table in spn.table_set]),
                             len(spn.column_names), num_nodes, upper_bound))

    # HDF create times
    with open(build_time_path) as f:
        hdf_preprocessing_time = int(f.readlines()[0])
        csv_list += [('generate_hdf', hdf_preprocessing_time, 0, 0, 0, 0, 0,
                      "")]

    with open(target_csv_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow([
            'filename', 'learn_time', 'full_sample_size',
            'min_instances_slice', 'rdc_threshold', 'no_joins', 'no_tables',
            'tables', 'no_columns', 'structure_stats', 'upper_bound'
        ])
        writer.writerows(csv_list)
Example #9
def EM_optimization(spn,
                    data,
                    iterations=5,
                    node_updates=_node_updates,
                    **kwargs):
    for _ in range(iterations):
        lls_per_node = np.zeros((data.shape[0], get_number_of_nodes(spn)))

        # one pass bottom up evaluating the likelihoods
        log_likelihood(spn, data, dtype=data.dtype, lls_matrix=lls_per_node)

        gradients = gradient_backward(spn, lls_per_node)

        R = lls_per_node[:, 0]
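        # column 0 corresponds to the root node, so R is the per-instance root log-likelihood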

        for node_type, func in node_updates.items():  # TODO: do in parallel
            for node in get_nodes_by_type(spn, node_type):
                func(node,
                     node_lls=lls_per_node[:, node.id],
                     node_gradients=gradients[:, node.id],
                     root_lls=R,
                     all_lls=lls_per_node,
                     all_gradients=gradients,
                     data=data,
                     **kwargs)
Example #10
def is_structured_decomposable(spn, verbose=False):

    if not is_consistent(spn):
        return False

    nodes = get_nodes_by_type(spn)

    scope_set = set()
    for n in nodes:
        if isinstance(n, Product):
            scope_set.add(tuple(n.scope))
        elif isinstance(n, CLTree):
            vtree = from_dtree_to_vtree(n.dtree_root)
            scope_set.update([tuple(s) for s in vtree.scopes])

    scopes = [set(t) for t in scope_set]

    # ordering is not needed, but useful for printing
    if verbose:
        scopes.sort(key=len)
        for s in scopes:
            print(s)

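    # structured decomposability: any two scopes must be disjoint or one must contain the other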
    for i in range(len(scopes)):
        for j in range(len(scopes)):
            int_len = len(scopes[i].intersection(scopes[j]))
            if int_len != 0 and int_len != min(len(scopes[i]), len(scopes[j])):
                return False

    return True
Example #11
def meu(root,
        input_data,
        node_bottom_up_meu=_node_bottom_up_meu,
        in_place=False):
    # valid, err = is_valid(node)
    # assert valid, err
    if in_place:
        data = input_data
    else:
        data = np.copy(input_data)
    nodes = get_nodes_by_type(root)
    utility_scope = set()
    for node in nodes:
        if type(node) is Utility:
            utility_scope.add(node.scope[0])
    assert np.all(np.isnan(data[:, list(utility_scope)])
                  ), "Please specify all utility values as np.nan"
    likelihood_per_node = np.zeros((data.shape[0], len(nodes)))
    meu_per_node = np.zeros((data.shape[0], len(nodes)))
    meu_per_node.fill(np.nan)
    # one pass bottom up evaluating the likelihoods
    likelihood(root, data, dtype=data.dtype, lls_matrix=likelihood_per_node)
    eval_spmn_bottom_up_meu(root,
                            node_bottom_up_meu,
                            meu_per_node=meu_per_node,
                            data=data,
                            lls_per_node=likelihood_per_node)
    result = meu_per_node[:, root.id]
    return result
Example #12
def best_next_decision(root, input_data, in_place=False):
    if in_place:
        data = input_data
    else:
        data = np.copy(input_data)
    nodes = get_nodes_by_type(root)
    dec_dict = {}
    # find all possible decision values
    for node in nodes:
        if type(node) == Max:
            if node.dec_idx in dec_dict:
                # set.union returns a new set; update mutates the stored set in place
                dec_dict[node.dec_idx].update(node.dec_values)
            else:
                dec_dict[node.dec_idx] = set(node.dec_values)
    next_dec_idx = None
    # find next undefined decision
    for idx in dec_dict.keys():
        if np.all(np.isnan(data[:, idx])):
            next_dec_idx = idx
            break
    assert next_dec_idx is not None, "please assign all values of the next decision to np.nan"
    # determine best decisions based on meu
    dec_vals = list(dec_dict[next_dec_idx])
    best_decisions = np.full((1, data.shape[0]), dec_vals[0])
    data[:, next_dec_idx] = best_decisions
    meu_best = meu(root, data)
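    # try each remaining decision value and keep, per instance, the one with the highest MEU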
    for i in range(1, len(dec_vals)):
        decisions_i = np.full((1, data.shape[0]), dec_vals[i])
        data[:, next_dec_idx] = decisions_i
        meu_i = meu(root, data)
        best_decisions = np.select([np.greater(meu_i, meu_best), True],
                                   [decisions_i, best_decisions])
        meu_best = np.maximum(meu_i, meu_best)
    return best_decisions
Example #13
def generate_adhoc_value_dict(spn):
    val_dict = {}
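    # val_dict maps feature_id -> [kind, attribute name, value domain]; numeric domains are widened below as more leaves are seen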
    for leaf in get_nodes_by_type(spn, Leaf):
        assert len(leaf.scope) == 1
        feature_id = leaf.scope[0]

        if feature_id in val_dict:
            if val_dict[feature_id][0] == "numeric":
                v_min, v_max = _get_min_max_numeric_from_leaf(leaf)
                if v_min < val_dict[feature_id][2][0]:
                    val_dict[feature_id][2][0] = v_min
                if v_max > val_dict[feature_id][2][1]:
                    val_dict[feature_id][2][1] = v_max
        else:
            if isinstance(leaf, Categorical):
                val_dict[feature_id] = [
                    "discrete", "Attr_" + str(feature_id),
                    {i: str(i)
                     for i in range(len(leaf.p))}
                ]
            elif isinstance(leaf, Gaussian) or isinstance(
                    leaf, PiecewiseLinear) or isinstance(
                        leaf, IdentityNumeric):
                val_dict[feature_id] = [
                    "numeric", "Attr_" + str(feature_id),
                    _get_min_max_numeric_from_leaf(leaf)
                ]
            else:
                raise Exception("Cannot process node-type: " + str(leaf))

    return val_dict
Example #14
def meu(node, input_data, node_top_down_meu=_node_top_down_meu, node_bottom_up_meu=_node_bottom_up_meu, in_place=False):
    valid, err = is_valid(node)
    assert valid, err
    if in_place:
        data = input_data
    else:
        data = np.array(input_data)

    nodes = get_nodes_by_type(node)

    lls_per_node = np.zeros((data.shape[0], len(nodes)))

    # one pass bottom up evaluating the likelihoods
    # log_likelihood(node, data, dtype=data.dtype, node_log_likelihood=node_bottom_up_meu, lls_matrix=lls_per_node)
    likelihood(node, data, dtype=data.dtype, node_likelihood=node_bottom_up_meu, lls_matrix=lls_per_node)

    meu_val = lls_per_node[:, 0]

    instance_ids = np.arange(data.shape[0])

    # one pass top down, following the max branch until a leaf is reached; returns all_result and the decisions taken at each max node for every instance
    all_result, all_decisions = eval_spn_top_down_meu(node, node_top_down_meu, parent_result=instance_ids, data=data,
                                                      lls_per_node=lls_per_node)

    decisions = merge_rows_for_decisions(all_decisions)

    return meu_val, decisions
Example #15
def is_consistent(node):
    '''
    checks that the children of every product node have pairwise disjoint scopes
    that together cover the product node's scope
    '''

    assert node is not None

    allchildscope = set()
    for prod_node in reversed(get_nodes_by_type(node, Product)):
        nscope = set(prod_node.scope)

        if len(prod_node.children) == 0:
            return False, "Product node %s has no children" % (prod_node.id)

        allchildscope.clear()
        sum_features = 0
        for child in prod_node.children:
            sum_features += len(child.scope)
            allchildscope.update(child.scope)

        if allchildscope != nscope or sum_features != len(allchildscope):
            return False, "children of (prod) node %s do not have exclusive scope" % (
                prod_node.id)

    return True, None
Example #16
def _get_networkx_obj(spn):
    import networkx as nx
    from spn.structure.Base import Sum, Product, Leaf, get_nodes_by_type
    import numpy as np

    all_nodes = get_nodes_by_type(spn)
    logger.info(all_nodes)

    g = nx.Graph()

    labels = {}
    for n in all_nodes:

        if isinstance(n, Sum):
            label = "+"
        elif isinstance(n, Product):
            label = "x"
        else:
            label = "V" + str(n.scope[0])
        g.add_node(n.id)
        labels[n.id] = label

        if isinstance(n, Leaf):
            continue
        for i, c in enumerate(n.children):
            edge_label = ""
            if isinstance(n, Sum):
                edge_label = np.round(n.weights[i], 2)
            g.add_edge(c.id, n.id, weight=edge_label)

    return g, labels
Example #17
def EM_optimization(spn, data, iterations=5, node_updates=_node_updates, skip_validation=False, **kwargs):
    if not skip_validation:
        valid, err = is_valid(spn)
        assert valid, "invalid spn: " + err

    lls_per_node = np.zeros((data.shape[0], get_number_of_nodes(spn)))
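    # allocated once and overwritten on every EM iteration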

    for _ in range(iterations):
        # one pass bottom up evaluating the likelihoods
        log_likelihood(spn, data, dtype=data.dtype, lls_matrix=lls_per_node)

        gradients = gradient_backward(spn, lls_per_node)

        R = lls_per_node[:, 0]

        for node_type, func in node_updates.items():
            for node in get_nodes_by_type(spn, node_type):
                func(
                    node,
                    node_lls=lls_per_node[:, node.id],
                    node_gradients=gradients[:, node.id],
                    root_lls=R,
                    all_lls=lls_per_node,
                    all_gradients=gradients,
                    data=data,
                    **kwargs
                )
Example #18
def is_valid(node, check_ids=True):

    if check_ids:
        val, err = has_valid_ids(node)
        if not val:
            return val, err

    for n in get_nodes_by_type(node):
        if len(n.scope) == 0:
            return False, "node %s has no scope" % (n.id)
        is_sum = isinstance(n, Sum)
        is_prod = isinstance(n, Product)

        if is_sum:
            if len(n.children) != len(n.weights):
                return False, "node %s has a different number of children and weights" % (n.id)

        if is_sum or is_prod:
            if len(n.children) == 0:
                return False, "node %s has no children" % (n.id)

    a, err = is_consistent(node)
    if not a:
        return a, err

    b, err = is_complete(node)
    if not b:
        return b, err

    return True, None
Example #19
def feature_gradient(node,
                     data,
                     node_gradient_functions=_node_feature_gradients,
                     lls_per_node=None):
    """
    Feature gradients are computed for the input query and each feature using
    the backwards automatic differentiation. In mathematicl terms, it computes the
    partial derivatives \partial P(X) / \partial X_i
 

    :param node: Node for the gradient calculation
    :param data: data for the computation. NaN values are implicitely marginalized out
    :param lls_per_node: optional for storing the intermediate results
    """

    all_leaves = get_nodes_by_type(node, Leaf)

    if lls_per_node is None:
        lls_per_node = np.full((data.shape[0], get_number_of_nodes(node)),
                               np.nan)
    log_likelihood(node, data, lls_matrix=lls_per_node)

    gradients = np.exp(gradient_backward(node, lls_per_node))

    node_gradients = []

    for spn_node in all_leaves:
        i = spn_node.id
        result = node_gradient_functions[type(spn_node)](spn_node, data)
        node_gradients.append(result * gradients[:, i].reshape(-1, 1))

    node_gradients = np.array(node_gradients)

    return np.nansum(node_gradients, axis=0)
Example #20
def sample_instances(node, input_data, rand_gen, node_sampling=_node_sampling, in_place=False):
    """
    Implementing hierarchical sampling

    """

    # first, we do a bottom-up pass to compute the likelihood taking into account marginals.
    # then we do a top-down pass, to sample taking into account the likelihoods.

    if in_place:
        data = input_data
    else:
        data = np.array(input_data)

    valid, err = is_valid(node)
    assert valid, err

    assert np.all(
        np.any(np.isnan(data), axis=1)), "each row must have at least one NaN value where the samples will be substituted"

    nodes = get_nodes_by_type(node)

    lls_per_node = np.zeros((data.shape[0], len(nodes)))

    log_likelihood(node, data, dtype=data.dtype, lls_matrix=lls_per_node)

    instance_ids = np.arange(data.shape[0])

    eval_spn_top_down(node, node_sampling, input_vals=instance_ids, data=data, lls_per_node=lls_per_node,
                      rand_gen=rand_gen)

    return data
Example #21
def mpe(
    node,
    input_data,
    node_top_down_mpe=_node_top_down_mpe,
    node_bottom_up_mpe_log=_node_bottom_up_mpe_log,
    in_place=False,
):
    valid, err = is_valid(node)
    assert valid, err

    assert np.all(
        np.any(np.isnan(input_data), axis=1)
    ), "each row must have at least one NaN value where the samples will be substituted"

    if in_place:
        data = input_data
    else:
        data = np.array(input_data)

    nodes = get_nodes_by_type(node)

    lls_per_node = np.zeros((data.shape[0], len(nodes)))

    # one pass bottom up evaluating the likelihoods
    log_likelihood(node, data, dtype=data.dtype, node_log_likelihood=node_bottom_up_mpe_log, lls_matrix=lls_per_node)

    instance_ids = np.arange(data.shape[0])

    # one pass top down, following the max branch until a leaf is reached; the leaf then fills the NaN slot with its mode
    eval_spn_top_down(node, node_top_down_mpe, parent_result=instance_ids, data=data, lls_per_node=lls_per_node)

    return data
Example #22
def __get_networkx_obj(spn):
    import networkx as nx
    all_nodes = get_nodes_by_type(spn)
    g = nx.Graph()

    labels = {}
    for n in all_nodes:

        if isinstance(n, Sum):
            label = "+\n{}".format(n.scope)
        elif isinstance(n, Product):
            label = "x"
        elif isinstance(n, Gaussian):
            label = "G" + str(n.scope[0]) + "\n(" + str(round(
                n.mean, 2)) + ", " + str(round(n.stdev, 2)) + ")"
        elif isinstance(n, Categorical):
            vals = [round(x, 2) for x in n.p]
            label = "C" + str(n.scope[0]) + " (" + str(vals) + ")"
        else:
            label = "Unk" + str(n.scope[0])
        g.add_node(n.id)
        labels[n.id] = label

        if isinstance(n, Leaf):
            continue
        for i, c in enumerate(n.children):
            edge_label = ""
            if isinstance(n, Sum):
                edge_label = np.round(n.weights[i], 2)
            g.add_edge(c.id, n.id, weight=edge_label)

    return g, labels
Example #23
def Expectation(spn, feature_id, ranges, node_expectation, node_likelihood):
    def leaf_expectation(node, data, dtype=np.float64, **kwargs):
        if node.scope[0] == feature_id:
            t_node = type(node)
            if t_node in node_expectation:
                exps = np.zeros((data.shape[0], 1), dtype=dtype)
                exps[:] = node_expectation[t_node](node)
                return exps
            else:
                raise Exception("Node type unknown for expectation: " +
                                str(t_node))
        else:
            t_node = type(node)
            if t_node in node_likelihood:
                return node_likelihood[t_node](node,
                                               ranges,
                                               node_likelihood=node_likelihood)
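            # note: node types missing from node_likelihood silently fall through and return None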

    node_expectations = {
        type(leaf): leaf_expectation
        for leaf in get_nodes_by_type(spn, Leaf)
    }
    node_expectations.update({Sum: sum_likelihood, Product: prod_likelihood})

    expectation = likelihood(spn, ranges, node_likelihood=node_expectations)
    expectation = expectation / likelihood(
        spn, ranges, node_likelihood=node_likelihood)

    return expectation
Example #24
def build_spn(dep_tree, table_keys, scopes, attribute_owners, path_constraints=None, cache=None):
    def build_recursive(dep_tree, table_keys, scopes, attribute_owners, path_constraints=None, cache=None):
        if path_constraints is None:
            path_constraints = []

        new_node = Sum()
        for (table_names_keys, dep_node) in get_dependncy_keys(dep_tree, table_keys, attribute_owners,
                                                               path_constraints):

            for constraint_configuration, cached_node_count in get_constraint_values(table_names_keys, path_constraints,
                                                                                     cache):
                p_node = Product()
                new_node.children.append(p_node)
                count_value = 1

                for cached_node, node_count in cached_node_count:
                    p_node.children.append(cached_node)
                    count_value *= node_count

                for dep_children_node in dep_node.children:
                    if dep_children_node.name[0] == '@':
                        continue

                    node, count = build_recursive(dep_children_node, table_keys, scopes, attribute_owners,
                                                  path_constraints=constraint_configuration,
                                                  cache=cache)
                    p_node.children.append(node)
                    count_value *= count
                new_node.weights.append(count_value)

        wsum = np.sum(new_node.weights)
        # new_node.weights = [w / wsum for w in new_node.weights]

        return new_node, wsum

    root, count = build_recursive(dep_tree, table_keys, scopes, attribute_owners, path_constraints=path_constraints,
                                  cache=cache)
    # normalize sum weights and categorical counts into proper distributions
    for sum_node in get_nodes_by_type(root, Sum):
        normalization = np.sum(sum_node.weights)
        sum_node.weights = [w / normalization for w in sum_node.weights]
    for cat_node in get_nodes_by_type(root, CategoricalDictionary):
        psum = sum(cat_node.p.values())
        cat_node.p = {name: count / psum for name, count in cat_node.p.items()}
    return root
Example #25
    def test_ll_matrix(self):
        add_node_likelihood(Leaf, sum_and_multiplier_ll)

        node_1_1_1_1 = leaf(2, 1)
        node_1_1_1_2 = leaf(2, 2)
        node_1_1_1 = 0.7 * node_1_1_1_1 + 0.3 * node_1_1_1_2
        node_1_1_2 = leaf([0, 1], 3)
        node_1_1 = node_1_1_1 * node_1_1_2
        node_1_2_1_1_1 = leaf(0, 5)
        node_1_2_1_1_2 = leaf(1, 4)
        node_1_2_1_1 = node_1_2_1_1_1 * node_1_2_1_1_2
        node_1_2_1_2 = leaf([0, 1], 6)
        node_1_2_1 = 0.1 * node_1_2_1_1 + 0.9 * node_1_2_1_2
        node_1_2_2 = leaf(2, 3)
        node_1_2 = node_1_2_1 * node_1_2_2
        spn = 0.4 * node_1_1 + 0.6 * node_1_2

        assign_ids(spn)

        max_id = max([n.id for n in get_nodes_by_type(spn)])

        data = np.random.rand(10, 10)
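        # recompute every node's output by hand (the *_r arrays) to compare against the SPN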

        node_1_1_1_1_r = data[:, 2] * 1
        node_1_1_1_2_r = data[:, 2] * 2
        node_1_1_1_r = 0.7 * node_1_1_1_1_r + 0.3 * node_1_1_1_2_r
        node_1_1_2_r = 3 * (data[:, 0] + data[:, 1])
        node_1_1_r = node_1_1_1_r * node_1_1_2_r
        node_1_2_1_1_1_r = data[:, 0] * 5
        node_1_2_1_1_2_r = data[:, 1] * 4
        node_1_2_1_1_r = node_1_2_1_1_1_r * node_1_2_1_1_2_r
        node_1_2_1_2_r = 6 * (data[:, 0] + data[:, 1])
        node_1_2_1_r = 0.1 * node_1_2_1_1_r + 0.9 * node_1_2_1_2_r
        node_1_2_2_r = data[:, 2] * 3
        node_1_2_r = node_1_2_1_r * node_1_2_2_r
        spn_r = 0.4 * node_1_1_r + 0.6 * node_1_2_r

        self.assert_correct(spn, data, spn_r)

        lls = np.zeros((data.shape[0], max_id + 1))
        likelihood(spn, data, lls_matrix=lls)
        llls = np.zeros((data.shape[0], max_id + 1))
        log_likelihood(spn, data, lls_matrix=llls)

        self.assertTrue(np.all(np.isclose(lls, np.exp(llls))))

        self.assertTrue(np.all(np.isclose(spn_r, lls[:, spn.id])))
        self.assertTrue(np.all(np.isclose(node_1_2_r, lls[:, node_1_2.id])))
        self.assertTrue(np.all(np.isclose(node_1_2_2_r, lls[:, node_1_2_2.id])))
        self.assertTrue(np.all(np.isclose(node_1_2_1_r, lls[:, node_1_2_1.id])))
        self.assertTrue(np.all(np.isclose(node_1_2_1_2_r, lls[:, node_1_2_1_2.id])))
        self.assertTrue(np.all(np.isclose(node_1_2_1_1_r, lls[:, node_1_2_1_1.id])))
        self.assertTrue(np.all(np.isclose(node_1_2_1_1_2_r, lls[:, node_1_2_1_1_2.id])))
        self.assertTrue(np.all(np.isclose(node_1_2_1_1_1_r, lls[:, node_1_2_1_1_1.id])))
        self.assertTrue(np.all(np.isclose(node_1_1_r, lls[:, node_1_1.id])))
        self.assertTrue(np.all(np.isclose(node_1_1_2_r, lls[:, node_1_1_2.id])))
        self.assertTrue(np.all(np.isclose(node_1_1_1_r, lls[:, node_1_1_1.id])))
        self.assertTrue(np.all(np.isclose(node_1_1_1_2_r, lls[:, node_1_1_1_2.id])))
        self.assertTrue(np.all(np.isclose(node_1_1_1_1_r, lls[:, node_1_1_1_1.id])))
Example #26
def Expectation(spn,
                feature_scope,
                evidence_scope,
                evidence,
                node_expectation=_node_expectation):
    """Compute the Expectation:

        E[X_feature_scope | X_evidence_scope] given the spn and the evidence data

    Keyword arguments:
    spn -- the spn to compute the probabilities from
    feature_scope -- set() of integers, the scope of the features to get the expectation from
    evidence_scope -- set() of integers, the scope of the evidence features
    evidence -- numpy 2d array of the evidence data
    """

    if evidence_scope is None:
        evidence_scope = set()

    assert not (len(evidence_scope) > 0 and evidence is None)

    assert len(feature_scope.intersection(evidence_scope)) == 0

    marg_spn = marginalize(spn, keep=feature_scope | evidence_scope)

    def leaf_expectation(node, data, dtype=np.float64, **kwargs):
        if node.scope[0] in feature_scope:
            t_node = type(node)
            if t_node in node_expectation:
                exps = np.zeros((data.shape[0], 1), dtype=dtype)
                exps[:] = node_expectation[t_node](node)
                return exps
            else:
                raise Exception('Node type unknown: ' + str(t_node))

        return likelihood(node, evidence)

    node_expectations = {
        type(leaf): leaf_expectation
        for leaf in get_nodes_by_type(marg_spn, Leaf)
    }
    node_expectations.update({Sum: sum_likelihood, Product: prod_likelihood})

    if evidence is None:
        # fake_evidence carries no information; only its shape is used
        fake_evidence = np.zeros((1, len(spn.scope))).reshape(1, -1)
        expectation = likelihood(marg_spn,
                                 fake_evidence,
                                 node_likelihood=node_expectations)
        return expectation

    # if we have evidence, compute the conditional expectation
    expectation = likelihood(marg_spn,
                             evidence,
                             node_likelihood=node_expectations)
    expectation = expectation / likelihood(
        marginalize(marg_spn, keep=evidence_scope), evidence)

    return expectation
Example #27
def plot_spn2(spn, fname="plot.pdf"):
    import networkx as nx
    from networkx.drawing.nx_pydot import graphviz_layout

    import matplotlib.pyplot as plt
    from spn.structure.Base import Sum, Product, Leaf, get_nodes_by_type
    import numpy as np

    all_nodes = get_nodes_by_type(spn)

    g = nx.DiGraph()

    labels = {}
    for n in all_nodes:

        if isinstance(n, Sum):
            label = "+"
        elif isinstance(n, Product):
            label = "x"
        else:
            label = "V" + str(n.scope[0])
        g.add_node(n.id)
        labels[n.id] = label

        if isinstance(n, Leaf):
            continue
        for i, c in enumerate(n.children):
            edge_label = ""
            if isinstance(n, Sum):
                edge_label = np.round(n.weights[i], 2)
            g.add_edge(c.id, n.id, weight=edge_label)

    pos = graphviz_layout(g, prog='dot', args="height=200")
    #pos = nx.drawing.layout.rescale_layout(pos, 10)
    plt.figure(figsize=(18, 12))
    ax = plt.gca()
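    # flip the y-axis so the tree is drawn with the root at the top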
    ax.invert_yaxis()

    nx.draw(g,
            pos,
            with_labels=True,
            arrows=False,
            node_color='#DDDDDD',
            edge_color='#DDDDDD',
            width=1,
            node_size=250,
            labels=labels,
            font_size=6)
    ax.collections[0].set_edgecolor("#888888")
    nx.draw_networkx_edge_labels(
        g,
        pos=pos,
        edge_labels=nx.get_edge_attributes(g, 'weight'),
        font_size=5,
        clip_on=False,
        alpha=0.6)
    plt.tight_layout()
    plt.savefig(fname)
Example #28
def validate_ids(node):
    all_nodes = get_nodes_by_type(node)

    ids = set()
    for n in all_nodes:
        ids.add(n.id)

    assert len(ids) == len(all_nodes), "node ids are not unique"

    assert min(ids) == 0 and max(ids) == len(ids) - 1, "ids are not contiguous starting at 0"
Example #29
def python_eval_func(data):
    num_nodes = len(get_nodes_by_type(node))
    # It has to be this way - otherwise the data doesn't appear contiguous in CPP.
    # np.ascontiguousarray doesn't seem to work either.
    results = []
    for _ in range(num_nodes):
        results += np.zeros(shape=(data.shape[0]), dtype="float32").tolist()
    results = np.array(results).reshape((data.shape[0], num_nodes))
    spn_many(data, results, results.shape[0])
    return results
Example #30
def init_spn_sampling(node):
    all_nodes = get_nodes_by_type(node)

    map_id_nodes = {n.id: n for n in all_nodes}

    reset_node_counters(node)

    return map_id_nodes