Esempio n. 1
0
    def generator(num_nodes, seed, weight=None):
        """Build a seeded StructureModel with shuffled two-letter nodes and one edge.

        Args:
            num_nodes: how many two-letter names ("aa", "ab", ...) become nodes.
            seed: value passed to ``np.random.seed`` before the shuffle.
            weight: weight given to the single ("aa", "ab") edge.

        Returns:
            The populated StructureModel.
        """
        np.random.seed(seed)

        letters = string.ascii_lowercase
        # first ``num_nodes`` names of the sequence "aa", "ab", ..., "zz"
        all_names = ["".join(pair) for pair in product(letters, letters)]
        node_names = all_names[:num_nodes]
        # node insertion order is randomised, reproducibly for a given seed
        np.random.shuffle(node_names)

        model = StructureModel()
        model.add_nodes_from(node_names)
        # one edge:
        model.add_weighted_edges_from([("aa", "ab", weight)])
        return model
Esempio n. 2
0
    def test_edges_exist(self):
        """Every edge of a chain graph should be drawn as an arrow patch"""

        for n_nodes in range(2, 10):
            letters = list(ascii_lowercase[:n_nodes])
            # chain a -> b -> c -> ... has exactly n_nodes - 1 edges
            chain_edges = list(zip(letters[:-1], letters[1:]))
            _, ax, _ = plot_structure(StructureModel(chain_edges))
            arrows = [
                p for p in ax.patches
                if isinstance(p, plt.patches.FancyArrowPatch)
            ]
            assert len(arrows) == n_nodes - 1
    def test_fit_invalid_lv_states(self, lv_states):
        """Fitting latent CPDs with the given invalid states should raise a ValueError"""

        expected_msg = "Latent variable 'd' contains no states"
        with pytest.raises(ValueError, match=expected_msg):
            data, base_sm, _, _ = naive_bayes_plus_parents()
            # rebuild the structure from the bare edge list
            network = BayesianNetwork(StructureModel(list(base_sm.edges)))
            network.add_node("d", [("z", "d")], [])
            network.fit_latent_cpds("d", lv_states, data)
    def test_remove_edges_below_threshold(self):
        """Edges weighing less than the given threshold should be removed"""

        kept = [(1, 2, 1.0), (1, 3, 0.8), (1, 5, 2.0)]
        dropped = [(1, 4, 0.4), (2, 3, 0.6), (3, 5, 0.5)]

        sm = StructureModel()
        for batch in (kept, dropped):
            sm.add_weighted_edges_from(batch)

        sm.remove_edges_below_threshold(0.7)

        remaining = set(sm.edges(data="weight"))
        assert remaining == set(kept)
Esempio n. 5
0
    def test_negative_weights(self):
        """Negative edges whose magnitude reaches the threshold should be kept"""

        kept = [(1, 2, -3.0), (3, 1, 0.7), (1, 5, -2.0)]
        dropped = [(1, 4, 0.4), (2, 3, -0.6), (3, 5, -0.5)]

        sm = StructureModel()
        for batch in (kept, dropped):
            sm.add_weighted_edges_from(batch)

        sm.remove_edges_below_threshold(0.7)

        remaining = set(sm.edges(data="weight"))
        assert remaining == set(kept)
Esempio n. 6
0
    def __init__(
        self,
        list_of_edges: List[Tuple[str, str]],
        discretiser_alg: Optional[Dict[str, str]] = None,
        discretiser_kwargs: Optional[Dict[str, Dict[str, Any]]] = None,
        probability_kwargs: Optional[Dict[str, Any]] = None,
        return_prob: bool = False,
    ):
        """
        Args:
            list_of_edges (list): Edge list to construct graph
            discretiser_alg (dict): Specify an algorithm to discretise
            each feature in the data. Available options for the dictionary values
            are ['unsupervised', 'tree', 'mdlp']
            - if 'unsupervised': discretise the data using unsupervised method
            - if 'tree': discretise the data using decision tree method
            - if 'mdlp': discretise the data using MDLP method
            If None, an empty mapping is stored and an info message is logged.
            discretiser_kwargs (dict): Keyword arguments for discretisation methods.
            Only applicable if discretiser_alg is not None.
            probability_kwargs (dict): keyword arguments for the probability model.
            If None or empty, defaults to
            {"method": "BayesianEstimator", "bayes_prior": "K2"}
            return_prob (bool): choose to return predictions or probability
            - if True: return pandas dataframe with predicted probability for each state
            - if False: return a 1-D prediction array

        Raises:
            KeyError: If an incorrect argument is passed
            ValueError: If the keys in discretiser_alg and discretiser_kwargs differ
        """

        # `or` means an empty dict also falls back to the default estimator
        probability_kwargs = probability_kwargs or {
            "method": "BayesianEstimator",
            "bayes_prior": "K2",
        }

        if discretiser_alg is None:
            logging.info("No discretiser algorithm was given. "
                         "The training data will not be discretised")
            discretiser_alg = {}

        discretiser_kwargs = discretiser_kwargs or {}

        # validate before any attribute is set, so a failure leaves no
        # half-initialised state on the instance
        self._validate_discretiser(discretiser_alg, discretiser_kwargs)

        self.list_of_edges = list_of_edges
        self.structure = StructureModel(self.list_of_edges)
        self.bn = BayesianNetwork(self.structure)
        self.return_prob = return_prob
        self.probability_kwargs = probability_kwargs
        self.discretiser_kwargs = discretiser_kwargs
        self.discretiser_alg = discretiser_alg
        self._target_name = None
        self._discretise_data = None
Esempio n. 7
0
    def test_f1score_generated(self, adjacency_mat_num_stability):
        """A structure learnt with regularisation should score a high F1 against the ground truth"""
        labels = ["a", "b", "c", "d", "e"]
        df = pd.DataFrame(adjacency_mat_num_stability, columns=labels, index=labels)

        # ground truth built from the raw matrix, as is the learnt graph below
        train_model = StructureModel(df.values)
        X = generate_continuous_dataframe(StructureModel(df), 50, noise_scale=1, seed=1)
        g = from_numpy(X[labels].values, lasso_beta=0.1, w_threshold=0.25)

        right_edges = train_model.edges
        n_hits = len(set(g.edges).intersection(set(right_edges)))
        precision = n_hits / len(g.edges)
        recall = n_hits / len(right_edges)
        f1_score = 2 * (precision * recall) / (precision + recall)

        assert f1_score > 0.85
Esempio n. 8
0
 def test_display_importerror_mpl(self):
     """display_plot_mpl should raise ImportError when matplotlib cannot be imported"""
     sm = StructureModel([("a", "b")])
     viz = plot_structure(sm, prog="neato")
     try:
         # a None entry in sys.modules makes `import matplotlib` fail
         with patch.dict("sys.modules", {"matplotlib": None}):
             reload(display)
             with pytest.raises(
                     ImportError,
                     match=
                     r"display_plot_mpl method requires matplotlib installed.",
             ):
                 display.display_plot_mpl(viz)
     finally:
         # NOTE: must reload display again after patch exit; `finally` ensures
         # the module is restored even if the assertion above fails, so other
         # tests never see the matplotlib-less version of `display`
         reload(display)
Esempio n. 9
0
    def test_return_types_mpl(self):
        """display_plot_mpl should return (Figure, Axes), or (None, Axes) when given an axis"""
        viz = plot_structure(StructureModel([("a", "b")]), prog="neato")

        # no axis supplied: a figure comes back with the axes
        result = display.display_plot_mpl(viz)
        assert isinstance(result, tuple)
        assert isinstance(result[0], Figure)
        assert isinstance(result[1], Axes)

        # axis supplied by the caller: the first element is None
        _, supplied_ax = plt.subplots()
        result = display.display_plot_mpl(viz, ax=supplied_ax)
        assert isinstance(result, tuple)
        assert result[0] is None
        assert isinstance(result[1], Axes)
Esempio n. 10
0
 def test_baseline_probability_probit(self, graph, distribution):
     """Mean of the binary data should be close to 50% when no intercept is used"""
     # the `graph` argument is not used; a fresh single-node model replaces it
     single_node = StructureModel()
     single_node.add_nodes_from(["A"])
     data = generate_binary_data(
         single_node,
         1000000,
         distribution=distribution,
         noise_scale=0.1,
         seed=10,
         intercept=False,
     )
     mean_prob = data[:, 0].mean()
     assert 0.45 < mean_prob < 0.55
Esempio n. 11
0
    def test_fit_with_missing_feature_in_data(self):
        """fit_node_states should raise a KeyError when a node has no column in the data"""
        expected_msg = (
            "The data does not cover all the features found in the Bayesian Network. "
            "Please check the following features: {'e'}"
        )

        cg = StructureModel()
        # node "e" has no matching column in the frame below
        cg.add_weighted_edges_from([("a", "e", 1)])

        with pytest.raises(KeyError, match=expected_msg):
            BayesianNetwork(cg).fit_node_states(
                pd.DataFrame([[1, 1, 1, 1]], columns=["a", "b", "c", "d"])
            )
Esempio n. 12
0
 def test_intercept_probability_logit(self, graph, distribution):
     """Mean of the binary data should move away from 50% when an intercept is used"""
     # the `graph` argument is not used; a fresh single-node model replaces it
     single_node = StructureModel()
     single_node.add_nodes_from(["A"])
     data = generate_binary_data(
         single_node,
         1000000,
         distribution=distribution,
         noise_scale=0.1,
         seed=10,
         intercept=True,
     )
     centred = np.isclose(data[:, 0].mean(), 0.5, atol=0.05)
     assert not centred
Esempio n. 13
0
 def test_node_positions_respected(self, input_positions,
                                   expected_positions):
     """Plotted node coordinates should equal the expected positions"""
     sm = StructureModel([("a", "b")])
     _, ax, _ = plot_structure(sm, node_positions=input_positions)
     # coordinates are sorted before being paired with the expectation
     drawn = sorted(
         [list(xy) for xy in ax.collections[0].get_offsets()])
     matches = [
         node_x == exp_x and node_y == exp_y
         for (exp_x, exp_y), (node_x, node_y) in zip(expected_positions, drawn)
     ]
     assert all(matches)
Esempio n. 14
0
    def test_create_inference_with_bad_variable_names_fails(
            self, train_model, train_data_idx):
        """InferenceEngine should raise a ValueError for a network with a '$a' node"""

        def dollar_prefixed(node):
            # every "a" in the node label becomes "$a"
            return str(node).replace("a", "$a")

        model = StructureModel()
        model.add_edges_from(
            [(dollar_prefixed(u), dollar_prefixed(v))
             for u, v in train_model.edges])

        # keep the data columns in step with the renamed node (in place)
        train_data_idx.rename(columns={"a": "$a"}, inplace=True)

        bn = BayesianNetwork(model).fit_node_states(train_data_idx)
        bn.fit_cpds(train_data_idx)

        with pytest.raises(ValueError, match="Variable names must match.*"):
            InferenceEngine(bn)
Esempio n. 15
0
 def test_intercept_probability(self, graph, distribution, n_categories):
     """Category frequencies should deviate from uniform when an intercept is used"""
     # the `graph` argument is not used; a fresh single-node model replaces it
     single_node = StructureModel()
     single_node.add_nodes_from(["A"])
     data = generate_categorical_dataframe(
         single_node,
         1000000,
         distribution=distribution,
         n_categories=n_categories,
         noise_scale=0.1,
         seed=10,
         intercept=True,
     )
     category_freq = data.mean(axis=0)
     uniform = 1 / n_categories
     assert not np.allclose(category_freq, uniform, atol=0.01, rtol=0)
    def test_equal_weights(self):
        """Edges whose magnitude equals the threshold should be kept"""

        above = [(1, 2, 1.0), (1, 5, 2.0)]
        at_threshold = [(1, 3, 0.6), (2, 3, 0.6)]
        below = [(1, 4, 0.4), (3, 5, 0.5)]

        sm = StructureModel()
        for batch in (above, at_threshold, below):
            sm.add_weighted_edges_from(batch)

        sm.remove_edges_below_threshold(0.6)

        expected = set(above) | set(at_threshold)
        assert set(sm.edges(data="weight")) == expected
Esempio n. 17
0
    def test_different_origins_and_weights(self):
        """The target subgraph should keep the origin and weight of each original edge"""

        edges_with_origin = (
            ((1, 2, 2.0), "unknown"),
            ((1, 3, 1.0), "learned"),
            ((5, 6, 0.7), "expert"),
        )
        sm = StructureModel()
        for edge, origin in edges_with_origin:
            sm.add_weighted_edges_from([edge], origin=origin)

        subgraph = sm.get_target_subgraph(2)

        # the (5, 6) edge is expected to be absent from the subgraph of node 2
        origins = set(subgraph.edges.data("origin"))
        weights = set(subgraph.edges.data("weight"))
        assert origins == {
            (1, 2, "unknown"),
            (1, 3, "learned"),
        }
        assert weights == {(1, 2, 2.0), (1, 3, 1.0)}
Esempio n. 18
0
    def test_all_edge_attributes(self):
        """all_edge_attributes should set the given attribute on every edge"""

        def edge_colors(graph):
            return [graph.get_edge(u, v).attr["color"] for u, v in graph.edges()]

        sm = StructureModel([("a", "b"), ("b", "c")])
        test_color = "black"

        # default plot: no edge uses the test colour
        a_graph = plot_structure(sm)
        default_color = a_graph.get_edge("a", "b").attr["color"]
        assert default_color != test_color
        assert all(color != test_color for color in edge_colors(a_graph))

        # overridden plot: every edge uses the test colour
        a_graph = plot_structure(sm, all_edge_attributes={"color": test_color})
        assert all(color == test_color for color in edge_colors(a_graph))
Esempio n. 19
0
    def test_fit_missing_states(self):
        """test issues/15: should be possible to fit with missing states"""

        columns = ["a", "b", "c"]
        train = pd.DataFrame(data=[[0, 0, 1], [1, 0, 1], [1, 1, 1]],
                             columns=columns)
        # only this frame contains the state c == 2
        test = pd.DataFrame(data=[[0, 0, 1], [1, 0, 1], [1, 1, 2]],
                            columns=columns)

        bn = BayesianNetwork(StructureModel([("a", "b"), ("c", "b")]))

        # states come from train + test, CPDs from train only
        bn.fit_node_states(pd.concat([train, test]))
        bn.fit_cpds(train, method="BayesianEstimator", bayes_prior="K2")

        cpd_c = bn.cpds["c"]
        assert cpd_c.loc[1][0] == 0.8
        assert cpd_c.loc[2][0] == 0.2
Esempio n. 20
0
    def test_get_structure(self):
        """The structure read back from the network should match the one passed in"""

        edges_with_origin = (
            ((1, 2, 2.0), "unknown"),
            ((1, 3, 1.0), "learned"),
            ((3, 5, 0.7), "expert"),
        )
        sm = StructureModel()
        for edge, origin in edges_with_origin:
            sm.add_weighted_edges_from([edge], origin=origin)

        sm_from_bn = BayesianNetwork(sm).structure

        # both edge attributes and the node set must be unchanged
        for attribute in ("origin", "weight"):
            assert set(sm.edges.data(attribute)) == set(
                sm_from_bn.edges.data(attribute))

        assert set(sm.nodes) == set(sm_from_bn.nodes)
Esempio n. 21
0
    def test_all_node_attributes(self):
        """all_node_attributes should set the given attribute on every node"""

        def node_colors(graph):
            return [graph.get_node(node).attr["color"] for node in graph.nodes()]

        sm = StructureModel([("a", "b")])
        test_color = "black"

        # default plot: no node uses the test colour
        a_graph = plot_structure(sm)
        default_color = a_graph.get_node("a").attr["color"]
        assert default_color != test_color
        assert all(color != test_color for color in node_colors(a_graph))

        # overridden plot: every node uses the test colour
        a_graph = plot_structure(sm, all_node_attributes={"color": test_color})
        assert all(color == test_color for color in node_colors(a_graph))
Esempio n. 22
0
 def test_baseline_probability(self, graph, distribution, n_categories):
     """Category frequencies should be close to uniform when no intercept is used"""
     # the `graph` argument is not used; a fresh single-node model replaces it
     single_node = StructureModel()
     single_node.add_nodes_from(["A"])
     data = generate_categorical_dataframe(
         single_node,
         10000,
         distribution=distribution,
         n_categories=n_categories,
         noise_scale=1.0,
         seed=10,
         intercept=False,
     )
     # without intercept, the probabilities should be fairly uniform
     category_freq = data.mean(axis=0)
     uniform = 1 / n_categories
     assert np.allclose(category_freq, uniform, atol=0.01, rtol=0)
Esempio n. 23
0
    def test_node_attriibutes(self):
        """node_attributes should change only the nodes it names"""

        sm = StructureModel([("a", "b"), ("b", "c")])
        test_color = "black"

        # default plot: every node shares the default colour
        a_graph = plot_structure(sm)
        default_color = a_graph.get_node("a").attr["color"]
        assert default_color != test_color
        default_plot_colors = [
            a_graph.get_node(node).attr["color"] for node in a_graph.nodes()
        ]
        assert all(color == default_color for color in default_plot_colors)

        # override node "a" only: the other nodes keep the default colour
        overrides = {"a": {"color": test_color}}
        a_graph = plot_structure(sm, node_attributes=overrides)
        other_colors = [
            a_graph.get_node(node).attr["color"]
            for node in a_graph.nodes() if node != "a"
        ]
        assert all(color == default_color for color in other_colors)
        assert a_graph.get_node("a").attr["color"] == test_color
Esempio n. 24
0
    def test_intercept(self, distribution, noise_scale):
        """Binary data generated with and without an intercept should differ in mean"""
        graph = StructureModel()
        graph.add_node("123")

        # same seed and settings for both runs; only `intercept` differs
        samples = {}
        for use_intercept in (False, True):
            samples[use_intercept] = generate_binary_data(
                graph,
                100000,
                distribution,
                noise_scale=noise_scale,
                seed=10,
                intercept=use_intercept,
            )

        mean_noint = samples[False][:, 0].mean()
        mean_intercept = samples[True][:, 0].mean()
        assert not np.isclose(mean_noint, mean_intercept)
Esempio n. 25
0
    def test_empty(self):
        """get_largest_subgraph should return None for an empty structure model"""

        largest = StructureModel().get_largest_subgraph()
        assert largest is None
Esempio n. 26
0
    def test_init_with_origin(self):
        """An origin passed to the constructor should be stored on the edges"""

        sm = StructureModel([(1, 2)], origin="learned")
        edge_origins = sm.edges.data("origin")
        assert (1, 2, "learned") in edge_origins
Esempio n. 27
0
    def test_init_has_origin(self):
        """Edges passed to the constructor without an origin should get origin 'unknown'"""

        sm = StructureModel([(1, 2)])
        assert (1, 2) in sm.edges
        edge_origins = sm.edges.data("origin")
        assert (1, 2, "unknown") in edge_origins
Esempio n. 28
0
def generate_structure(
    num_nodes: int,
    degree: float,
    graph_type: str = "erdos-renyi",
    w_min: float = 0.5,
    w_max: float = 0.5,
) -> StructureModel:
    """Simulate a random weighted DAG with some expected degree.

    Randomness comes from the global ``np.random`` state.

    Args:
        num_nodes: number of nodes
        degree: expected node degree, in + out
        graph_type (str):
            - erdos-renyi: each candidate edge is kept with probability
              degree / (num_nodes - 1)
            - barabasi-albert: each new node links to round(degree / 2) earlier
              nodes drawn from a bag in which nodes repeat once per link received
            - full: constructs a fully-connected graph - degree has no effect
        w_min (float): min absolute weight of an edge in the graph
        w_max (float): max absolute weight of an edge in the graph
    Raises:
        ValueError: if invalid arguments are provided
    Returns:
        weighted DAG
    """

    if num_nodes < 2:
        raise ValueError("DAG must have at least 2 nodes")

    # only magnitudes matter here: signs are drawn at random further down
    w_min = abs(w_min)
    w_max = abs(w_max)

    if w_min > w_max:
        raise ValueError(
            "Absolute minimum weight must be less than or equal to maximum weight: {} > {}"
            .format(w_min, w_max))

    shape = [num_nodes, num_nodes]

    # every branch fills only the strict lower triangle, which keeps the
    # resulting graph acyclic
    if graph_type == "erdos-renyi":
        edge_prob = float(degree) / (num_nodes - 1)
        sampled = (np.random.rand(num_nodes, num_nodes) <
                   edge_prob).astype(float)
        edge_flags = np.tril(sampled, k=-1)

    elif graph_type == "barabasi-albert":
        links_per_node = int(round(degree / 2))
        edge_flags = np.zeros(shape)
        bag = [0]
        for node in range(1, num_nodes):
            targets = np.random.choice(bag, size=links_per_node)
            for target in targets:
                edge_flags[node, target] = 1
            # the new node and its targets go into the bag for later draws
            bag.append(node)
            bag.extend(targets)

    elif graph_type == "full":  # ignore degree
        edge_flags = np.tril(np.ones(shape), k=-1)

    else:
        raise ValueError(
            "Unknown graph type {t}. ".format(t=graph_type) +
            "Available types are ['erdos-renyi', 'barabasi-albert', 'full']")

    # randomly permute edges - required because we limited ourselves to lower diagonal previously
    perm_matrix = np.random.permutation(np.eye(num_nodes, num_nodes))
    relabelled = perm_matrix.T.dot(edge_flags)
    edge_flags = relabelled.dot(perm_matrix)

    # random edge weights between w_min, w_max or between -w_min, -w_max
    edge_weights = np.random.uniform(low=w_min, high=w_max, size=shape)
    negate = np.random.rand(num_nodes, num_nodes) < 0.5
    edge_weights[negate] *= -1

    has_edge = (edge_flags != 0).astype(float)
    adj_matrix = has_edge * edge_weights
    return StructureModel(adj_matrix)
Esempio n. 29
0
# To see the data
#data.to_csv(path_or_buf = dataPath + 'WIKI_USECASES_4_5_fulldata.csv' , sep =',')


data = usecaseData



# %% markdown [markdown]
# ## Step 2: Creating the Network Structure


# %% codecell


carModel: StructureModel = StructureModel()

carModel.add_edges_from([
    (ExertionLevel.var, WorkCapacity.var),
    (ExperienceLevel.var, WorkCapacity.var),
    (TrainingLevel.var, WorkCapacity.var),
    (WorkCapacity.var, AbsenteeismLevel.var),

    (Time.var, WorkCapacity.var),
    (Time.var, AbsenteeismLevel.var),
    (Time.var, ExertionLevel.var),
    (Time.var, ExperienceLevel.var),
    (Time.var, TrainingLevel.var),

    (ProcessType.var, ToolType.var),
    (ToolType.var, InjuryType.var),
Esempio n. 30
0
 def test_all_nodes_included(self, weighted_edges, data):
     """Every fitted node state should correspond to a column of the training data"""
     cg = StructureModel()
     cg.add_weighted_edges_from(weighted_edges)
     bn = BayesianNetwork(cg).fit_node_states(data)
     # collect offending nodes so a failure would show which ones are missing
     missing = [node for node in bn.node_states.keys() if node not in data.columns]
     assert not missing