Example #1
    def test_auc_node_with_no_parents(self):
        """Should be possible to compute auc for state with no parent nodes"""

        train = pd.DataFrame(
            [[a, b, 0] for a in range(0, 2) for b in range(0, 2)
             for _ in range(1)] + [[a, b, 1] for a in range(0, 2)
                                   for b in range(0, 2)
                                   for _ in range(a * 10 + b * 10 + 1000)] +
            [[a, b, 0] for a in range(2, 4) for b in range(2, 4)
             for _ in range(a * 10 + b * 10 + 1000)] + [[a, b, 1]
                                                        for a in range(2, 4)
                                                        for b in range(2, 4)
                                                        for _ in range(1)],
            columns=["a", "b", "c"],
        )

        cg = StructureModel()
        cg.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        bn = BayesianNetwork(cg)
        bn.fit_node_states(train)
        bn.fit_cpds(train)

        _, auc = roc_auc(bn, train, "a")
        assert math.isclose(auc, 0.5, abs_tol=0.01)
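For reference, roc_auc as used throughout these tests returns the ROC curve as a list of (fpr, tpr) points together with the scalar AUC. Below is a minimal self-contained sketch; the import paths (causalnex.structure, causalnex.network, causalnex.evaluation) are assumptions, since the snippets above omit their imports.

# Hedged sketch: import paths are assumed; the API calls mirror the tests above.
import pandas as pd

from causalnex.evaluation import roc_auc
from causalnex.network import BayesianNetwork
from causalnex.structure import StructureModel

# c is deterministic given a and b, so near-perfect separation is expected
train = pd.DataFrame(
    [[a, b, int(a == b)] for a in range(2) for b in range(2) for _ in range(100)],
    columns=["a", "b", "c"],
)

sm = StructureModel()
sm.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

bn = BayesianNetwork(sm)
bn.fit_node_states(train)
bn.fit_cpds(train)

roc, auc = roc_auc(bn, train, "c")  # roc: list of (fpr, tpr) points, auc: float
print(roc[:3], auc)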
Example #2
    def test_auc_with_missing_state_in_test(self):
        """AUC should still be calculated correctly with states missing in test set"""

        # equal class (c) weighting to guarantee high ROC expected
        train = pd.DataFrame(
            [[a, b, 0] for a in range(0, 2) for b in range(0, 2)
             for _ in range(1)] + [[a, b, 1] for a in range(0, 2)
                                   for b in range(0, 2)
                                   for _ in range(a * 10 + b * 10 + 1000)] +
            [[a, b, 0] for a in range(2, 4) for b in range(2, 4)
             for _ in range(a * 10 + b * 10 + 1000)] + [[a, b, 1]
                                                        for a in range(2, 4)
                                                        for b in range(2, 4)
                                                        for _ in range(1)],
            columns=["a", "b", "c"],
        )

        test = train[train["c"] == 1]
        assert len(test["c"].unique()) == 1

        cg = StructureModel()
        cg.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        bn = BayesianNetwork(cg)
        bn.fit_node_states(train)
        bn.fit_cpds(train)

        _, auc = roc_auc(bn, test, "c")
        assert math.isclose(auc, 1, abs_tol=0.01)
Example #3
    def test_roc_of_incorrect_has_fpr_lt_tpr(self):
        """The ROC of incorrect predictions should have FPR < TPR"""

        # regardless of a or b, c=1 is always more likely to varying amounts (to create multiple threshold
        # points in roc curve)
        train = pd.DataFrame(
            [[a, b, 0] for a in range(3) for b in range(3)
             for _ in range(1)] + [[a, b, 1] for a in range(3)
                                   for b in range(3)
                                   for _ in range(a * 1000 + b * 1000 + 1000)],
            columns=["a", "b", "c"],
        )

        cg = StructureModel()
        cg.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        bn = BayesianNetwork(cg)
        bn.fit_node_states(train)
        bn.fit_cpds(train)

        assert np.allclose(bn.cpds["c"].loc[1].values, 1, atol=0.02)

        # in test, c=0 is always more likely (opposite of train)
        test = pd.DataFrame(
            [[a, b, 0] for a in range(3) for b in range(3)
             for _ in range(1000)] + [[a, b, 1] for a in range(3)
                                      for b in range(3) for _ in range(1)],
            columns=["a", "b", "c"],
        )

        roc, _ = roc_auc(bn, test, "c")

        assert len(roc) > 3
        assert all(fpr > tpr for fpr, tpr in roc if tpr not in [0.0, 1.0])
Example #4
    def test_roc_of_accurate_predictions(self):
        """TPR should always be better than FPR for accurate predictions"""

        # equal class (c) weighting to guarantee high ROC expected
        train = pd.DataFrame(
            [[a, b, 0] for a in range(0, 2) for b in range(0, 2)
             for _ in range(10)] + [[a, b, 1] for a in range(0, 2)
                                    for b in range(0, 2)
                                    for _ in range(a * 10 + b * 10 + 1000)] +
            [[a, b, 0] for a in range(2, 4) for b in range(2, 4)
             for _ in range(a * 10 + b * 10 + 1000)] + [[a, b, 1]
                                                        for a in range(2, 4)
                                                        for b in range(2, 4)
                                                        for _ in range(10)],
            columns=["a", "b", "c"],
        )

        cg = StructureModel()
        cg.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        bn = BayesianNetwork(cg)
        bn.fit_node_states(train)
        bn.fit_cpds(train)

        roc, _ = roc_auc(bn, train, "c")
        assert all(tpr > fpr for fpr, tpr in roc if tpr not in [0.0, 1.0])
Example #5
    def test_auc_of_accurate_predictions(self):
        """AUC of accurate predictions should be 1"""

        # equal class (c) weighting to guarantee high ROC expected
        train = pd.DataFrame(
            [[a, b, 0] for a in range(0, 2) for b in range(0, 2)
             for _ in range(1)] + [[a, b, 1] for a in range(0, 2)
                                   for b in range(0, 2)
                                   for _ in range(a * 10 + b * 10 + 1000)] +
            [[a, b, 0] for a in range(2, 4) for b in range(2, 4)
             for _ in range(a * 10 + b * 10 + 1000)] + [[a, b, 1]
                                                        for a in range(2, 4)
                                                        for b in range(2, 4)
                                                        for _ in range(1)],
            columns=["a", "b", "c"],
        )

        cg = StructureModel()
        cg.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        bn = BayesianNetwork(cg)
        bn.fit_node_states(train)
        bn.fit_cpds(train)

        _, auc = roc_auc(bn, train, "c")
        assert math.isclose(auc, 1, abs_tol=0.001)
Example #6
    def test_auc_of_incorrect_close_to_zero(self):
        """The AUC of incorrect predictions should be close to zero"""

        # regardless of a or b, c=1 is always more likely to varying amounts (to create multiple threshold
        # points in roc curve)
        train = pd.DataFrame(
            [[a, b, 0] for a in range(3) for b in range(3)
             for _ in range(1)] + [[a, b, 1] for a in range(3)
                                   for b in range(3)
                                   for _ in range(a * 1000 + b * 1000 + 1000)],
            columns=["a", "b", "c"],
        )

        cg = StructureModel()
        cg.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        bn = BayesianNetwork(cg)
        bn.fit_node_states(train)
        bn.fit_cpds(train)

        assert np.allclose(bn.cpds["c"].loc[1].values, 1, atol=0.02)

        # in test, c=0 is always more likely (opposite of train)
        test = pd.DataFrame(
            [[a, b, 0] for a in range(3) for b in range(3)
             for _ in range(1000)] + [[a, b, 1] for a in range(3)
                                      for b in range(3) for _ in range(1)],
            columns=["a", "b", "c"],
        )

        _, auc = roc_auc(bn, test, "c")

        assert math.isclose(auc, 0, abs_tol=0.001)
Example #7
    def test_roc_of_random_has_unit_gradient(self):
        """The ROC curve for random predictions should be a line from (0,0) to (1,1)"""

        # regardless of a or b, c=1 is always more likely to varying amounts (to create multiple threshold
        # points in roc curve)
        train = pd.DataFrame(
            [[a, b, 0] for a in range(3) for b in range(3)
             for _ in range(1)] + [[a, b, 1] for a in range(3)
                                   for b in range(3)
                                   for _ in range(a * 1000 + b * 1000 + 1000)],
            columns=["a", "b", "c"],
        )

        cg = StructureModel()
        cg.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        bn = BayesianNetwork(cg)
        bn.fit_node_states(train)
        bn.fit_cpds(train)

        assert np.allclose(bn.cpds["c"].loc[1].values, 1, atol=0.02)

        test = pd.DataFrame(
            [[a, b, random.randint(0, 1)] for a in range(3) for b in range(3)
             for _ in range(1000)],
            columns=["a", "b", "c"],
        )

        roc, _ = roc_auc(bn, test, "c")

        assert len(roc) > 3
        assert all(math.isclose(a, b, abs_tol=0.03) for a, b in roc)
Example #8
    def test_auc_for_nonnumeric_features(self):
        """AUC of accurate predictions should be 1 even after remapping numbers to strings"""

        # equal class (c) weighting to guarantee high ROC expected
        train = pd.DataFrame(
            [[a, b, 0] for a in range(0, 2) for b in range(0, 2)
             for _ in range(1)] + [[a, b, 1] for a in range(0, 2)
                                   for b in range(0, 2)
                                   for _ in range(a * 10 + b * 10 + 1000)] +
            [[a, b, 0] for a in range(2, 4) for b in range(2, 4)
             for _ in range(a * 10 + b * 10 + 1000)] + [[a, b, 1]
                                                        for a in range(2, 4)
                                                        for b in range(2, 4)
                                                        for _ in range(1)],
            columns=["a", "b", "c"],
        )

        # remap values in column c
        train["c"] = train["c"].map({0: "f", 1: "g"})

        cg = StructureModel()
        cg.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        bn = BayesianNetwork(cg)
        bn.fit_node_states(train)
        bn.fit_cpds(train)

        _, auc = roc_auc(bn, train, "c")
        assert math.isclose(auc, 1, abs_tol=0.001)
Example #9
    def test_intercept(self, distribution, n_categories):
        graph = StructureModel()
        graph.add_node("A")

        data_noint = generate_categorical_dataframe(
            graph,
            100000,
            distribution,
            noise_scale=0.1,
            n_categories=n_categories,
            seed=10,
            intercept=False,
        )
        data_intercept = generate_categorical_dataframe(
            graph,
            100000,
            distribution,
            noise_scale=0.1,
            n_categories=n_categories,
            seed=10,
            intercept=True,
        )

        assert np.all(~np.isclose(data_intercept.mean(axis=0),
                                  data_noint.mean(axis=0),
                                  atol=0.05,
                                  rtol=0))
Example #10
    def test_number_of_nodes(self, num_nodes):
        """ Length of each row in generated data equals num_nodes """
        graph = StructureModel()
        edges = [(n, n + 1, 1) for n in range(num_nodes - 1)]
        graph.add_weighted_edges_from(edges)

        data = generate_binary_data(graph, 100, seed=10)
        assert all(len(sample) == num_nodes for sample in data)
Example #11
    def test_number_of_columns(self, num_nodes, n_categories):
        """ Length of dataframe is in the correct shape"""
        graph = StructureModel()
        edges = [(n, n + 1, 1) for n in range(num_nodes - 1)]
        graph.add_weighted_edges_from(edges)

        data = generate_categorical_dataframe(graph,
                                              100,
                                              seed=10,
                                              n_categories=n_categories)
        assert data.shape[1] == (num_nodes * n_categories)
Example #12
    def test_baseline_probability_probit(self, graph, distribution):
        """ Test whether the probability is centered around 50% if no intercept is given"""
        graph = StructureModel()
        graph.add_nodes_from(["A"])
        data = generate_binary_data(
            graph,
            1000000,
            distribution=distribution,
            noise_scale=0.1,
            seed=10,
            intercept=False,
        )
        assert 0.45 < data[:, 0].mean() < 0.55
Example #13
    def test_intercept_probability_logit(self, graph, distribution):
        """ Test whether probability is not centered around 50% when using an intercept"""
        graph = StructureModel()
        graph.add_nodes_from(["A"])
        data = generate_binary_data(
            graph,
            1000000,
            distribution=distribution,
            noise_scale=0.1,
            seed=10,
            intercept=True,
        )
        mean_prob = data[:, 0].mean()
        assert not np.isclose(mean_prob, 0.5, atol=0.05)
Example #14
    def test_intercept_probability(self, graph, distribution, n_categories):
        """ Test whether probabilities deviate from the uniform 1 / n_categories baseline when using an intercept"""
        graph = StructureModel()
        graph.add_nodes_from(["A"])
        data = generate_categorical_dataframe(
            graph,
            1000000,
            distribution=distribution,
            n_categories=n_categories,
            noise_scale=0.1,
            seed=10,
            intercept=True,
        )
        assert not np.allclose(
            data.mean(axis=0), 1 / n_categories, atol=0.01, rtol=0)
Example #15
    def test_baseline_probability(self, graph, distribution, n_categories):
        """ Test whether probabilities are close to the uniform 1 / n_categories baseline if no intercept is given"""
        graph = StructureModel()
        graph.add_nodes_from(["A"])
        data = generate_categorical_dataframe(
            graph,
            10000,
            distribution=distribution,
            n_categories=n_categories,
            noise_scale=1.0,
            seed=10,
            intercept=False,
        )
        # without intercept, the probabilities should be fairly uniform
        assert np.allclose(data.mean(axis=0),
                           1 / n_categories,
                           atol=0.01,
                           rtol=0)
Example #16
    def test_intercept(self, distribution):
        graph = StructureModel()
        graph.add_node("123")

        data_noint = generate_binary_data(graph,
                                          100000,
                                          distribution,
                                          noise_scale=0,
                                          seed=10,
                                          intercept=False)
        data_intercept = generate_binary_data(graph,
                                              100000,
                                              distribution,
                                              noise_scale=0,
                                              seed=10,
                                              intercept=True)
        assert not np.isclose(data_noint[:, 0].mean(),
                              data_intercept[:, 0].mean())
Example #17
    def test_intercept(self, distribution):
        graph = StructureModel()
        graph.add_node("123")

        data_noint = generate_continuous_data(
            graph,
            n_samples=100000,
            distribution=distribution,
            noise_scale=0,
            seed=10,
            intercept=False,
        )
        data_intercept = generate_continuous_data(
            graph,
            n_samples=100000,
            distribution=distribution,
            noise_scale=0,
            seed=10,
            intercept=True,
        )
        assert not np.isclose(data_noint[:, 0].mean(),
                              data_intercept[:, 0].mean())
        assert np.isclose(data_noint[:, 0].std(), data_intercept[:, 0].std())
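The generator tests above consume numpy outputs (generate_binary_data, generate_continuous_data) and dataframe outputs (generate_categorical_dataframe). Below is a brief sketch of inspecting the shapes they produce; the causalnex.structure.data_generators import path is an assumption, since the snippets omit their imports.

# Hedged sketch: module path assumed; shapes follow the assertions in the tests above.
from causalnex.structure import StructureModel
from causalnex.structure.data_generators import (
    generate_binary_data,
    generate_categorical_dataframe,
)

sm = StructureModel()
sm.add_weighted_edges_from([("a", "b", 1.0)])

X_bin = generate_binary_data(sm, 100, seed=10)
df_cat = generate_categorical_dataframe(sm, 100, seed=10, n_categories=3)

print(X_bin.shape)    # (100, 2) -- one column per node
print(df_cat.shape)   # (100, 6) -- num_nodes * n_categories one-hot columns
print(list(df_cat.columns))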
Example #18
    def add_to_node(self, sm: StructureModel) -> StructureModel:
        """
        Adds self to a node of a structure model corresponding to
        all indexes in self.idx_group.

        Args:
            sm: The input StructureModel

        Returns:
            Updated StructureModel
        """
        for idx in self.idx_group:
            sm.nodes[idx]["dist_type"] = self
        return sm
Example #19
    def generator(num_nodes, seed, weight=None):
        np.random.seed(seed)

        sm = StructureModel()
        nodes = list("".join(x) for x in product(
            string.ascii_lowercase, string.ascii_lowercase))[:num_nodes]
        np.random.shuffle(nodes)
        sm.add_nodes_from(nodes)

        # one edge:
        sm.add_weighted_edges_from([("aa", "ab", weight)])
        return sm
Example #20
def structToGraph(weightedGraph: StructureModel,
                  nodeColor: Color = LIGHT_BLUE,
                  edgeColor: Color = CHERRY) -> gz.Digraph:
    """Render a StructureModel as a graphviz Digraph, labelling each edge with its weight when present."""

    g = gz.Digraph('G')

    adjacencies: List[Tuple[Name, Dict[Name, OriginAndWeightInfo]]] = list(
        weightedGraph.adjacency())

    for sourceVar, edgeDict in adjacencies:
        edgeList: List[Tuple[Name, OriginAndWeightInfo]] = list(edgeDict.items())

        for endVar, otherInfoDict in edgeList:
            g.attr('node', shape='oval')

            g.node(sourceVar, sourceVar)  # name, label
            g.node(endVar, endVar)  # name, label
            g.node_attr.update(style='filled',
                               gradientangle='90',
                               penwidth='1',
                               fillcolor=nodeColor + ":white",
                               color=edgeColor,
                               fontsize='12',
                               fontpath=ACME_FONT_PATH,
                               fontname=ACME_FONT_NAME)  # + '.otf')

            # Setting weighted edge here (if present)
            if 'weight' in otherInfoDict.keys():
                g.edge(tail_name=sourceVar,
                       head_name=endVar,
                       label=str(otherInfoDict['weight']))
            else:
                g.edge(tail_name=sourceVar, head_name=endVar)

            g.edge_attr.update(color=edgeColor,
                               penwidth='1',
                               fontsize='10',
                               fontpath=PLAY_FONT_NAME,
                               fontname=PLAY_FONT_NAME)

    return g
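A hedged usage sketch for structToGraph: it assumes the function and its colour/font constants (LIGHT_BLUE, CHERRY, the font paths) are already defined in this module, and that the graphviz binaries are installed on the system.

# Hedged sketch: structToGraph and its module-level constants are assumed importable.
from causalnex.structure import StructureModel

sm = StructureModel()
sm.add_weighted_edges_from([("a", "c", 1.0), ("b", "c", 0.5)])

dot = structToGraph(sm)  # returns a graphviz.Digraph with weight-labelled edges
dot.render("example_graph", format="png", cleanup=True)  # writes example_graph.png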
Example #21
    def test_incorrect_weight_dist(self):
        sm = StructureModel()
        nodes = list(str(x) for x in range(6))
        np.random.shuffle(nodes)
        sm.add_nodes_from(nodes)

        sm.add_weighted_edges_from([("0", "1", None), ("2", "4", None)])

        with pytest.raises(ValueError, match="Unknown weight distribution"):
            _ = sem_generator(
                graph=sm,
                schema=None,
                default_type="continuous",
                distributions={"weight": "unknown"},
                noise_std=2.0,
                n_samples=1000,
                intercept=False,
                seed=10,
            )
Example #22
def _learn_structure_lasso(
    X: np.ndarray,
    beta: float,
    bnds,
    max_iter: int = 100,
    h_tol: float = 1e-8,
    w_threshold: float = 0.0,
) -> StructureModel:
    """
    Based on initial implementation at https://github.com/xunzheng/notears
    """
    def _h(w_vec: np.ndarray) -> float:
        """
        Constraint function of the NOTEARS algorithm with lasso regularisation.

        Args:
            w_vec:  weight vector (wpos and wneg).

        Returns:
            float: DAGness of the adjacency matrix (0 == DAG, >0 == cyclic).
        """

        W = w_vec.reshape([d, d])
        return np.trace(slin.expm(W * W)) - d

    def _func(w_vec: np.ndarray) -> float:
        """
        Objective function that the NOTEARS algorithm with lasso regularisation tries to minimise.

        Args:
            w_vec: weight vector (wpos and wneg).

        Returns:
            float: objective.
        """

        w_pos = w_vec[:d**2]
        w_neg = w_vec[d**2:]

        wmat_pos = w_pos.reshape([d, d])
        wmat_neg = w_neg.reshape([d, d])

        wmat = wmat_pos - wmat_neg
        loss = 0.5 / n * np.square(
            np.linalg.norm(X.dot(np.eye(d, d) - wmat), "fro"))
        h_val = _h(wmat)
        return loss + 0.5 * rho * h_val * h_val + alpha * h_val + beta * w_vec.sum()

    def _grad(w_vec: np.ndarray) -> np.ndarray:
        """
        Gradient function used to compute next step in NOTEARS algorithm with lasso regularisation.

        Args:
            w_vec: weight vector (wpos and wneg).

        Returns:
            np.ndarray: gradient vector.
        """

        w_pos = w_vec[:d**2]
        w_neg = w_vec[d**2:]

        grad_vec = np.zeros(2 * d**2)
        wmat_pos = w_pos.reshape([d, d])
        wmat_neg = w_neg.reshape([d, d])

        wmat = wmat_pos - wmat_neg

        loss_grad = -1.0 / n * X.T.dot(X).dot(np.eye(d, d) - wmat)
        exp_hdmrd = slin.expm(wmat * wmat)
        obj_grad = (
            loss_grad +
            (rho * (np.trace(exp_hdmrd) - d) + alpha) * exp_hdmrd.T * wmat * 2)
        lbd_grad = beta * np.ones(d * d)
        grad_vec[:d**2] = obj_grad.flatten() + lbd_grad
        grad_vec[d**2:] = -obj_grad.flatten() + lbd_grad

        return grad_vec

    if X.size == 0:
        raise ValueError("Input data X is empty, cannot learn any structure")
    logging.info(
        "Learning structure using 'NOTEARS' optimisation with lasso regularisation."
    )

    n, d = X.shape
    w_est, w_new = np.zeros(2 * d * d), np.zeros(2 * d * d)
    rho, alpha, h_val, h_new = 1.0, 0.0, np.inf, np.inf
    for n_iter in range(max_iter):
        while rho < 1e20:
            sol = sopt.minimize(_func,
                                w_est,
                                method="L-BFGS-B",
                                jac=_grad,
                                bounds=bnds)
            w_new = sol.x

            h_new = _h(w_new[:d**2].reshape([d, d]) -
                       w_new[d**2:].reshape([d, d]))
            if h_new > 0.25 * h_val:
                rho *= 10
            else:
                break
        w_est, h_val = w_new, h_new
        alpha += rho * h_val
        if h_val <= h_tol:
            break
        if h_val > h_tol and n_iter == max_iter - 1:
            warnings.warn("Failed to converge. Consider increasing max_iter.")

    w_new = w_est[:d**2].reshape([d, d]) - w_est[d**2:].reshape([d, d])
    w_new[np.abs(w_new) < w_threshold] = 0
    return StructureModel(w_new.reshape([d, d]))
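The _h helper above is the NOTEARS acyclicity constraint h(W) = tr(e^(W ∘ W)) − d. The following self-contained check (numpy and scipy only) illustrates that it is zero for a DAG adjacency matrix and strictly positive once a cycle appears:

# Self-contained illustration of the acyclicity constraint used by _h above.
import numpy as np
import scipy.linalg as slin


def h(W: np.ndarray) -> float:
    """trace(expm(W * W)) - d: 0 for a DAG, > 0 when the graph has a cycle."""
    d = W.shape[0]
    return np.trace(slin.expm(W * W)) - d


dag = np.array([[0.0, 1.5], [0.0, 0.0]])      # only a -> b: acyclic
cyclic = np.array([[0.0, 1.5], [0.7, 0.0]])   # a -> b and b -> a: a 2-cycle

print(h(dag))     # ~0.0
print(h(cyclic))  # > 0, and it grows with the cycle weights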
Example #23
def _learn_structure(
    X: np.ndarray,
    bnds,
    max_iter: int = 100,
    h_tol: float = 1e-8,
    w_threshold: float = 0.0,
) -> StructureModel:
    """
    Based on initial implementation at https://github.com/xunzheng/notears
    """
    def _h(w: np.ndarray) -> float:
        """
        Constraint function of the NOTEARS algorithm.

        Args:
            w:  current adjacency matrix.

        Returns:
            float: DAGness of the adjacency matrix (0 == DAG, >0 == cyclic).
        """

        W = w.reshape([d, d])
        return np.trace(slin.expm(W * W)) - d

    def _func(w: np.ndarray) -> float:
        """
        Objective function that the NOTEARS algorithm tries to minimise.

        Args:
            w: current adjacency matrix.

        Returns:
            float: objective.
        """

        W = w.reshape([d, d])
        loss = 0.5 / n * np.square(
            np.linalg.norm(X.dot(np.eye(d, d) - W), "fro"))
        h = _h(W)
        return loss + 0.5 * rho * h * h + alpha * h

    def _grad(w: np.ndarray) -> np.ndarray:
        """
        Gradient function used to compute next step in NOTEARS algorithm.

        Args:
            w: the current adjacency matrix.

        Returns:
            np.ndarray: gradient vector.
        """

        W = w.reshape([d, d])
        loss_grad = -1.0 / n * X.T.dot(X).dot(np.eye(d, d) - W)
        E = slin.expm(W * W)
        obj_grad = loss_grad + (rho * (np.trace(E) - d) + alpha) * E.T * W * 2
        return obj_grad.flatten()

    if X.size == 0:
        raise ValueError("Input data X is empty, cannot learn any structure")
    logging.info("Learning structure using 'NOTEARS' optimisation.")

    # n examples, d properties
    n, d = X.shape
    # initialise matrix to zeros
    w_est, w_new = np.zeros(d * d), np.zeros(d * d)

    # initialise weights and constraints
    rho, alpha, h, h_new = 1.0, 0.0, np.inf, np.inf

    # start optimisation
    for n_iter in range(max_iter):
        while rho < 1e20:
            sol = sopt.minimize(_func,
                                w_est,
                                method="L-BFGS-B",
                                jac=_grad,
                                bounds=bnds)
            w_new = sol.x
            h_new = _h(w_new)
            if h_new > 0.25 * h:
                rho *= 10
            else:
                break
        w_est, h = w_new, h_new
        alpha += rho * h
        if h <= h_tol:
            break
        if h > h_tol and n_iter == max_iter - 1:
            warnings.warn("Failed to converge. Consider increasing max_iter.")

    w_est[np.abs(w_est) <= w_threshold] = 0
    return StructureModel(w_est.reshape([d, d]))
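A minimal sketch of driving _learn_structure directly on synthetic data; in practice it is reached through from_numpy / from_pandas, which build the bounds, so the bound construction here is purely illustrative.

# Hedged sketch: synthetic chain x0 -> x1 -> x2, bounds forbid self-loops only.
import numpy as np

np.random.seed(0)
n, d = 500, 3
x0 = np.random.randn(n)
x1 = 0.8 * x0 + 0.1 * np.random.randn(n)
x2 = 0.6 * x1 + 0.1 * np.random.randn(n)
X = np.column_stack([x0, x1, x2])

# one (lower, upper) bound per entry of the flattened d x d weight matrix;
# (0, 0) on the diagonal forbids self-loops
bnds = [(0, 0) if i == j else (None, None) for i in range(d) for j in range(d)]

sm = _learn_structure(X, bnds, max_iter=100, h_tol=1e-8, w_threshold=0.1)
print(sm.edges(data="weight"))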
Example #24
def from_pandas_lasso(
    X: pd.DataFrame,
    beta: float,
    max_iter: int = 100,
    h_tol: float = 1e-8,
    w_threshold: float = 0.0,
    tabu_edges: List[Tuple[str, str]] = None,
    tabu_parent_nodes: List[str] = None,
    tabu_child_nodes: List[str] = None,
) -> StructureModel:
    """
    Learn the `StructureModel`, the graph structure with lasso regularisation
    describing conditional dependencies between variables in data presented as a pandas dataframe.

    Based on DAGs with NO TEARS.
    @inproceedings{zheng2018dags,
        author = {Zheng, Xun and Aragam, Bryon and Ravikumar, Pradeep and Xing, Eric P.},
        booktitle = {Advances in Neural Information Processing Systems},
        title = {{DAGs with NO TEARS: Continuous Optimization for Structure Learning}},
        year = {2018},
        codebase = {https://github.com/xunzheng/notears}
    }

    Args:
        X: input data.
        beta: Constant that multiplies the lasso term.
        max_iter: max number of dual ascent steps during optimisation.
        h_tol: exit if h(W) < h_tol (as opposed to strict definition of 0).
        w_threshold: fixed threshold for absolute edge weights.
        tabu_edges: list of edges(from, to) not to be included in the graph.
        tabu_parent_nodes: list of nodes banned from being a parent of any other nodes.
        tabu_child_nodes: list of nodes banned from being a child of any other nodes.

    Returns:
         StructureModel: graph of conditional dependencies between data variables.

    Raises:
        ValueError: If X does not contain data.
    """

    data = deepcopy(X)

    non_numeric_cols = data.select_dtypes(exclude="number").columns

    if not non_numeric_cols.empty:
        raise ValueError(
            "All columns must have numeric data. "
            "Consider mapping the following columns to int {non_numeric_cols}".
            format(non_numeric_cols=non_numeric_cols))

    col_idx = {c: i for i, c in enumerate(data.columns)}
    idx_col = {i: c for c, i in col_idx.items()}

    if tabu_edges:
        tabu_edges = [(col_idx[u], col_idx[v]) for u, v in tabu_edges]
    if tabu_parent_nodes:
        tabu_parent_nodes = [col_idx[n] for n in tabu_parent_nodes]
    if tabu_child_nodes:
        tabu_child_nodes = [col_idx[n] for n in tabu_child_nodes]

    g = from_numpy_lasso(
        data.values,
        beta,
        max_iter,
        h_tol,
        w_threshold,
        tabu_edges,
        tabu_parent_nodes,
        tabu_child_nodes,
    )

    sm = StructureModel()
    sm.add_nodes_from(data.columns)
    sm.add_weighted_edges_from(
        [(idx_col[u], idx_col[v], w) for u, v, w in g.edges.data("weight")],
        origin="learned",
    )

    return sm
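A hedged usage sketch for from_pandas_lasso, assuming it is importable from the NOTEARS structure-learning module shown above:

# Hedged sketch: synthetic chain a -> b -> c with a lasso penalty and a tabu constraint.
import numpy as np
import pandas as pd

np.random.seed(0)
n = 1000
a = np.random.randn(n)
b = 0.9 * a + 0.1 * np.random.randn(n)
c = 0.7 * b + 0.1 * np.random.randn(n)
df = pd.DataFrame({"a": a, "b": b, "c": c})

sm = from_pandas_lasso(df, beta=0.1, w_threshold=0.1,
                       tabu_child_nodes=["a"])  # "a" may not be a child of any node
print(sm.edges(data="weight"))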
Example #25
def from_pandas(
    X: pd.DataFrame,
    max_iter: int = 100,
    h_tol: float = 1e-8,
    w_threshold: float = 0.0,
    tabu_edges: List[Tuple[str, str]] = None,
    tabu_parent_nodes: List[str] = None,
    tabu_child_nodes: List[str] = None,
) -> StructureModel:
    """
    Learn the `StructureModel`, the graph structure describing conditional dependencies between variables
    in data presented as a pandas dataframe.

    The optimisation minimises a score function :math:`F(W)` over the graph's
    weighted adjacency matrix, :math:`W`, subject to a constraint function :math:`h(W)`,
    where :math:`h(W) == 0` characterises an acyclic graph.
    :math:`h(W) > 0` is a continuous, differentiable function that encapsulates how acyclic the graph is
    (smaller values mean the graph is closer to acyclic).
    Full details of this approach to structure learning are provided in the publication:

    Based on DAGs with NO TEARS.
    @inproceedings{zheng2018dags,
        author = {Zheng, Xun and Aragam, Bryon and Ravikumar, Pradeep and Xing, Eric P.},
        booktitle = {Advances in Neural Information Processing Systems},
        title = {{DAGs with NO TEARS: Continuous Optimization for Structure Learning}},
        year = {2018},
        codebase = {https://github.com/xunzheng/notears}
    }

    Args:
        X: input data.
        max_iter: max number of dual ascent steps during optimisation.
        h_tol: exit if h(W) < h_tol (as opposed to strict definition of 0).
        w_threshold: fixed threshold for absolute edge weights.
        tabu_edges: list of edges(from, to) not to be included in the graph.
        tabu_parent_nodes: list of nodes banned from being a parent of any other nodes.
        tabu_child_nodes: list of nodes banned from being a child of any other nodes.

    Returns:
         StructureModel: graph of conditional dependencies between data variables.

    Raises:
        ValueError: If X does not contain data.
    """

    data = deepcopy(X)

    non_numeric_cols = data.select_dtypes(exclude="number").columns

    if len(non_numeric_cols) > 0:
        raise ValueError(
            "All columns must have numeric data. "
            "Consider mapping the following columns to int {non_numeric_cols}".
            format(non_numeric_cols=non_numeric_cols))

    col_idx = {c: i for i, c in enumerate(data.columns)}
    idx_col = {i: c for c, i in col_idx.items()}

    if tabu_edges:
        tabu_edges = [(col_idx[u], col_idx[v]) for u, v in tabu_edges]
    if tabu_parent_nodes:
        tabu_parent_nodes = [col_idx[n] for n in tabu_parent_nodes]
    if tabu_child_nodes:
        tabu_child_nodes = [col_idx[n] for n in tabu_child_nodes]

    g = from_numpy(
        data.values,
        max_iter,
        h_tol,
        w_threshold,
        tabu_edges,
        tabu_parent_nodes,
        tabu_child_nodes,
    )

    sm = StructureModel()
    sm.add_nodes_from(data.columns)
    sm.add_weighted_edges_from(
        [(idx_col[u], idx_col[v], w) for u, v, w in g.edges.data("weight")],
        origin="learned",
    )

    return sm
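A hedged usage sketch for from_pandas, showing how tabu_edges keeps a forbidden edge out of the learned graph (import path assumed, as above):

# Hedged sketch: synthetic chain x -> y -> z, with the reverse edge z -> x forbidden.
import numpy as np
import pandas as pd

np.random.seed(1)
n = 1000
x = np.random.randn(n)
y = 1.2 * x + 0.1 * np.random.randn(n)
z = -0.8 * y + 0.1 * np.random.randn(n)
df = pd.DataFrame({"x": x, "y": y, "z": z})

sm = from_pandas(df, w_threshold=0.2, tabu_edges=[("z", "x")])
assert ("z", "x") not in sm.edges
print(sm.edges(data="weight"))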
Example #26
def from_numpy(X: np.ndarray,
               dist_type_schema: Dict[int, str] = None,
               lasso_beta: float = 0.0,
               ridge_beta: float = 0.0,
               use_bias: bool = False,
               hidden_layer_units: Iterable[int] = None,
               w_threshold: float = None,
               max_iter: int = 100,
               tabu_edges: List[Tuple[int, int]] = None,
               tabu_parent_nodes: List[int] = None,
               tabu_child_nodes: List[int] = None,
               **kwargs) -> StructureModel:
    """
    Learn the `StructureModel`, the graph structure describing conditional dependencies
    between variables in data presented as a numpy array.

    Based on DAGs with NO TEARS.
    @inproceedings{zheng2018dags,
        author = {Zheng, Xun and Aragam, Bryon and Ravikumar, Pradeep and Xing, Eric P.},
        booktitle = {Advances in Neural Information Processing Systems},
        title = {{DAGs with NO TEARS: Continuous Optimization for Structure Learning}},
        year = {2018},
        codebase = {https://github.com/xunzheng/notears}
    }

    Args:
        X: 2d input data, axis=0 is data rows, axis=1 is data columns. Data must be row oriented.

        dist_type_schema: The dist type schema corresponding to the passed in data X.
        It maps the positional column in X to the string alias of a dist type.
        A list of alias names can be found in ``dist_type/__init__.py``.
        If None, assumes that all data in X is continuous.

        lasso_beta: Constant that multiplies the lasso term (l1 regularisation).
        NOTE when using nonlinearities, the l1 loss only applies to the dag_layer.

        use_bias: Whether to fit a bias parameter in the NOTEARS algorithm.

        ridge_beta: Constant that multiplies the ridge term (l2 regularisation).
        When using nonlinear layers use of this parameter is recommended.

        hidden_layer_units: An iterable whose length determines the number of hidden layers used,
        and whose values determine the number of units in each layer, in order.

        w_threshold: fixed threshold for absolute edge weights.

        max_iter: max number of dual ascent steps during optimisation.

        tabu_edges: list of edges(from, to) not to be included in the graph.

        tabu_parent_nodes: list of nodes banned from being a parent of any other nodes.

        tabu_child_nodes: list of nodes banned from being a child of any other nodes.

        **kwargs: additional arguments for NOTEARS MLP model

    Returns:
        StructureModel: a graph of conditional dependencies between data variables.

    Raises:
        ValueError: If X does not contain data.
        ValueError: If schema does not correspond to columns.
    """
    # n examples, d properties
    if not X.size:
        raise ValueError("Input data X is empty, cannot learn any structure")
    logging.info("Learning structure using 'NOTEARS' optimisation.")
    # Check array for NaN or inf values
    check_array(X)

    if dist_type_schema is not None:

        # make sure that there is one provided key per column
        if set(range(X.shape[1])).symmetric_difference(
                set(dist_type_schema.keys())):
            raise ValueError(
                "Difference indices and expected indices. Got {} schema".
                format(dist_type_schema))

    # if dist_type_schema is None, assume all columns are continuous, else init the alias mapped object
    dist_types = (
        [DistTypeContinuous(idx=idx)
         for idx in np.arange(X.shape[1])] if dist_type_schema is None else [
             dist_type_aliases[alias](idx=idx)
             for idx, alias in dist_type_schema.items()
         ])

    # shape of X before preprocessing
    _, d_orig = X.shape
    # perform dist type pre-processing (i.e. column expansion)
    for dist_type in dist_types:
        # NOTE: preprocess_X must be called first to perform possible column expansions
        X = dist_type.preprocess_X(X)
        tabu_edges = dist_type.preprocess_tabu_edges(tabu_edges)
        tabu_parent_nodes = dist_type.preprocess_tabu_nodes(tabu_parent_nodes)
        tabu_child_nodes = dist_type.preprocess_tabu_nodes(tabu_child_nodes)
    # shape of X after preprocessing
    _, d = X.shape

    # if None or empty, convert into a list with single item
    if hidden_layer_units is None:
        hidden_layer_units = [0]
    elif isinstance(hidden_layer_units, list) and not hidden_layer_units:
        hidden_layer_units = [0]

    # if no hidden layer units, still take 1 iteration step with bounds
    hidden_layer_bnds = hidden_layer_units[0] if hidden_layer_units[0] else 1

    # Flip i and j because Pytorch flattens the vector in another direction
    bnds = [
        (0, 0) if i == j else
        (0, 0) if tabu_edges is not None and (i, j) in tabu_edges else
        (0, 0) if tabu_parent_nodes is not None and i in tabu_parent_nodes else
        (0, 0) if tabu_child_nodes is not None and j in tabu_child_nodes else
        (None, None) for j in range(d) for _ in range(hidden_layer_bnds)
        for i in range(d)
    ]

    model = NotearsMLP(n_features=d,
                       dist_types=dist_types,
                       hidden_layer_units=hidden_layer_units,
                       lasso_beta=lasso_beta,
                       ridge_beta=ridge_beta,
                       bounds=bnds,
                       use_bias=use_bias,
                       **kwargs)

    model.fit(X, max_iter=max_iter)
    sm = StructureModel(model.adj)
    if w_threshold:
        sm.remove_edges_below_threshold(w_threshold)

    # extract the mean effect and add as edge attribute
    mean_effect = model.adj_mean_effect
    for u, v, edge_dict in sm.edges.data(True):
        sm.add_edge(
            u,
            v,
            origin="learned",
            weight=edge_dict["weight"],
            mean_effect=mean_effect[u, v],
        )

    # set bias as node attribute
    bias = model.bias
    for node in sm.nodes():
        value = None
        if bias is not None:
            value = bias[node]
        sm.nodes[node]["bias"] = value

    # attach each dist_type object to corresponding node(s)
    for dist_type in dist_types:
        sm = dist_type.add_to_node(sm)

    # preserve the structure_learner as a graph attribute
    sm.graph["structure_learner"] = model

    # collapse the adj down and store as graph attr
    adj = deepcopy(model.adj)
    for dist_type in dist_types:
        adj = dist_type.collapse_adj(adj)
    sm.graph["graph_collapsed"] = StructureModel(adj[:d_orig, :d_orig])

    return sm
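A hedged usage sketch for this pytorch-based from_numpy. The dist type alias strings "cont" and "bin" are assumptions; as the docstring notes, the authoritative alias names live in ``dist_type/__init__.py``.

# Hedged sketch: continuous parent, binary child, schema keyed by column position.
import numpy as np

np.random.seed(0)
n = 500
x0 = np.random.randn(n)                                  # continuous
x1 = (x0 + 0.5 * np.random.randn(n) > 0).astype(float)   # binary child of x0
X = np.column_stack([x0, x1])

sm = from_numpy(
    X,
    dist_type_schema={0: "cont", 1: "bin"},  # one alias per column, by position
    lasso_beta=0.01,
    w_threshold=0.1,
    max_iter=100,
)
print(sm.edges(data="weight"))
print(sm.graph["graph_collapsed"].edges)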
Example #27
def from_pandas(X: pd.DataFrame,
                dist_type_schema: Dict[Union[str, int], str] = None,
                lasso_beta: float = 0.0,
                ridge_beta: float = 0.0,
                use_bias: bool = False,
                hidden_layer_units: Iterable[int] = None,
                max_iter: int = 100,
                w_threshold: float = None,
                tabu_edges: List[Tuple[str, str]] = None,
                tabu_parent_nodes: List[str] = None,
                tabu_child_nodes: List[str] = None,
                **kwargs) -> StructureModel:
    """
    Learn the `StructureModel`, the graph structure describing conditional dependencies between variables
    in data presented as a pandas dataframe.

    The optimisation minimises a score function :math:`F(W)` over the graph's
    weighted adjacency matrix, :math:`W`, subject to a constraint function :math:`h(W)`,
    where :math:`h(W) == 0` characterises an acyclic graph.
    :math:`h(W) > 0` is a continuous, differentiable function that encapsulates how acyclic the graph is
    (smaller values mean the graph is closer to acyclic).
    Full details of this approach to structure learning are provided in the publication:

    Based on DAGs with NO TEARS.
    @inproceedings{zheng2018dags,
        author = {Zheng, Xun and Aragam, Bryon and Ravikumar, Pradeep and Xing, Eric P.},
        booktitle = {Advances in Neural Information Processing Systems},
        title = {{DAGs with NO TEARS: Continuous Optimization for Structure Learning}},
        year = {2018},
        codebase = {https://github.com/xunzheng/notears}
    }

    Args:
        X: 2d input data, axis=0 is data rows, axis=1 is data columns. Data must be row oriented.

        dist_type_schema: The dist type schema corresponding to the passed in data X.
        It maps the pandas column name in X to the string alias of a dist type.
        A list of alias names can be found in ``dist_type/__init__.py``.
        If None, assumes that all data in X is continuous.

        lasso_beta: Constant that multiplies the lasso term (l1 regularisation).
        NOTE when using nonlinearities, the l1 loss only applies to the dag_layer.

        use_bias: Whether to fit a bias parameter in the NOTEARS algorithm.

        ridge_beta: Constant that multiplies the ridge term (l2 regularisation).
        When using nonlinear layers use of this parameter is recommended.

        hidden_layer_units: An iterable whose length determines the number of hidden layers used,
        and whose values determine the number of units in each layer, in order.

        w_threshold: fixed threshold for absolute edge weights.

        max_iter: max number of dual ascent steps during optimisation.

        tabu_edges: list of edges(from, to) not to be included in the graph.

        tabu_parent_nodes: list of nodes banned from being a parent of any other nodes.

        tabu_child_nodes: list of nodes banned from being a child of any other nodes.

        **kwargs: additional arguments for NOTEARS MLP model

    Returns:
         StructureModel: graph of conditional dependencies between data variables.

    Raises:
        ValueError: If X does not contain data.
    """

    data = deepcopy(X)

    # if dist_type_schema is not None, convert dist_type_schema from cols to idx
    dist_type_schema = (dist_type_schema if dist_type_schema is None else {
        X.columns.get_loc(col): alias
        for col, alias in dist_type_schema.items()
    })

    non_numeric_cols = data.select_dtypes(exclude="number").columns

    if len(non_numeric_cols) > 0:
        raise ValueError(
            "All columns must have numeric data. "
            "Consider mapping the following columns to int {non_numeric_cols}".
            format(non_numeric_cols=non_numeric_cols))

    col_idx = {c: i for i, c in enumerate(data.columns)}
    idx_col = {i: c for c, i in col_idx.items()}

    if tabu_edges:
        tabu_edges = [(col_idx[u], col_idx[v]) for u, v in tabu_edges]
    if tabu_parent_nodes:
        tabu_parent_nodes = [col_idx[n] for n in tabu_parent_nodes]
    if tabu_child_nodes:
        tabu_child_nodes = [col_idx[n] for n in tabu_child_nodes]

    g = from_numpy(X=data.values,
                   dist_type_schema=dist_type_schema,
                   lasso_beta=lasso_beta,
                   ridge_beta=ridge_beta,
                   use_bias=use_bias,
                   hidden_layer_units=hidden_layer_units,
                   w_threshold=w_threshold,
                   max_iter=max_iter,
                   tabu_edges=tabu_edges,
                   tabu_parent_nodes=tabu_parent_nodes,
                   tabu_child_nodes=tabu_child_nodes,
                   **kwargs)

    # set comprehension to ensure only unique dist types are extracted
    # NOTE: this prevents double-renaming caused by the same dist type used on expanded columns
    unique_dist_types = {node[1]["dist_type"] for node in g.nodes(data=True)}
    # use the dist types to update the idx_col mapping
    idx_col_expanded = deepcopy(idx_col)
    for dist_type in unique_dist_types:
        idx_col_expanded = dist_type.update_idx_col(idx_col_expanded)

    sm = StructureModel()
    # add expanded set of nodes
    sm.add_nodes_from(list(idx_col_expanded.values()))

    # recover the edge weights from g
    for u, v, edge_dict in g.edges.data(True):
        sm.add_edge(
            idx_col_expanded[u],
            idx_col_expanded[v],
            origin="learned",
            weight=edge_dict["weight"],
            mean_effect=edge_dict["mean_effect"],
        )

    # retrieve all graphs attrs
    for key, val in g.graph.items():
        sm.graph[key] = val

    # recover the node biases from g
    for node in g.nodes(data=True):
        node_name = idx_col_expanded[node[0]]
        sm.nodes[node_name]["bias"] = node[1]["bias"]

    # recover and preserve the node dist_types
    for node_data in g.nodes(data=True):
        node_name = idx_col_expanded[node_data[0]]
        sm.nodes[node_name]["dist_type"] = node_data[1]["dist_type"]

    # recover the collapsed model from g
    sm_collapsed = StructureModel()
    sm_collapsed.add_nodes_from(list(idx_col.values()))
    for u, v, edge_dict in g.graph["graph_collapsed"].edges.data(True):
        sm_collapsed.add_edge(
            idx_col[u],
            idx_col[v],
            origin="learned",
            weight=edge_dict["weight"],
        )
    sm.graph["graph_collapsed"] = sm_collapsed

    return sm
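A hedged usage sketch for the pytorch-based from_pandas, with the schema keyed by column name; the "cont"/"bin" aliases and the import path are assumptions, as before.

# Hedged sketch: schema keys are pandas column names rather than positions.
import numpy as np
import pandas as pd

np.random.seed(0)
n = 500
age = np.random.randn(n)
smoker = (age + 0.5 * np.random.randn(n) > 0).astype(int)
df = pd.DataFrame({"age": age, "smoker": smoker})

sm = from_pandas(
    df,
    dist_type_schema={"age": "cont", "smoker": "bin"},
    w_threshold=0.1,
)
print(sm.edges(data=True))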
Example #28
def _generate_inter_structure(
    num_nodes: int,
    p: int,
    degree: float,
    graph_type: str,
    w_min: float,
    w_max: float,
    w_decay: float = 1.0,
    neg: float = 0.5,
) -> StructureModel:
    """Simulate random DAG between two time slices.

    Args:
        num_nodes: number of nodes per slice
        p: number of slices that influence current slice
        degree: expected in-degree of current time slice
        graph_type: {'erdos-renyi' 'full'}
        w_min: minimum weight for inter-slice nodes
        w_max: maximum weight for inter-slice nodes
        w_decay: exponent of weights decay for slices that are farther apart. Default is 1.0, which implies no decay
        neg: the proportion of edge weights expected to be negative. By default, 50% of the edges are expected
            to be negative weight (`neg == 0.5`).

    Returns:
        G_inter: weighted, bipartite DAG for inter-slice connections

    Raises:
        ValueError: if graph type not known
    """
    if w_min > w_max:
        raise ValueError(
            "Absolute minimum weight must be less than or equal to maximum weight: "
            f"{w_min} > {w_max}")

    if graph_type == "erdos-renyi":
        prob = degree / num_nodes
        b = (np.random.rand(p * num_nodes, num_nodes) < prob).astype(float)
    elif graph_type == "full":  # ignore degree, only for experimental use
        b = np.ones([p * num_nodes, num_nodes])
    else:
        raise ValueError(f"Unknown inter-slice graph type `{graph_type}`. "
                         "Valid types are 'erdos-renyi' and 'full'")
    u = []
    for i in range(p):
        u_i = np.random.uniform(
            low=w_min, high=w_max, size=[num_nodes, num_nodes]) / (w_decay**i)
        u_i[np.random.rand(num_nodes, num_nodes) < neg] *= -1
        u.append(u_i)

    u = np.concatenate(u, axis=0) if u else np.empty(b.shape)
    a = (b != 0).astype(float) * u

    df = pd.DataFrame(
        a,
        index=[
            f"{var}_lag{l_val}" for l_val in range(1, p + 1)
            for var in range(num_nodes)
        ],
        columns=[f"{var}_lag0" for var in range(num_nodes)],
    )
    idxs, cols = list(df.index), list(df.columns)
    # pad the adjacency into a square matrix: add zero columns for the lagged
    # nodes and zero rows for the lag-0 nodes so every node appears on both axes
    for i in idxs:
        df[i] = 0
    for i in cols:
        df.loc[i, :] = 0

    g_inter = StructureModel(df)
    return g_inter
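A brief sketch of calling _generate_inter_structure directly to inspect the lagged-node naming convention it produces:

# Hedged sketch: three nodes, two lags; edges only run from lagged nodes into lag 0.
import numpy as np

np.random.seed(0)
g_inter = _generate_inter_structure(
    num_nodes=3, p=2, degree=2, graph_type="erdos-renyi",
    w_min=0.5, w_max=1.0, w_decay=1.5,
)
print(sorted(g_inter.nodes))              # '0_lag0', '0_lag1', '0_lag2', '1_lag0', ...
print(list(g_inter.edges(data="weight")))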
Example #29
def generate_structure_dynamic(  # pylint: disable=too-many-arguments
    num_nodes: int,
    p: int,
    degree_intra: float,
    degree_inter: float,
    graph_type_intra: str = "erdos-renyi",
    graph_type_inter: str = "erdos-renyi",
    w_min_intra: float = 0.5,
    w_max_intra: float = 0.5,
    w_min_inter: float = 0.5,
    w_max_inter: float = 0.5,
    w_decay: float = 1.0,
) -> StructureModel:
    """
    Generates a dynamic DAG at random.

    Args:
        num_nodes: Number of nodes
        p: maximum lag to be considered in the structure
        degree_intra: expected degree on nodes from the current state
        degree_inter: expected degree on nodes from the lagged nodes
        graph_type_intra:
            - erdos-renyi: constructs a graph such that the probability of any given edge is degree / (num_nodes - 1)
            - barabasi-albert: constructs a scale-free graph from an initial connected graph of (degree / 2) nodes
            - full: constructs a fully-connected graph - degree has no effect
        graph_type_inter:
            - erdos-renyi: constructs a graph such that the probability of any given edge is degree / (num_nodes - 1)
            - full: connect all past nodes to all present nodes
        w_min_intra: minimum weight for intra-slice nodes
        w_max_intra: maximum weight for intra-slice nodes
        w_min_inter: minimum weight for inter-slice nodes
        w_max_inter: maximum weight for inter-slice nodes
        w_decay: exponent of weights decay for slices that are farther apart. Default is 1.0, which implies no decay

    Raises:
        ValueError: if graph type unknown or `num_nodes < 2`

    Returns:
        StructureModel containing all simulated nodes and edges (intra- and inter-slice)
    """
    sm_intra = generate_structure(
        num_nodes=num_nodes,
        degree=degree_intra,
        graph_type=graph_type_intra,
        w_min=w_min_intra,
        w_max=w_max_intra,
    )
    sm_inter = _generate_inter_structure(
        num_nodes=num_nodes,
        p=p,
        degree=degree_inter,
        graph_type=graph_type_inter,
        w_min=w_min_inter,
        w_max=w_max_inter,
        w_decay=w_decay,
    )
    res = StructureModel()
    res.add_nodes_from(sm_inter.nodes)
    res.add_nodes_from([f"{u}_lag0" for u in sm_intra.nodes])
    res.add_weighted_edges_from(sm_inter.edges.data("weight"))
    res.add_weighted_edges_from([(f"{u}_lag0", f"{v}_lag0", w)
                                 for u, v, w in sm_intra.edges.data("weight")])
    return res
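A hedged usage sketch for generate_structure_dynamic:

# Hedged sketch: 4 nodes observed over the current slice plus 2 lagged slices.
import numpy as np

np.random.seed(0)
sm = generate_structure_dynamic(
    num_nodes=4,
    p=2,
    degree_intra=2,
    degree_inter=2,
    w_min_intra=0.4, w_max_intra=0.8,
    w_min_inter=0.3, w_max_inter=0.6,
    w_decay=1.5,
)
print(len(sm.nodes))                       # 4 nodes x (p + 1) slices = 12
print(list(sm.edges(data="weight"))[:5])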
Example #30
def naive_bayes_plus_parents(
    categories: int = 3,
    samples: int = 500,
    parents: int = 3,
    children: int = 3,
    p_z: float = 0.9,
    p_c: float = 0.9,
    percentage_not_missing: float = 0,
    seed: int = 22,
) -> Tuple[pd.DataFrame, StructureModel, Dict, np.array]:
    """
    p0 ... pn
     \\  |  /
        z
     /  |  \\
    c0 ... cm

    z = mode of parents with probability p_z, otherwise mode of parents + 1 mod n_categories
    c0 = z with prob. p_c, otherwise it is z + 1 mod n_categories
    if no parents are given, z is sampled uniformly from the categories

    Args:
        categories: number of categories
        samples: number of samples
        parents: number of parents, n as shown above
        children: number of children, m as above
        p_z: probability that z = mode(parents)
        p_c: probability that each child equals z
        percentage_not_missing: fraction of the latent variable z that is observed. The default is 0, i.e. z is never observed
        seed: seed for random generator

    Returns:
        data: sampled pandas dataframe, missing data on z
        sm: structure model
        node_states: dictionary of list of states for each node
        true_lv_values: true values of latent variable
    """

    def mode(lst: Iterable) -> Any:
        return Counter(lst).most_common()[0][0] if len(lst) > 0 else np.nan

    np.random.seed(seed)
    par_samples = np.random.choice(categories, size=[samples, parents])

    if parents == 0:
        true_lv_values = np.random.choice(categories, size=[samples, 1])
    else:
        true_lv_values = np.array(
            [
                [(mode(el) + np.random.choice(2, p=[p_z, 1 - p_z])) % categories]
                for el in par_samples
            ]
        )

    child_samples = np.random.random(size=[samples, children])
    aux = true_lv_values.repeat(children, axis=1)
    child_samples = np.where(child_samples < p_c, aux, (aux + 1) % categories)

    df = pd.concat(
        [
            pd.DataFrame(par_samples, columns=[f"p_{i}" for i in range(parents)]),
            pd.DataFrame(child_samples, columns=[f"c_{i}" for i in range(children)]),
            pd.DataFrame(true_lv_values, columns=["z"]),
        ],
        axis=1,
    )
    df.loc[int(samples * percentage_not_missing) :, "z"] = np.nan

    sm = StructureModel()
    sm.add_edges_from([(f"p_{i}", "z") for i in range(parents)])
    sm.add_edges_from([("z", f"c_{i}") for i in range(children)])

    node_states = {"z": list(range(categories))}

    for i in range(parents):
        node_states[f"p_{i}"] = list(range(categories))
    for i in range(children):
        node_states[f"c_{i}"] = list(range(categories))

    return df, sm, node_states, true_lv_values
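A hedged usage sketch for naive_bayes_plus_parents, showing the returned dataframe, structure, and the partially observed latent variable z:

# Hedged sketch: keep z observed for the first 25% of rows, hidden elsewhere.
df, sm, node_states, true_z = naive_bayes_plus_parents(
    categories=3,
    samples=200,
    parents=2,
    children=2,
    percentage_not_missing=0.25,
)
print(sorted(sm.nodes))          # ['c_0', 'c_1', 'p_0', 'p_1', 'z']
print(df["z"].isna().mean())     # ~0.75 of z is missing
print(df.head())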