Esempio n. 1
0
 def test_bad_distribution_type(self):
     """An unsupported ``distribution`` name must be rejected with a ``ValueError``."""
     structure = generate_structure(10, 4, "erdos-renyi")
     bad_call = dict(distribution="invalid", n_samples=10, seed=10)
     # The error message must identify the problem as an unknown distribution.
     with pytest.raises(ValueError, match="Unknown binary distribution"):
         generate_binary_data(structure, **bad_call)
Esempio n. 2
0
    def test_f1score_generated_binary(self):
        """Binary structure learned from generated data should have a good F1 score.

        A structure is generated, binary data is sampled from it, and a new
        structure is fitted on that data with every column declared as "bin".
        The fitted edges are then scored against the generating edges.

        Degenerate outcomes (no true edges, no learned edges, no correct edges)
        are reported through explicit assertions rather than surfacing as a
        ``ZeroDivisionError`` from the precision / recall / F1 arithmetic.
        """
        np.random.seed(10)
        sm = generate_structure(5, 2.0)
        data = generate_binary_data(sm,
                                    1000,
                                    intercept=False,
                                    noise_scale=0.1,
                                    seed=10)

        # Every generated column is treated as binary by the learner.
        dist_type_schema = {i: "bin" for i in range(data.shape[1])}
        sm_fitted = from_numpy(
            data,
            dist_type_schema=dist_type_schema,
            lasso_beta=0.1,
            ridge_beta=0.0,
            w_threshold=0.1,
            use_bias=False,
        )

        right_edges = sm.edges
        n_predictions_made = len(sm_fitted.edges)
        n_correct_predictions = len(set(sm_fitted.edges) & set(right_edges))
        n_relevant_predictions = len(right_edges)

        # Guard every denominator used below so a degenerate fit fails with a
        # readable message instead of a ZeroDivisionError.
        assert n_relevant_predictions > 0, "generated structure has no edges"
        assert n_predictions_made > 0, "no edges were learned"
        assert n_correct_predictions > 0, "none of the learned edges is a true edge"

        precision = n_correct_predictions / n_predictions_made
        recall = n_correct_predictions / n_relevant_predictions
        # precision + recall > 0 is guaranteed by the assertions above.
        f1_score = 2 * (precision * recall) / (precision + recall)

        assert f1_score > 0.8
Esempio n. 3
0
    def test_dataframe(self, graph, distribution, noise_std, intercept, seed,
                       kernel):
        """
        The dataframe wrapper must yield the same values as the array generator
        when both are called with identical arguments.
        """
        # Keyword arguments shared by both generator calls.
        shared_kwargs = dict(
            noise_scale=noise_std,
            seed=seed,
            intercept=intercept,
            kernel=kernel,
        )
        expected = generate_binary_data(graph, 100, distribution,
                                        **shared_kwargs)
        frame = generate_binary_dataframe(graph, 100, distribution,
                                          **shared_kwargs)

        # Select the dataframe columns in the graph's node order before comparing.
        node_columns = list(graph.nodes())
        assert np.array_equal(expected, frame[node_columns].values)
Esempio n. 4
0
    def test_number_of_nodes(self, num_nodes):
        """Every generated row has exactly ``num_nodes`` entries."""
        # Build a chain 0 -> 1 -> ... -> num_nodes - 1 with unit edge weights.
        chain = StructureModel()
        chain.add_weighted_edges_from(
            [(src, src + 1, 1) for src in range(num_nodes - 1)])

        generated = generate_binary_data(chain, 100, seed=10)
        assert not any(len(row) != num_nodes for row in generated)
Esempio n. 5
0
 def test_returns_ndarray(self, distribution):
     """The generator returns a numpy ``ndarray`` for the given ``distribution``."""
     structure = generate_structure(10, 4, "erdos-renyi")
     generated = generate_binary_data(structure,
                                      n_samples=10,
                                      distribution=distribution)
     assert isinstance(generated, np.ndarray)
Esempio n. 6
0
    def test_intercept(self, distribution):
        """Toggling ``intercept`` must change the mean of the generated column.

        Uses a single-node graph with zero noise scale and a fixed seed, so the
        two runs differ only in the ``intercept`` flag.
        """
        lone_node = StructureModel()
        lone_node.add_node("123")

        # Mean of the only column, first without and then with an intercept.
        column_mean = {
            use_intercept: generate_binary_data(
                lone_node,
                100000,
                distribution,
                noise_scale=0,
                seed=10,
                intercept=use_intercept,
            )[:, 0].mean()
            for use_intercept in (False, True)
        }
        assert not np.isclose(column_mean[False], column_mean[True])
Esempio n. 7
0
 def test_baseline_probability_probit(self, graph, distribution):
     """Without an intercept the mean of a single node should be close to 50%."""
     # The ``graph`` argument is not used for generation; a fresh model with a
     # single node "A" is built instead.
     single_node = StructureModel()
     single_node.add_nodes_from(["A"])
     generated = generate_binary_data(
         single_node,
         1000000,
         distribution=distribution,
         noise_scale=0.1,
         seed=10,
         intercept=False,
     )
     observed_mean = generated[:, 0].mean()
     assert 0.45 < observed_mean < 0.55
Esempio n. 8
0
 def test_intercept_probability_logit(self, graph, distribution):
     """With an intercept the mean of a single node should move away from 50%."""
     # The ``graph`` argument is not used for generation; a fresh model with a
     # single node "A" is built instead.
     single_node = StructureModel()
     single_node.add_nodes_from(["A"])
     generated = generate_binary_data(
         single_node,
         1000000,
         distribution=distribution,
         noise_scale=0.1,
         seed=10,
         intercept=True,
     )
     centered = np.isclose(generated[:, 0].mean(), 0.5, atol=0.05)
     assert not centered
Esempio n. 9
0
    def test_order_is_correct(self, graph_gen, num_nodes, seed):
        """
        Check that the columns of the generated data follow the order of
        `sm.nodes`.

        The graph produced by `graph_gen` is expected to contain the nodes
        "aa" and "ab", with "ab" being the only node linked to "aa"
        (NOTE(review): this depends on the `graph_gen` fixture - confirm).
        Column positions are looked up through the node order, so a wrong
        ordering would make the dependence / independence checks below fail.
        """
        sm = graph_gen(num_nodes=num_nodes, seed=seed, weight=None)
        nodes = sm.nodes()
        # Map each node to its expected column index in the generated data.
        column_of = {name: position for position, name in enumerate(sm.nodes())}

        data = generate_binary_data(
            sm,
            n_samples=10000,
            distribution="normal",
            seed=seed,
            noise_scale=0.1,
            intercept=False,
        )
        tol = 0.15
        parent_col = column_of["aa"]

        # Without an intercept the parent's mean probability is expected to be
        # 0.5, where the spread of a binary feature (variance p(1-p)) is at its
        # maximum, so the child should show a smaller standard deviation.
        parent_std = data[:, parent_col].std()
        child_std = data[:, column_of["ab"]].std()
        assert parent_std > child_std

        for node in nodes:
            if node == "aa":
                continue
            joint_proba, factored_proba = calculate_proba(
                data, parent_col, column_of[node])
            looks_independent = np.isclose(
                joint_proba, factored_proba, rtol=tol, atol=0)
            if node == "ab":
                # the only link: joint and factored values must differ
                assert not looks_independent
            else:
                # every other node: joint and factored values must agree
                assert looks_independent
Esempio n. 10
0
 def test_number_of_samples(self, num_samples, graph):
     """The generated data contains exactly ``num_samples`` rows."""
     generated = generate_binary_data(graph, num_samples, "logit", 1, seed=10)
     n_rows = len(generated)
     assert n_rows == num_samples