Esempio n. 1
0
def runSearch(data, algo_var_types):
    """Run an IC search over ``data`` and return the resulting graph's edges.

    An ``IC`` searcher is built around ``RobustRegressionTest`` and its
    ``search`` method is called with ``data`` and ``algo_var_types``.

    :param data: data set handed to ``IC.search`` unchanged.
    :param algo_var_types: variable-type specification handed to
        ``IC.search`` unchanged.
    :return: ``graph.edges(data=True)`` of the graph produced by the search.
    """
    searcher = IC(RobustRegressionTest)
    return searcher.search(data, algo_var_types).edges(data=True)
Esempio n. 2
0
class Test_IC(TestAPI):
    """Tests for the individual stages of the ``IC`` search.

    The fixture data follows x1 -> x2, x1 -> x3, (x2, x3) -> x4, x4 -> x5,
    every variable carrying additive standard-normal noise.

    Edge attributes are read through ``G[u][v]`` and node attributes through
    ``G.nodes(data=True)``: both spellings exist in networkx 1.x and 2.x,
    whereas ``Graph.edge`` / ``Graph.node`` were removed in later releases.
    """

    def setUp(self):
        """Draw the synthetic sample and record the expected structure."""
        x1 = numpy.random.normal(size=TEST_SET_SIZE)
        x2 = x1 + numpy.random.normal(size=TEST_SET_SIZE)
        x3 = x1 + numpy.random.normal(size=TEST_SET_SIZE)
        x4 = x2 + x3 + numpy.random.normal(size=TEST_SET_SIZE)
        x5 = x4 + numpy.random.normal(size=TEST_SET_SIZE)
        self.X = pd.DataFrame({'x1': x1, 'x2': x2, 'x3': x3, 'x4': x4, 'x5': x5})
        self.variable_types = {'x1': 'c', 'x2': 'c', 'x3': 'c', 'x4': 'c', 'x5': 'c'}
        # Expected adjacency of the skeleton, keyed by node.
        self.true_neighbors = {'x1': {'x2', 'x3'},
                               'x2': {'x1', 'x4'},
                               'x3': {'x1', 'x4'},
                               'x4': {'x2', 'x3', 'x5'},
                               'x5': {'x4'}}
        # (tail, head) pairs expected to receive an arrow head at x4.
        self.true_colliders = {('x3', 'x4'), ('x2', 'x4')}
        # Edges expected to end up with ``marked`` set.
        self.true_marked = {('x4', 'x5')}
        self.ic = IC(RobustRegressionTest, self.X, self.variable_types)

    def test_build_g(self):
        """The initial graph has V*(V-1)/2 edges, typed nodes and no marks."""
        self.ic._build_g()
        V = len(self.X.columns)
        # V * (V - 1) is always even, so floor division is exact.
        assert len(self.ic._g.edges()) == V * (V - 1) // 2
        assert set(self.ic._g.nodes()) == set(self.variable_types.keys())
        node_data = dict(self.ic._g.nodes(data=True))
        for node, variable_type in self.variable_types.items():
            assert node_data[node]['type'] == variable_type
        for i, j in self.ic._g.edges():
            assert self.ic._g[i][j]['marked'] == False

    def test_find_skeleton(self):
        """After skeleton search every node has exactly its true neighbors."""
        self.ic._build_g()
        self.ic._find_skeleton()
        for node, neighbors in self.true_neighbors.items():
            assert set(self.ic._g.neighbors(node)) == neighbors

    def test_orient_colliders(self):
        """Arrow heads appear on, and only on, the expected collider edges."""
        self.ic._build_g()
        self.ic._find_skeleton()
        self.ic._orient_colliders()
        for i, j in self.ic._g.edges():
            measured_colliders = self.ic._g[i][j]['arrows']
            if len(measured_colliders) > 0:
                # An edge may be reported as (i, j) or (j, i); check the
                # orientation that matches where the arrow head sits.
                if j in measured_colliders:
                    assert (i, j) in self.true_colliders
                else:
                    assert (j, i) in self.true_colliders
            else:
                assert ((i, j) not in self.true_colliders
                        and (j, i) not in self.true_colliders)

    def test_separating_set(self):
        """Each removed edge's recorded separating set passes the test."""
        self.ic._build_g()
        self.ic._find_skeleton()
        for xi, xj in itertools.combinations(self.variable_types.keys(), 2):
            if not self.ic._g.has_edge(xi, xj):
                # The separating set is looked up under either key order.
                if (xi, xj) in self.ic.separating_sets:
                    z = self.ic.separating_sets[(xi, xj)]
                else:
                    z = self.ic.separating_sets[(xj, xi)]
                test = self.ic.independence_test([xj], [xi], list(z), self.X,
                                                 self.ic.alpha)
                assert test.independent()

    def test_marked_directed_path(self):
        """A marked chain a-b-c-d is a path from a to d but not from d to a."""
        marked_edges = [('a', 'b'), ('b', 'c'), ('c', 'd')]
        unmarked_edges = [('a', 'd')]
        g = nx.Graph()
        g.add_edges_from(marked_edges, marked=True)
        g.add_edges_from(unmarked_edges, marked=False)
        for i, j in (marked_edges + unmarked_edges):
            # Point every edge at its second endpoint.
            g[i][j]['arrows'] = [j]
        self.ic._g = g
        assert self.ic._marked_directed_path('a', 'd')
        assert not self.ic._marked_directed_path('d', 'a')

    def test_recursion_rule_1(self):
        """Placeholder: no assertions yet."""
        pass

    def test_recursion_rule_2(self):
        """Placeholder: no assertions yet."""
        pass

    def test_search(self):
        """After a full search exactly the expected edges are marked."""
        self.ic.search()
        for i, j in self.ic._g.edges():
            if self.ic._g[i][j]['marked']:
                assert (i, j) in self.true_marked or (j, i) in self.true_marked
            else:
                assert ((i, j) not in self.true_marked
                        and (j, i) not in self.true_marked)
                 
Esempio n. 3
0
class Test_IC(TestAPI):
    """Tests for the individual stages of the ``IC`` search.

    The fixture data follows x1 -> x2, x1 -> x3, (x2, x3) -> x4, x4 -> x5,
    every variable carrying additive standard-normal noise.

    Edge attributes are read through ``G[u][v]`` and node attributes through
    ``G.nodes(data=True)``: both spellings exist in networkx 1.x and 2.x,
    whereas ``Graph.edge`` / ``Graph.node`` were removed in later releases.
    """

    def setUp(self):
        """Draw the synthetic sample and record the expected structure."""
        x1 = numpy.random.normal(size=TEST_SET_SIZE)
        x2 = x1 + numpy.random.normal(size=TEST_SET_SIZE)
        x3 = x1 + numpy.random.normal(size=TEST_SET_SIZE)
        x4 = x2 + x3 + numpy.random.normal(size=TEST_SET_SIZE)
        x5 = x4 + numpy.random.normal(size=TEST_SET_SIZE)
        self.X = pd.DataFrame({'x1': x1, 'x2': x2, 'x3': x3, 'x4': x4, 'x5': x5})
        self.variable_types = {'x1': 'c', 'x2': 'c', 'x3': 'c', 'x4': 'c', 'x5': 'c'}
        # Expected adjacency of the skeleton, keyed by node.
        self.true_neighbors = {'x1': {'x2', 'x3'},
                               'x2': {'x1', 'x4'},
                               'x3': {'x1', 'x4'},
                               'x4': {'x2', 'x3', 'x5'},
                               'x5': {'x4'}}
        # (tail, head) pairs expected to receive an arrow head at x4.
        self.true_colliders = {('x3', 'x4'), ('x2', 'x4')}
        # Edges expected to end up with ``marked`` set.
        self.true_marked = {('x4', 'x5')}
        self.ic = IC(RobustRegressionTest, self.X, self.variable_types)

    def test_build_g(self):
        """The initial graph has V*(V-1)/2 edges, typed nodes and no marks."""
        self.ic._build_g()
        V = len(self.X.columns)
        # V * (V - 1) is always even, so floor division is exact.
        assert len(self.ic._g.edges()) == V * (V - 1) // 2
        assert set(self.ic._g.nodes()) == set(self.variable_types.keys())
        node_data = dict(self.ic._g.nodes(data=True))
        for node, variable_type in self.variable_types.items():
            assert node_data[node]['type'] == variable_type
        for i, j in self.ic._g.edges():
            assert self.ic._g[i][j]['marked'] == False

    def test_find_skeleton(self):
        """After skeleton search every node has exactly its true neighbors."""
        self.ic._build_g()
        self.ic._find_skeleton()
        for node, neighbors in self.true_neighbors.items():
            assert set(self.ic._g.neighbors(node)) == neighbors

    def test_orient_colliders(self):
        """Arrow heads appear on, and only on, the expected collider edges."""
        self.ic._build_g()
        self.ic._find_skeleton()
        self.ic._orient_colliders()
        for i, j in self.ic._g.edges():
            measured_colliders = self.ic._g[i][j]['arrows']
            if len(measured_colliders) > 0:
                # An edge may be reported as (i, j) or (j, i); check the
                # orientation that matches where the arrow head sits.
                if j in measured_colliders:
                    assert (i, j) in self.true_colliders
                else:
                    assert (j, i) in self.true_colliders
            else:
                assert ((i, j) not in self.true_colliders
                        and (j, i) not in self.true_colliders)

    def test_separating_set(self):
        """Each removed edge's recorded separating set passes the test."""
        self.ic._build_g()
        self.ic._find_skeleton()
        for xi, xj in itertools.combinations(self.variable_types.keys(), 2):
            if not self.ic._g.has_edge(xi, xj):
                # The separating set is looked up under either key order.
                if (xi, xj) in self.ic.separating_sets:
                    z = self.ic.separating_sets[(xi, xj)]
                else:
                    z = self.ic.separating_sets[(xj, xi)]
                test = self.ic.independence_test([xj], [xi], list(z), self.X,
                                                 self.ic.alpha)
                assert test.independent()

    def test_marked_directed_path(self):
        """A marked chain a-b-c-d is a path from a to d but not from d to a."""
        marked_edges = [('a', 'b'), ('b', 'c'), ('c', 'd')]
        unmarked_edges = [('a', 'd')]
        g = nx.Graph()
        g.add_edges_from(marked_edges, marked=True)
        g.add_edges_from(unmarked_edges, marked=False)
        for i, j in (marked_edges + unmarked_edges):
            # Point every edge at its second endpoint.
            g[i][j]['arrows'] = [j]
        self.ic._g = g
        assert self.ic._marked_directed_path('a', 'd')
        assert not self.ic._marked_directed_path('d', 'a')

    def test_recursion_rule_1(self):
        """Placeholder: no assertions yet."""
        pass

    def test_recursion_rule_2(self):
        """Placeholder: no assertions yet."""
        pass

    def test_search(self):
        """After a full search exactly the expected edges are marked."""
        self.ic.search()
        for i, j in self.ic._g.edges():
            if self.ic._g[i][j]['marked']:
                assert (i, j) in self.true_marked or (j, i) in self.true_marked
            else:
                assert ((i, j) not in self.true_marked
                        and (j, i) not in self.true_marked)
Esempio n. 4
0
# Toy data: x1 drives x2 and x3, x2 and x3 drive x4, x4 drives x5; every
# variable gets its own normally distributed noise term.
# NOTE(review): SIZE is first assigned further down in this fragment;
# presumably it is also defined above the visible code -- confirm.
x1 = np.random.normal(size=SIZE)
x2 = x1 + np.random.normal(size=SIZE)
x3 = x1 + np.random.normal(size=SIZE)
x4 = x2 + x3 + np.random.normal(size=SIZE)
x5 = x4 + np.random.normal(size=SIZE)

# load the data into a dataframe:
X = pd.DataFrame({'x1': x1, 'x2': x2, 'x3': x3, 'x4': x4, 'x5': x5})

# define the variable types: 'c' is 'continuous'.  The variables defined here
# are the ones the search is performed over  -- NOT all the variables defined
# in the data frame.
variable_types = {'x1': 'c', 'x2': 'c', 'x3': 'c', 'x4': 'c', 'x5': 'c'}

# Run the IC search with RobustRegressionTest as the independence test.
ic_algorithm = IC(RobustRegressionTest)
graph = ic_algorithm.search(X, variable_types)

# Print the resulting edges together with their attribute data.
e = graph.edges(data=True)
print(f"{e}")

# Second data set: x6 feeds both x4 and x5, and x5 no longer depends on x4.
SIZE = 2000
x1 = np.random.normal(size=SIZE)
x2 = x1 + np.random.normal(size=SIZE)
x3 = x1 + np.random.normal(size=SIZE)
x6 = np.random.normal(size=SIZE)
x4 = x2 + x3 + x6 + np.random.normal(size=SIZE)
x5 = x6 + np.random.normal(size=SIZE)

# load the data into a dataframe:
# x6 is left out of the frame, so it is not among the observed columns.
X = pd.DataFrame({'x1': x1, 'x2': x2, 'x3': x3, 'x4': x4, 'x5': x5})
Esempio n. 5
0
    def fit(self, X):
        '''
        Fit a Gaussian copula with marginals evaluated by Gaussian KDE.

        Each column is pushed through its KDE-based cumulative marginal and
        the inverse normal CDF; the transformed data is then handed to the
        estimator selected by ``self.method`` ('mle', 'glasso',
        'ledoit_wolf', 'spectral', 'pc' or 'ic').  Results are stored on the
        instance (``kernels``, ``theta`` and, depending on the method,
        ``est_cov`` / ``corr`` / ``precision_`` or
        ``conditional_independences_``).  Finally
        ``self.fit_structure(self.precision_)`` is called.

        :param X: input dataset, indexed as a 2-D array of shape (N, d)
        :return: None
        '''

        N, d = X.shape
        if self.scaler is not None:
            X_scale = self.scaler.fit_transform(X)
        else:
            X_scale = X
        if len(self.vertexes) == 0:
            # Default vertex names are the column indices as strings.
            self.vertexes = [str(col) for col in range(d)]

        # Cumulative marginals are clipped to [theta, 1 - theta] so that the
        # inverse normal CDF below stays finite.
        self.theta = 1.0 / N
        # Explicit float buffers: inheriting an integer dtype from X would
        # truncate every cumulative probability.
        cum_marginals = np.zeros((N, d), dtype=float)
        inv_norm_cdf = np.zeros((N, d), dtype=float)
        self.kernels = []
        # TODO: complexity O(Nd) is high
        if self.verbose:
            print(colored('>> Computing marginals', color='blue'))
        for j in range(d):
            column = X_scale[:, j]
            col_min = column.min()
            col_max = column.max()
            kernel = gaussian_kde(column)
            self.kernels.append(kernel)
            # KDE mass over the observed range, used to normalise below.
            cum_pdf_overall = kernel.integrate_box_1d(col_min, col_max)
            for i in range(N):
                cum_marginals[i, j] = kernel.integrate_box_1d(
                    col_min, column[i]) / cum_pdf_overall
                # truncate cumulative marginals
                if cum_marginals[i, j] < self.theta:
                    cum_marginals[i, j] = self.theta
                elif cum_marginals[i, j] > 1 - self.theta:
                    cum_marginals[i, j] = 1 - self.theta
                # inverse of normal CDF: \Phi^{-1}(F_j(x))
                inv_norm_cdf[i, j] = norm.ppf(cum_marginals[i, j])

        if self.method == 'mle':
            # maximum-likelihood estimator
            empirical_cov = EmpiricalCovariance()
            empirical_cov.fit(inv_norm_cdf)
            if self.verbose:
                print(colored('>> Running MLE to estiamte precision matrix',
                              color='blue'))

            self.est_cov = empirical_cov.covariance_
            self.corr = scale_matrix(self.est_cov)
            self.precision_ = inv(empirical_cov.covariance_)

        if self.method == 'glasso':
            if self.verbose:
                print(colored('>> Running glasso to estiamte precision matrix',
                              color='blue'))

            empirical_cov = EmpiricalCovariance()
            empirical_cov.fit(inv_norm_cdf)
            # shrunk covariance to avoid numerical instability
            shrunk_cov = shrunk_covariance(empirical_cov.covariance_,
                                           shrinkage=0.8)
            self.est_cov, self.precision_ = graph_lasso(emp_cov=shrunk_cov,
                                                        alpha=self.penalty,
                                                        verbose=self.verbose,
                                                        max_iter=self.max_iter)
            self.corr = scale_matrix(self.est_cov)

        if self.method == 'ledoit_wolf':
            if self.verbose:
                print(colored(
                    '>> Running ledoit_wolf to estiamte precision matrix',
                    color='blue'))

            self.est_cov, _ = ledoit_wolf(inv_norm_cdf)
            self.corr = scale_matrix(self.est_cov)
            self.precision_ = linalg.inv(self.est_cov)

        if self.method == 'spectral':
            # L2 method, from the paper "Inverse covariance estimation for
            # high dimension data in linear time and space"; formula: eq. (8).
            if self.verbose:
                print(colored(
                    '>> Running Riccati to estiamte precision matrix',
                    color='blue'))

            # TODO: note estimated cov is sample cov
            self.est_cov, self.precision_ = spectral(inv_norm_cdf,
                                                     rho=2 * self.penalty,
                                                     assume_centered=False)
            self.corr = scale_matrix(self.est_cov)

        if self.method == 'pc':
            clf = pgmlearner.PGMLearner()
            # One {vertex name: value} record per row of the scaled data.
            data_list = []
            for row_id in range(X_scale.shape[0]):
                instance = {}
                for col, name in enumerate(self.vertexes):
                    instance[name] = X_scale[row_id, col]
                data_list.append(instance)
            graph = clf.lg_constraint_estimatestruct(data=data_list,
                                                     pvalparam=self.pval,
                                                     bins=self.bins)
            dag = np.zeros(shape=(len(graph.V), len(graph.V)))
            for e in graph.E:
                dag[self.vertexes.index(e[0]), self.vertexes.index(e[1])] = 1
            self.conditional_independences_ = dag

        if self.method == 'ic':
            df = {}
            variable_types = {}
            for j in range(X_scale.shape[1]):
                df[self.vertexes[j]] = X_scale[:, j]
                variable_types[self.vertexes[j]] = 'c'
            data = pd.DataFrame(df)
            # run the search
            ic_algorithm = IC(RobustRegressionTest,
                              data,
                              variable_types,
                              alpha=self.pval)
            graph = ic_algorithm.search()
            dag = np.zeros(shape=(X_scale.shape[1], X_scale.shape[1]))
            for u, v, attrs in graph.edges(data=True):
                i = self.vertexes.index(u)
                j = self.vertexes.index(v)
                # Start symmetric; an edge with exactly one arrow head then
                # loses the entry that points away from that head.
                dag[i, j] = 1
                dag[j, i] = 1
                arrows = set(attrs['arrows'])
                if len(arrows) == 1:
                    head = next(iter(arrows))
                    if head == u:
                        dag[i, j] = 0
                    if head == v:
                        dag[j, i] = 0
            self.conditional_independences_ = dag

        # finally we fit the structure
        # NOTE(review): the 'pc' and 'ic' branches do not assign
        # self.precision_; presumably it is initialised elsewhere -- confirm.
        self.fit_structure(self.precision_)
Esempio n. 6
0
class Test_IC(TestAPI):
    """Stage-by-stage checks of the ``IC`` search on a synthetic sample.

    The sample follows x1 -> x2, x1 -> x3, (x2, x3) -> x4, x4 -> x5 with
    additive standard-normal noise on every variable.
    """

    def setUp(self):
        """Draw the sample, record the expected structure, run one search."""

        def noise():
            return numpy.random.normal(size=TEST_SET_SIZE)

        x1 = noise()
        x2 = x1 + noise()
        x3 = x1 + noise()
        x4 = x2 + x3 + noise()
        x5 = x4 + noise()
        self.X = pd.DataFrame({"x1": x1, "x2": x2, "x3": x3, "x4": x4, "x5": x5})
        self.variable_types = dict.fromkeys(("x1", "x2", "x3", "x4", "x5"), "c")
        # Expected skeleton adjacency, keyed by node.
        self.true_neighbors = {
            "x1": {"x2", "x3"},
            "x2": {"x1", "x4"},
            "x3": {"x1", "x4"},
            "x4": {"x2", "x3", "x5"},
            "x5": {"x4"},
        }
        self.true_colliders = {("x3", "x4"), ("x2", "x4")}
        self.true_marked = {("x4", "x5")}
        self.ic = IC(RobustRegressionTest)
        self.ic.search(self.X, self.variable_types)

    def test_build_g(self):
        """The freshly built graph has V*(V-1)/2 unmarked edges and typed nodes."""
        self.ic._build_g(self.variable_types)
        graph = self.ic._g
        n_vars = len(self.X.columns)
        assert len(graph.edges()) == (n_vars - 1) * n_vars / 2
        assert set(graph.nodes()) == set(self.variable_types.keys())
        for name, expected_type in self.variable_types.items():
            assert graph.nodes[name]["type"] == expected_type
        for u, v in graph.edges():
            assert graph.get_edge_data(u, v)["marked"] == False

    def test_find_skeleton(self):
        """Skeleton search leaves each node with exactly its true neighbors."""
        self.ic._build_g(self.variable_types)
        self.ic._find_skeleton(self.X, self.variable_types)
        for name, expected in self.true_neighbors.items():
            assert set(self.ic._g.neighbors(name)) == expected

    def test_orient_colliders(self):
        """Arrow heads show up on, and only on, the expected collider edges."""
        self.ic._build_g(self.variable_types)
        self.ic._find_skeleton(self.X, self.variable_types)
        self.ic._orient_colliders()
        expected = self.true_colliders
        for u, v in self.ic._g.edges():
            arrows = self.ic._g.get_edge_data(u, v)["arrows"]
            if len(arrows) == 0:
                # Unoriented edge: neither direction may be a true collider.
                assert not ((u, v) in expected or (v, u) in expected)
                continue
            # Oriented edge: check the direction that ends at the arrow head.
            oriented = (u, v) if v in arrows else (v, u)
            assert oriented in expected

    def test_separating_set(self):
        """Every removed edge's stored separating set passes the test."""
        self.ic._build_g(self.variable_types)
        self.ic._find_skeleton(self.X, self.variable_types)
        for xi, xj in itertools.combinations(self.variable_types.keys(), 2):
            if self.ic._g.has_edge(xi, xj):
                continue
            # The separating set is stored under one of the two key orders.
            key = (xi, xj) if (xi, xj) in self.ic.separating_sets else (xj, xi)
            z = self.ic.separating_sets[key]
            outcome = self.ic.independence_test(
                [xj], [xi], list(z), self.X, self.ic.alpha)
            assert outcome.independent()

    def test_marked_directed_path(self):
        """A marked chain a-b-c-d is a path from a to d but not from d to a."""
        chain = [("a", "b"), ("b", "c"), ("c", "d")]
        shortcut = [("a", "d")]
        g = nx.Graph()
        g.add_edges_from(chain, marked=True)
        g.add_edges_from(shortcut, marked=False)
        for tail, head in itertools.chain(chain, shortcut):
            # Point every edge at its second endpoint.
            g.get_edge_data(tail, head)["arrows"] = [head]
        self.ic._g = g
        assert self.ic._marked_directed_path("a", "d")
        assert not self.ic._marked_directed_path("d", "a")

    def test_recursion_rule_1(self):
        """Placeholder: no assertions yet."""
        pass

    def test_recursion_rule_2(self):
        """Placeholder: no assertions yet."""
        pass

    def test_search(self):
        """A full search marks exactly the expected edges."""
        self.ic.search(self.X, self.variable_types)
        for u, v in self.ic._g.edges():
            if self.ic._g.get_edge_data(u, v)["marked"]:
                assert (u, v) in self.true_marked or (v, u) in self.true_marked
            else:
                assert not ((u, v) in self.true_marked
                            or (v, u) in self.true_marked)
Esempio n. 7
0
import numpy
import pandas as pd

from causality.inference.search import IC
from causality.inference.independence_tests import RobustRegressionTest

# Toy data: x1 drives x2 and x3, x2 and x3 drive x4, x4 drives x5; each
# variable carries its own standard-normal noise term.
SIZE = 2000


def _noise():
    """Return SIZE fresh draws from the standard normal distribution."""
    return numpy.random.normal(size=SIZE)


x1 = _noise()
x2 = x1 + _noise()
x3 = x1 + _noise()
x4 = x2 + x3 + _noise()
x5 = x4 + _noise()

# Collect the columns once; the frame and the type map share the same keys.
_columns = {'x1': x1, 'x2': x2, 'x3': x3, 'x4': x4, 'x5': x5}
X = pd.DataFrame(_columns)

# 'c' stands for 'continuous'.  Only the variables listed here take part in
# the search -- NOT necessarily every column of the data frame.
variable_types = dict.fromkeys(_columns, 'c')

# Run the IC search with RobustRegressionTest as the independence test.
ic_algorithm = IC(RobustRegressionTest, X, variable_types)
graph = ic_algorithm.search()