Example No. 1
 def visualize(self, nodes, edges, fname):
     """This function is used to visualize the causal model using graphviz"""
     from causalgraphicalmodels import CausalGraphicalModel
     import graphviz
     try:
         graph = CausalGraphicalModel(nodes=nodes, edges=edges)
         graph.draw().render(filename=fname)
     except AssertionError:
         print("[ERROR]: cycles in NOTEARS dag")
         print("Edges: {0}".format(edges))
Example No. 2
import numpy as np
import pandas as pd
import pymc3 as pm
import arviz as ar
from sklearn.preprocessing import scale
import matplotlib.pyplot as pl
from causalgraphicalmodels import CausalGraphicalModel
from cmocean.cm import balance_r
from seaborn import heatmap
"""
Example: infer the direct influence of both parents (P) and grandparents (G) on the
educational achievement of children (C).
"""
dag_ed1 = CausalGraphicalModel(nodes=['P', 'G', 'C'],
                               edges=[('G', 'P'), ('G', 'C'), ('P', 'C')])
dag_ed1.draw()
"""
But suppose there are unmeasured, common influences on parents and their
children (e.g. neighborhoods), which are not shared by grandparents, who live elsewhere.
"""

dag_ed2 = CausalGraphicalModel(nodes=['G', 'P', 'C', 'U'],
                               edges=[('G', 'P'), ('U', 'P'), ('G', 'C'),
                                      ('P', 'C'), ('U', 'C')])
dag_ed2.draw()
"""
The DAG above implies that:
(1) P is some function of G and U
(2) C is some function of G, P, and U
(3) G and U are not functions of any other known variables.
"""
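"""
A minimal simulation consistent with the three implications above (the
coefficients and sample size are illustrative, not from the source):
"""
b_GP, b_GC, b_PC, b_U = 1.0, 0.0, 1.0, 2.0
N_sim = 200
U = 2 * np.random.binomial(1, 0.5, N_sim) - 1        # unobserved +/-1 neighborhood effect
G = np.random.normal(size=N_sim)                     # (3) G and U are exogenous
P = np.random.normal(b_GP * G + b_U * U)             # (1) P is a function of G and U
C = np.random.normal(b_PC * P + b_GC * G + b_U * U)  # (2) C is a function of G, P, and U
d_ed = pd.DataFrame(dict(C=scale(C), P=scale(P), G=scale(G), U=U))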
        ("Cross_Bay_Bridge", "Mode_Choice"),
        ("HH_Size", "Mode_Choice"),
        ("num_of_kids_household", "Mode_Choice"),
        ("Autos_per_licensed_drivers", "Mode_Choice"),
        ("Gender", "Mode_Choice"),
        ("Travel_Distance", "Travel_Time"),
        ("Travel_Distance", "Travel_Cost"),
        ("Travel_Distance", "Cross_Bay_Bridge"),
        ("HH_Size", "Travel_Distance"),
        #         ("Travel_Time", "Mode_Choice"),
        #         ("Travel_Time", "Mode_Choice"),
    ],
)

# draw() returns a graphviz `dot` object, which jupyter can render
causal_graph.draw()
# -

# # MNL specification

# +
## Below is the specification of the true model used in the simulation of the choices

mnl_specification = OrderedDict()
mnl_names = OrderedDict()

mnl_specification["intercept"] = [2, 3, 4, 5, 6, 7, 8]
mnl_names["intercept"] = [
    "ASC Shared Ride: 2",
    "ASC Shared Ride: 3+",
    "ASC Walk-Transit-Walk",
Example No. 4
    elif op == "grmo":
        bo = CausalGraphicalModel(
            nodes=["dem", "prevDem", "partMarg", "prodDownTm", "partOrd",
                   "boPartOrd", "prCap", "boPrCap", "bo", "profit"],
            edges=[("dem", "boPartOrd"), ("prevDem", "partOrd"),
                   ("partMarg", "partOrd"), ("partOrd", "boPartOrd"),
                   ("prodDownTm", "prCap"), ("prCap", "boPrCap"),
                   ("dem", "boPrCap"), ("boPartOrd", "bo"),
                   ("boPrCap", "bo")])
        bo.draw()
        plt.show()
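        # Illustrative check (not in the original): demand influences backorders
        # only through the part-order and production-capacity channels, so
        # conditioning on both d-separates 'dem' from 'bo'.
        print(bo.is_d_separated('dem', 'bo', {'boPartOrd', 'boPrCap'}))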

    elif op == "train":
        prFile = sys.argv[2]
        regressor = FeedForwardNetwork(prFile)
        regressor.buildModel()
        FeedForwardNetwork.batchTrain(regressor)

    elif op == "pred":
        prFile = sys.argv[2]
        regressor = FeedForwardNetwork(prFile)
        regressor.buildModel()
        FeedForwardNetwork.predict(regressor)

    elif op == "infer":
Example No. 5
"""
Treatment appears to have a negligible effect even though the βF posterior indicates fungus
impacts growth.
The problem is that fungus is a consequence of treatment; i.e. fungus is a post-treatment variable.
The model asked the question "Once we know fungus is present, does treatment matter?" ⇒ No.
The next model ignores the fungus variable.
"""

with pm.Model() as m8:
    σ = pm.Exponential('σ', 1)
    α = pm.Lognormal('α', 0, 0.2)
    βT = pm.Normal('βT', 0, 0.5)
    p = α + βT * d.treatment.values
    μ = d.h0.values * p
    h1 = pm.Normal('h1', mu=μ, sd=σ, observed=d.h1.values)
    trc8 = pm.sample(tune=1000)

pm.summary(trc8)
"""
Now the treatment effect is plain to see. Note that:
1. It makes sense to control for pre-treatment differences, such as initial height h0, here.
2. Including post-treatment variables can mask the treatment effect itself.
3. Model m7 is still useful for identifying the causal mechanism!
"""
plant_dag = CausalGraphicalModel(nodes=['H0', 'H1', 'T', 'F'],
                                 edges=[('H0', 'H1'), ('T', 'F'), ('F', 'H1')])
plant_dag.draw()
plant_dag.is_d_separated('T', 'H1')         # False: the path T -> F -> H1 is open
plant_dag.is_d_separated('T', 'H1', {'F'})  # True: conditioning on F blocks that path
plant_dag.get_all_independence_relationships()
Example No. 6
             varnames=['bM', 'bN'], rhat=False, alpha=0.11);
pd.plotting.scatter_matrix(dcc[['M', 'N', 'K']]);

"""
The regression model (m5_7) asks whether high N is associated with high K, and likewise whether
high M implies high K. Bigger species like apes (big M) have milk with less energy. But species with
more neocortex (big N) have richer milk (big K). The fact that M and N are correlated makes these
relationships difficult to see unless both factors are accounted for.
----o----

Simulating a Masking Relationship. Two predictors (M, N) are correlated with one another; one (N)
has a positive direct effect on the outcome (K) while the other (M) has a negative direct effect,
so neither association stands out until both predictors are in the model.
"""
div_msk = CausalGraphicalModel(nodes=['M', 'N', 'K'],
                               edges=[('M', 'K'), ('N', 'K'), ('M', 'N')])
div_msk.draw()

n = 100
M = np.random.normal(size=n)
N = np.random.normal(loc=M, size=n)
K = np.random.normal(loc=N-M, size=n)

d_sim = pd.DataFrame(dict(K=K, M=M, N=N))
pd.plotting.scatter_matrix(d_sim, alpha=0.5, diagonal='kde');


with pm.Model() as m5_sim:
    a = pm.Normal('a', 0, 0.2)
    bN = pm.Normal('bN', 0, 0.5)
    σ = pm.Exponential('sigma', 1)
    mu = a + bN * d_sim.N
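    # Assumed continuation (a standard Rethinking-style likelihood): regressing
    # K on N alone; masking by M attenuates bN relative to its direct effect.
    K_obs = pm.Normal('K_obs', mu=mu, sd=σ, observed=d_sim.K)
    trace_sim = pm.sample(tune=1000)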
Example No. 7
    sigma = pm.Exponential('sigma', lam=1)
    D = pm.Normal('D', mu, sigma, observed=d.D)

div_dag = CausalGraphicalModel(nodes=['A', 'M', 'D'],
                               edges=[('A', 'M'), ('A', 'D'), ('M', 'D')])
div_dag2 = CausalGraphicalModel(nodes=['A', 'M', 'D'],
                                edges=[('A', 'M'), ('A', 'D')])

with m5_1:
    trace_5_1 = pm.sample(1000, tune=1000)
with m5_2:
    trace_5_2 = pm.sample(1000, tune=1000)
pm.forestplot(trace_5_1, varnames=['a', 'bA', 'sigma'])
pm.forestplot(trace_5_2, varnames=['a', 'bM', 'sigma'])
# The above is consistent with two causal DAGs:
div_dag.draw()
# and
div_dag2.draw()
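# The two DAGs differ in one testable implication (an illustrative check, not
# in the original): without the M -> D edge, D is independent of M given A.
div_dag.is_d_separated('M', 'D', {'A'})   # False: the direct path M -> D stays open
div_dag2.is_d_separated('M', 'D', {'A'})  # True: M <- A -> D is blocked by A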
"""
We need a model that CONTROLS FOR A while assessing the association between M and D.
Fit a multiple regression to predict divorce using both marriage rate (M) and age at
marriage (A) to answer the questions:

1. Knowing marriage rate (M), what additional value is there in knowing age at marriage (A)?
2. Knowing age at marriage (A), what additional value is there in knowing marriage rate (M)?
"""

with pm.Model() as m5_3:
    sigma = pm.Exponential('sigma', lam=1)
    a = pm.Normal('a', 0, 0.2)
    bM = pm.Normal('bM', 0, 0.5)
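    # Assumed continuation (the standard multiple-regression form for m5_3,
    # with standardized columns M and A assumed present in d):
    bA = pm.Normal('bA', 0, 0.5)
    mu = a + bM * d.M + bA * d.A
    D = pm.Normal('D', mu=mu, sd=sigma, observed=d.D)
    trace_5_3 = pm.sample(1000, tune=1000)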
Example No. 8
        ("e", "y"),
        ("f", "y"),
        ("confounder", "y"),
        ("confounder", "a"),
        ("confounder", "b"),
        #         ("confounder", "c"),
        ("confounder", "d"),
        ("confounder", "e"),
        ("confounder", "f"),

        #         ("confounder", "y"),
        #         ("confounder", "y"),
    ])

# draw() returns a graphviz `dot` object, which jupyter can render
data_generation_graph.draw()
# -

# ## Specify the parametric relationships between the covariates.
#
# This is where we specify the relationships between the variables in the causal graph above. Specifically, we have an equation relating each pair of connected nodes on the graph (one equation per edge).
#
# In the outcome model for y, the coefficient on each of the variables a, b, c, d, e and f represents its *true* causal effect. The coefficient on the confounder, however, does not represent its causal effect, because it has five descendants that are part of the outcome model.
#
# We are not ultimately interested in the causal effect of the confounder for this exercise; we are instead interested in the coefficients on each of its descendants.

# +
sample_size = 10000

confounder = np.random.normal(loc=20, scale=10, size=sample_size)
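# A hypothetical continuation (coefficients are illustrative, not from the
# source): each covariate inherits the confounder, and the coefficients on
# a and b in the y equation are their true causal effects.
a = confounder + np.random.normal(scale=5, size=sample_size)
b = -0.5 * confounder + np.random.normal(scale=5, size=sample_size)
y = 2.0 * a + 1.5 * b + 0.8 * confounder + np.random.normal(size=sample_size)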
Example No. 9
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 29 09:55:00 2020

@author: Mhuth
"""

from causalgraphicalmodels import CausalGraphicalModel

nod = [
    'region', 'oral', 'sales bans', 'time', 'ideas about children', 'controls'
]

ed = [('region', 'oral'), ('region', 'sales bans'),
      ('region', 'ideas about children'), ('sales bans', 'oral'),
      ('time', 'oral'), ('time', 'ideas about children'), ('time', 'controls'),
      ('ideas about children', 'oral')]
cg1 = CausalGraphicalModel(nodes=nod, edges=ed)

cg1.draw()
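# Illustrative query (not in the original script): which adjustment sets close
# the backdoor paths from 'sales bans' to 'oral'?
print(cg1.get_all_backdoor_adjustment_sets('sales bans', 'oral'))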
Example No. 10
        'utility_driving'
    ],
    edges=[
        ("total_travel_time", "utility_driving"),
        ("total_travel_cost", "utility_driving"),
        ("total_travel_distance", "utility_driving"),
        ("household_size", "utility_driving"),
        #         ("household_income", "utility_driving"),
        ("num_cars", "utility_driving"),
        ("cross_bay", "utility_driving"),
        ("total_travel_distance", "total_travel_time"),
        ("total_travel_distance", "total_travel_cost"),
    ])

# draw() returns a graphviz `dot` object, which jupyter can render
drive_alone_graph.draw()

# -

# ## Distributional regression
# Assume a univariate linear approximation for the relationship between travel distance and travel time/cost. It turns out this is not a horrible assumption.


def fit_regression(X, y, data, plotting=True):
    data_x = sm.add_constant(data[X])
    data_y = data[y]

    model = sm.OLS(data_y, data_x)

    results = model.fit()

    return results
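# Hypothetical usage, assuming a long-format dataframe `df` with the column
# names from the graph above:
# time_fit = fit_regression('total_travel_distance', 'total_travel_time', df)
# cost_fit = fit_regression('total_travel_distance', 'total_travel_cost', df)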
        ("Travel_Distance", "Travel_Cost"), 
        ("Travel_Distance", "Cross_Bay_Bridge"), 
        ("HH_Size", "Travel_Distance"), 

        
        
#         ("Travel_Time", "Mode_Choice"), 
#         ("Travel_Time", "Mode_Choice"), 

        
        
    ]
)

# draw() returns a graphviz `dot` object, which jupyter can render
sprinkler.draw()
# -

# # MNL specification

# +
# Create my specification and variable names for the basic MNL model
# NOTE: - Keys should be variables within the long-format dataframe.
#         The sole exception to this is the "intercept" key.
#       - For the specification dictionary, the values should be lists
#         or lists of lists. Within a list, or within the inner-most
#         list, should be the alternative IDs of the alternatives whose
#         utility specifications the explanatory variable enters.

mnl_specification = OrderedDict()
mnl_names = OrderedDict()
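# Hypothetical example entries (the alternative IDs and display names below
# are illustrative, not from the original notebook):
mnl_specification["intercept"] = [2, 3]
mnl_names["intercept"] = ["ASC Shared Ride: 2", "ASC Shared Ride: 3+"]
mnl_specification["total_travel_time"] = [[1, 2, 3]]
mnl_names["total_travel_time"] = ["Travel Time, units:min (All Auto Modes)"]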