def visualize(self, nodes, edges, fname):
    """Render the causal model as a graphviz diagram and write it to disk.

    Parameters
    ----------
    nodes : iterable
        Node names for the DAG.
    edges : iterable of tuple
        Directed edges as (parent, child) pairs.
    fname : str
        Base filename handed to graphviz ``render`` (graphviz appends the
        output-format suffix itself).
    """
    # Imported lazily so the dependency is only needed when visualizing.
    # (The previously unused `import graphviz` was removed: draw() already
    # returns a graphviz object, so the top-level package is never touched.)
    from causalgraphicalmodels import CausalGraphicalModel

    try:
        graph = CausalGraphicalModel(nodes=nodes, edges=edges)
        # draw() yields a graphviz Digraph; render() writes the source and
        # the rendered file next to it.
        graph.draw().render(filename=fname)
    except AssertionError:
        # CausalGraphicalModel asserts acyclicity on construction, so a
        # cyclic edge set from NOTEARS lands here instead of crashing.
        print("[ERROR]: cycles in NOTEARS dag")
        print("Edges: {0}".format(edges))
# NOTE(review): notebook chunk collapsed onto one physical line by extraction.
# It imports the stack, builds/draws two parent/grandparent education DAGs
# (dag_ed1 without, dag_ed2 with an unobserved confounder U), with prose in
# bare triple-quoted strings between statements. The final string ("The DAG
# above implies that: ...") is NOT closed within this view — it presumably
# continues in the omitted remainder, so the code is left byte-identical
# rather than reformatted around a guessed tail.
import numpy as np import pandas as pd import pymc3 as pm import arviz as ar from sklearn.preprocessing import scale import matplotlib.pyplot as pl from causalgraphicalmodels import CausalGraphicalModel from cmocean.cm import balance_r from seaborn import heatmap """ Example: infer direct influence of both parents (P) and grand parents (G) on the educational achievement of children (C). """ dag_ed1 = CausalGraphicalModel(nodes=['P', 'G', 'C'], edges=[('G', 'P'), ('G', 'C'), ('P', 'C')]) dag_ed1.draw() """ But we suppose ther are unmeasured, common influences on parents and their children (e.g. neighborhoods, not shared by grandparent who live elsewhere). """ dag_ed2 = CausalGraphicalModel(nodes=['G', 'P', 'C', 'U'], edges=[('G', 'P'), ('U', 'P'), ('G', 'C'), ('P', 'C'), ('U', 'C')]) dag_ed2.draw() """ The DAG above implies that: (1) P is some function of G and U (2) C is some function of G, P, and U (3) G and U are not functions of any other known variables.
# NOTE(review): truncated notebook fragment — it opens mid-way through the
# edge list of a CausalGraphicalModel(...) call whose head precedes this view,
# and it ends mid-way through the mnl_names["intercept"] list literal (note
# the trailing comma and unclosed bracket). Between the two it draws the
# causal graph and starts an OrderedDict-based MNL specification. Left
# byte-identical; reformatting would require guessing both missing ends.
("Cross_Bay_Bridge", "Mode_Choice"), ("HH_Size", "Mode_Choice"), ("num_of_kids_household", "Mode_Choice"), ("Autos_per_licensed_drivers", "Mode_Choice"), ("Gender", "Mode_Choice"), ("Travel_Distance", "Travel_Time"), ("Travel_Distance", "Travel_Cost"), ("Travel_Distance", "Cross_Bay_Bridge"), ("HH_Size", "Travel_Distance"), # ("Travel_Time", "Mode_Choice"), # ("Travel_Time", "Mode_Choice"), ], ) # draw return a graphviz `dot` object, which jupyter can render causal_graph.draw() # - # # MNL specification # + ## Below is the specification of the true model used in the simulation of the choices mnl_specification = OrderedDict() mnl_names = OrderedDict() mnl_specification["intercept"] = [2, 3, 4, 5, 6, 7, 8] mnl_names["intercept"] = [ "ASC Shared Ride: 2", "ASC Shared Ride: 3+", "ASC Walk-Transit-Walk",
# NOTE(review): fragment of a larger command dispatcher — the opening `if`
# of this elif-chain precedes this view, and the trailing `elif op ==
# "infer":` has no body within it. The "grmo" branch builds and draws a
# back-order causal DAG; "train"/"pred" construct a FeedForwardNetwork from
# a properties file and train/predict with it. Code left byte-identical.
elif op == "grmo": bo = CausalGraphicalModel(nodes=[ "dem", "prevDem", "partMarg", "prodDownTm", "partOrd", "boPartOrd", "prCap", "boPrCap", "bo", "profit" ], edges=[("dem", "boPartOrd"), ("prevDem", "partOrd"), ("partMarg", "partOrd"), ("partOrd", "boPartOrd"), ("prodDownTm", "prCap"), ("prCap", "boPrCap"), ("dem", "boPrCap"), ("boPartOrd", "bo"), ("boPrCap", "bo")]) bo.draw() plt.show() elif op == "train": prFile = sys.argv[2] regressor = FeedForwardNetwork(prFile) regressor.buildModel() FeedForwardNetwork.batchTrain(regressor) elif op == "pred": prFile = sys.argv[2] regressor = FeedForwardNetwork(prFile) regressor.buildModel() FeedForwardNetwork.predict(regressor) elif op == "infer":
"""
Treatment appears to have negligible effect even though βF posterior indicates
fungus impacts growth. The problem is that fungus is a consequence of
treatment; i.e. fungus is a post-treatment variable. The model asked the
question "Once we know fungus is present does treatment matter?" ⇒ No.
The next model ignores the fungus variable
"""
with pm.Model() as m8:
    # Priors for the multiplicative growth model: final height is the
    # initial height scaled by p = α + βT * treatment.
    σ = pm.Exponential('σ', 1)
    α = pm.Lognormal('α', 0, 0.2)
    βT = pm.Normal('βT', 0, 0.5)
    p = α + βT * d.treatment.values
    μ = d.h0.values * p
    # Likelihood over the observed final heights.
    h1 = pm.Normal('h1', mu=μ, sd=σ, observed=d.h1.values)
    trc8 = pm.sample(tune=1000)
pm.summary(trc8)
"""
Now the treatment effect is plain to see. Note that:
1. It makes sense to control for pre-treatment differences such as initial
   height, h0, here.
2. Including post-treatment variables can mask the treatment itself.
3. Note that model m7 is still useful to identify the causal mechanism!
"""
# DAG for the experiment: treatment affects growth only through fungus.
plant_dag = CausalGraphicalModel(
    nodes=['H0', 'H1', 'T', 'F'],
    edges=[('H0', 'H1'), ('T', 'F'), ('F', 'H1')])
plant_dag.draw()
# T and H1 are dependent marginally, but independent once F is conditioned on.
plant_dag.is_d_separated('T', 'H1')
plant_dag.is_d_separated('T', 'H1', 'F')
plant_dag.get_all_independence_relationships()
# NOTE(review): truncated fragment — it opens mid-call (the varnames=...
# arguments belong to a plotting call whose head precedes this view) and ends
# inside `with pm.Model() as m5_sim:` right after `mu = ...`. It also contains
# a genuine syntax error: `diagonal=)` in the second scatter_matrix call has
# lost its value during extraction (likely 'kde' or 'hist' — confirm against
# the source notebook). Code left byte-identical because neither missing end
# can be reconstructed from here.
varnames=['bM', 'bN'], rhat=False, alpha=0.11); pd.plotting.scatter_matrix(dcc[['M', 'N', 'K']]); """ The regression model (m5_7) asks if high N is associated with high K. Likewise m5_7 asks whether high M implies high K. Bigger species like apes (big M) have milk with less energy. But spp with more neocortex (big N) have richer milk (big K). The fact that M and N are correlated makes these relationships difficult to see unless both factors are accounted for. ----o---- Simulating a Masking Relationship. Two predictors (M, N) are correlated with one another, and one (M) is positively correlated with the target (K) while the other (N) is negatively correlated with K """ div_msk = CausalGraphicalModel(nodes=['M', 'N', 'K'], edges=[('M', 'K'), ('N', 'K'), ('M', 'N')]) div_msk.draw() n = 100 M = np.random.normal(size=n) N = np.random.normal(loc=M, size=n) K = np.random.normal(loc=N-M, size=n) d_sim = pd.DataFrame(dict(K=K, M=M, N=N)) pd.plotting.scatter_matrix(d_sim, alpha=0.5, diagonal=); with pm.Model() as m5_sim: a = pm.Normal('a', 0, 0.2) bN = pm.Normal('bN', 0, 0.5) σ = pm.Exponential('sigma', 1) mu = a + bN * d_sim.N
# NOTE(review): truncated fragment — the opening statements (sigma, D) sit
# inside a `with pm.Model() ...:` block whose header precedes this view, and
# the final `with pm.Model() as m5_3:` multiple-regression model is cut off
# after its priors. In between it samples m5_1/m5_2, forest-plots them, and
# draws the two candidate divorce DAGs. Code left byte-identical.
sigma = pm.Exponential('sigma', lam=1) D = pm.Normal('D', mu, sigma, observed=d.D) div_dag = CausalGraphicalModel(nodes=['A', 'M', 'D'], edges=[('A', 'M'), ('A', 'D'), ('M', 'D')]) div_dag2 = CausalGraphicalModel(nodes=['A', 'M', 'D'], edges=[('A', 'M'), ('A', 'D')]) with m5_1: trace_5_1 = pm.sample(1000, tune=1000) with m5_2: trace_5_2 = pm.sample(1000, tune=1000) pm.forestplot(trace_5_1, varnames=['a', 'bA', 'sigma']) pm.forestplot(trace_5_2, varnames=['a', 'bM', 'sigma']) # The above is consistent with two causal DAGS: div_dag.draw() # and div_dag2.draw() """ We need a model that CONTROLS FOR A while assessing the association between M and D Fit a multiple regression to predict divorce using both marriage rate (M) and age at marriage (A) to answer the questions: 1. Knowing marriage rate (M), what additional value is there in knowing age at marriage (A) 2. Knowing age at marriage (A), what additional value is there in knowing marriage rate (M) """ with pm.Model() as m5_3: sigma = pm.Exponential('sigma', lam=1) a = pm.Normal('a', 0, 0.2) bM = pm.Normal('bM', 0, 0.5)
# NOTE(review): truncated fragment — it opens mid-way through the edge list
# of a data_generation_graph construction whose head precedes this view, and
# ends just as the simulation begins (confounder draw). Code left
# byte-identical; only comment-text typos were corrected (Specifiy→Specify,
# paramteric→parametric, it's→its, decendents/descendents→descendants,
# exercice→exercise).
("e", "y"), ("f", "y"), ("confounder", "y"), ("confounder", "a"), ("confounder", "b"), # ("confounder", "c"), ("confounder", "d"), ("confounder", "e"), ("confounder", "f"), # ("confounder", "y"), # ("confounder", "y"), ]) # draw return a graphviz `dot` object, which jupyter can render data_generation_graph.draw() # - # ## Specify the parametric relationship between the covariates. # # This is where we specify the relationship between the variables in the Causal graph above. Specifically, we have an equation that relates each two connected nodes on the graph (so an equation for each edge). # # In the outcome model for y, the coefficient on each of the variables a, b, c, d, e and f represents its $true$ causal effect. The coefficient on the confounder, however, does not represent its causal effect, because it has 5 descendants that are part of the outcome model. # # We are not ultimately interested in the causal effect of the confounder for this exercise, and instead are interested in the coefficients on each of its descendants. # + sample_size = 10000 confounder = np.random.normal(loc=20, scale=10, size=sample_size)
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 29 09:55:00 2020

@author: Mhuth
"""
from causalgraphicalmodels import CausalGraphicalModel

# Variables in the causal graph.
nod = [
    'region',
    'oral',
    'sales bans',
    'time',
    'ideas about children',
    'controls',
]

# Directed (cause, effect) pairs between those variables.
ed = [
    ('region', 'oral'),
    ('region', 'sales bans'),
    ('region', 'ideas about children'),
    ('sales bans', 'oral'),
    ('time', 'oral'),
    ('time', 'ideas about children'),
    ('time', 'controls'),
    ('ideas about children', 'oral'),
]

# Build the DAG and render it (graphviz object; jupyter displays it inline).
cg1 = CausalGraphicalModel(nodes=nod, edges=ed)
cg1.draw()
# NOTE(review): truncated fragment — it opens mid-way through the node/edge
# arguments of a graph construction whose head precedes this view, and the
# trailing fit_regression definition is cut off (within this view `results`
# is never returned and the `plotting` parameter is unused, so the body
# clearly continues past the chunk). Code left byte-identical.
'utility_driving' ], edges=[ ("total_travel_time", "utility_driving"), ("total_travel_cost", "utility_driving"), ("total_travel_distance", "utility_driving"), ("household_size", "utility_driving"), # ("household_income", "utility_driving"), ("num_cars", "utility_driving"), ("cross_bay", "utility_driving"), ("total_travel_distance", "total_travel_time"), ("total_travel_distance", "total_travel_cost"), ]) # draw return a graphviz `dot` object, which jupyter can render drive_alone_graph.draw() # - # ## Distributional regression # Assume univariate linear approximation for the relationship between travel distance and travel time/cost. Turns out it's not a horrible assumption. def fit_regression(X, y, data, plotting=True): data_x = sm.add_constant(data[X]) data_y = data[y] model = sm.OLS(data_y, data_x) results = model.fit()
# NOTE(review): truncated fragment — it opens mid-way through an edge list
# (the graph construction assigned to `sprinkler` begins outside this view)
# and stops right after creating the empty MNL specification OrderedDicts.
# Code left byte-identical.
("Travel_Distance", "Travel_Cost"), ("Travel_Distance", "Cross_Bay_Bridge"), ("HH_Size", "Travel_Distance"), # ("Travel_Time", "Mode_Choice"), # ("Travel_Time", "Mode_Choice"), ] ) # draw return a graphviz `dot` object, which jupyter can render sprinkler.draw() # - # # MNL specification # + # Create my specification and variable names for the basic MNL model # NOTE: - Keys should be variables within the long format dataframe. # The sole exception to this is the "intercept" key. # - For the specification dictionary, the values should be lists # or lists of lists. Within a list, or within the inner-most # list should be the alternative ID's of the alternative whose # utility specification the explanatory variable is entering. mnl_specification = OrderedDict() mnl_names = OrderedDict()