import numpy as np
from scipy import stats


def chi2_test(data):
    """
    Test the null hypothesis that P(X,Y,Z) = P(Z)P(X|Z)P(Y|Z) versus the
    empirically observed P(X,Y,Z) in the data, using the traditional
    chi-square test based on observed versus expected frequency bins.

    Steps
        - Calculate the empirical and expected P(X,Y,Z)
        - Compute the ddof
        - Perform a one-way chi-square test

    Arguments
    ---------
    *data* : a nested numpy array
        The data from which to learn - must have at least three
        variables. All conditioned variables (i.e. Z) are compressed
        into one variable.

    Returns
    -------
    *chi2_statistic* : a float
        Chi-square statistic
    *p_val* : a float
        The p-value from the chi2 statistic and ddof

    Effects
    -------
    None

    Notes
    -----
    - Assuming for now that |Z| = 1... generalize later
    - Should generalize to let data be a Pandas DataFrame --> would
      encourage external use.
    """
    # compress extra Z variables at the start.. not implemented yet
    #bins = np.amax(data, axis=0)+1
    bins = unique_bins(data)
    hist, _ = np.histogramdd(data, bins=bins)  # frequency counts

    Pxyz = hist / hist.sum()        # joint probability distribution over X,Y,Z
    Pz = np.sum(Pxyz, axis=(0, 1))  # P(Z)
    Pxz = np.sum(Pxyz, axis=1)      # P(X,Z)
    Pyz = np.sum(Pxyz, axis=0)      # P(Y,Z)

    Px_z = Pxz / (Pz + 1e-7)        # P(X | Z) = P(X,Z) / P(Z)
    Py_z = Pyz / (Pz + 1e-7)        # P(Y | Z) = P(Y,Z) / P(Z)

    observed_dist = Pxyz            # empirical joint distribution

    # expected distribution under the null: P(Z)P(X|Z)P(Y|Z)
    Px_y_z = np.empty(Pxyz.shape)
    for i in xrange(bins[0]):
        for j in xrange(bins[1]):
            for k in xrange(bins[2]):
                Px_y_z[i][j][k] = Px_z[i][k] * Py_z[j][k]
    Px_y_z *= Pz                    # broadcasts P(Z) along the Z axis
    expected_dist = Px_y_z

    observed = observed_dist.flatten() * len(data)
    expected = expected_dist.flatten() * len(data)

    ddof = (bins[0] - 1) * (bins[1] - 1) * bins[2]
    chi2_statistic, p_val = stats.chisquare(observed, expected, ddof=ddof)

    return chi2_statistic, p_val
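# ---------------------------------------------------------------------
# Illustrative usage sketch for chi2_test (not part of the original
# module). It builds a small synthetic dataset in which X and Y are
# conditionally independent given Z and runs the test on it. The helper
# name _example_chi2_test and the data-generating scheme are hypothetical,
# added purely for illustration. Depending on the installed scipy version,
# stats.chisquare may warn that observed and expected sums differ slightly
# because of the 1e-7 smoothing used above.
# ---------------------------------------------------------------------
def _example_chi2_test():
    np.random.seed(0)
    n = 1000
    z = np.random.randint(0, 2, size=n)
    bx = (np.random.rand(n) < 0.25).astype(int)      # 25% noise
    by = (np.random.rand(n) < 0.25).astype(int)
    x = (z + bx) % 2                                  # X depends only on Z
    y = (z + by) % 2                                  # Y depends only on Z
    data = np.column_stack((x, y, z))
    chi2_statistic, p_val = chi2_test(data)
    print('chi2 statistic: %s' % chi2_statistic)
    print('p-value: %s' % p_val)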
def entropy(data):
    """
    In the context of structure learning, and more specifically
    in constraint-based algorithms which rely on the mutual information
    test for conditional independence, it has been proven that the variable
    X in a set which MAXIMIZES mutual information is also the variable which
    MINIMIZES entropy. This fact can be used to reduce the computational
    requirements of tests based on the following relationship:

        Entropy is related to marginal mutual information as follows:
            MI(X;Y) = H(X) - H(X|Y)

        Entropy is related to conditional mutual information as follows:
            MI(X;Y|Z) = H(X|Z) - H(X|Y,Z)

        For one variable, H(X) is equal to the following:
            -1 * sum of p(x) * log(p(x))

        For two variables, H(X|Y) is equal to the following:
            sum over x,y of p(x,y) * log(p(y)/p(x,y))

        For three variables, H(X|Y,Z) is equal to the following:
            -1 * sum of p(x,y,z) * log(p(x|y,z)),
            where p(x|y,z) = p(x,y,z)/p(y,z)

    Arguments
    ----------
    *data* : a nested numpy array
        The data from which to learn - must have at least three
        variables. All conditioned variables (i.e. Z) are compressed
        into one variable.

    Returns
    -------
    *H* : entropy value
    """
    try:
        cols = data.shape[1]
    except IndexError:
        cols = 1

    #bins = np.amax(data,axis=0)
    bins = unique_bins(data)

    if cols == 1:
        hist, _ = np.histogramdd(data, bins=(bins))  # frequency counts
        Px = hist / hist.sum()
        H = -1 * np.sum(Px * np.log(Px))

    elif cols == 2:
        # two variables -> assume X then Y
        hist, _ = np.histogramdd(data, bins=bins[0:2])  # frequency counts
        Pxy = hist / hist.sum()       # joint probability distribution over X,Y
        Py = np.sum(Pxy, axis=0)      # P(Y)
        Py += 1e-7
        Pxy += 1e-7
        H = np.sum(Pxy * np.log(Py / Pxy))

    else:
        # CHECK FOR > 3 COLUMNS -> concatenate Z into one column
        if cols > 3:
            data = data.astype('str')
            ncols = len(bins)
            for i in xrange(len(data)):
                data[i, 2] = ''.join(data[i, 2:ncols])
            data = data.astype('int')[:, 0:3]
            bins = unique_bins(data)  # re-read levels after compressing Z

        hist, _ = np.histogramdd(data, bins=bins)  # frequency counts
        Pxyz = hist / hist.sum()      # joint probability distribution over X,Y,Z
        Pyz = np.sum(Pxyz, axis=0)    # P(Y,Z)

        Pxyz += 1e-7                  # for log -inf
        Pyz += 1e-7
        # H(X|Y,Z) = H(X,Y,Z) - H(Y,Z)
        H = -1 * np.sum(Pxyz * np.log(Pxyz)) + np.sum(Pyz * np.log(Pyz))

    return round(H, 4)
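# ---------------------------------------------------------------------
# Illustrative sketch of the identity MI(X;Y) = H(X) - H(X|Y) that the
# docstring above relies on (not part of the original module). The helper
# name _example_entropy_identity and the synthetic data are hypothetical;
# the two printed values should agree approximately, up to smoothing and
# the 4-decimal rounding done inside entropy() and mutual_information().
# ---------------------------------------------------------------------
def _example_entropy_identity():
    np.random.seed(0)
    n = 5000
    x = np.random.randint(0, 3, size=n)
    y = (x + np.random.randint(0, 2, size=n)) % 3    # Y correlated with X
    xy = np.column_stack((x, y))
    h_x = entropy(x.reshape(-1, 1))                  # H(X)
    h_x_given_y = entropy(xy)                        # H(X|Y)
    mi = mutual_information(xy)                      # MI(X;Y)
    print('H(X) - H(X|Y) = %s' % (h_x - h_x_given_y))
    print('MI(X;Y)       = %s' % mi)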
def mutual_information(data, conditional=False):
    """
    Compute the mutual information MI(X;Y) between the first column (X)
    and the remaining columns (compressed into a single Y), or - if
    *conditional* is True - the conditional mutual information MI(X;Y|Z)
    between the first two columns given the remaining columns
    (compressed into a single Z). With a single column this reduces to
    the entropy H(X).
    """
    #bins = np.amax(data, axis=0)+1 # read levels for each variable
    bins = unique_bins(data)

    if len(bins) == 1:
        hist, _ = np.histogramdd(data, bins=(bins))  # frequency counts
        Px = hist / hist.sum()
        MI = -1 * np.sum(Px * np.log(Px))  # reduces to the entropy H(X)
        return round(MI, 4)

    if len(bins) == 2:
        hist, _ = np.histogramdd(data, bins=bins[0:2])  # frequency counts
        Pxy = hist / hist.sum()    # joint probability distribution over X,Y
        Px = np.sum(Pxy, axis=1)   # P(X)
        Py = np.sum(Pxy, axis=0)   # P(Y)

        PxPy = np.outer(Px, Py)
        Pxy += 1e-7
        PxPy += 1e-7
        MI = np.sum(Pxy * np.log(Pxy / PxPy))
        return round(MI, 4)

    elif len(bins) > 2 and conditional == True:
        # CHECK FOR > 3 COLUMNS -> concatenate Z into one column
        if len(bins) > 3:
            data = data.astype('str')
            ncols = len(bins)
            for i in xrange(len(data)):
                data[i, 2] = ''.join(data[i, 2:ncols])
            data = data.astype('int')[:, 0:3]
            bins = unique_bins(data)  # re-read levels after compressing Z

        hist, _ = np.histogramdd(data, bins=bins)  # frequency counts
        Pxyz = hist / hist.sum()        # joint probability distribution over X,Y,Z
        Pz = np.sum(Pxyz, axis=(0, 1))  # P(Z)
        Pxz = np.sum(Pxyz, axis=1)      # P(X,Z)
        Pyz = np.sum(Pxyz, axis=0)      # P(Y,Z)

        Pxy_z = Pxyz / (Pz + 1e-7)      # P(X,Y | Z) = P(X,Y,Z) / P(Z)
        Px_z = Pxz / (Pz + 1e-7)        # P(X | Z) = P(X,Z) / P(Z)
        Py_z = Pyz / (Pz + 1e-7)        # P(Y | Z) = P(Y,Z) / P(Z)

        Px_y_z = np.empty(Pxy_z.shape)  # P(X|Z)P(Y|Z)
        for i in xrange(bins[0]):
            for j in xrange(bins[1]):
                for k in xrange(bins[2]):
                    Px_y_z[i][j][k] = Px_z[i][k] * Py_z[j][k]

        Pxyz += 1e-7
        Pxy_z += 1e-7
        Px_y_z += 1e-7
        MI = np.sum(Pxyz * np.log(Pxy_z / Px_y_z))
        return round(MI, 4)

    elif len(bins) > 2 and conditional == False:
        # compress all columns after X into a single Y column
        data = data.astype('str')
        ncols = len(bins)
        for i in xrange(len(data)):
            data[i, 1] = ''.join(data[i, 1:ncols])
        data = data.astype('int')[:, 0:2]
        bins = unique_bins(data)  # re-read levels after compressing Y

        hist, _ = np.histogramdd(data, bins=bins[0:2])  # frequency counts
        Pxy = hist / hist.sum()   # joint probability distribution over X,Y
        Px = np.sum(Pxy, axis=1)  # P(X)
        Py = np.sum(Pxy, axis=0)  # P(Y)

        PxPy = np.outer(Px, Py)
        Pxy += 1e-7
        PxPy += 1e-7
        MI = np.sum(Pxy * np.log(Pxy / PxPy))
        return round(MI, 4)
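# ---------------------------------------------------------------------
# Illustrative sketch for mutual_information (not part of the original
# module): X and Y are both noisy copies of Z, so MI(X;Y) is typically
# noticeably above zero while MI(X;Y|Z) is typically close to zero.
# The helper name _example_conditional_mi and the data-generating scheme
# are hypothetical.
# ---------------------------------------------------------------------
def _example_conditional_mi():
    np.random.seed(0)
    n = 5000
    z = np.random.randint(0, 2, size=n)
    bx = (np.random.rand(n) < 0.25).astype(int)      # 25% noise
    by = (np.random.rand(n) < 0.25).astype(int)
    x = (z + bx) % 2
    y = (z + by) % 2
    data = np.column_stack((x, y, z))
    print('MI(X;Y)   = %s' % mutual_information(data[:, 0:2]))
    print('MI(X;Y|Z) = %s' % mutual_information(data, conditional=True))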
def fast_iamb(data, k=5, alpha=0.05, feature_selection=None, debug=False):
    """
    From [1]:
        "A novel algorithm for the induction of Markov blankets from
        data, called Fast-IAMB, that employs a heuristic to quickly
        recover the Markov blanket. Empirical results show that
        Fast-IAMB performs in many cases faster and more reliably than
        existing algorithms without adversely affecting the accuracy of
        the recovered Markov blankets."

    Arguments
    ---------
    *data* : a nested numpy array

    *k* : an integer
        The max number of edges to add at each iteration of
        the algorithm.

    *alpha* : a float
        Probability of Type I error

    Returns
    -------
    *bn* : a BayesNet object

    Effects
    -------
    None

    Notes
    -----
    - Currently does not work. I think it's stuck in an infinite loop...
    """
    # get values
    value_dict = dict(zip(range(data.shape[1]),
                          [list(np.unique(col)) for col in data.T]))
    # replace strings
    data = replace_strings(data)

    n_rv = data.shape[1]
    Mb = dict([(rv, []) for rv in range(n_rv)])
    N = data.shape[0]
    card = dict(zip(range(n_rv), unique_bins(data)))
    #card = dict(zip(range(data.shape[1]),np.amax(data,axis=0)))

    if feature_selection is None:
        _T = range(n_rv)
    else:
        assert (not isinstance(feature_selection, list)), \
            'feature_selection must be only one value'
        _T = [feature_selection]

    # LEARN MARKOV BLANKET
    for T in _T:
        S = set(range(n_rv)) - {T}
        for A in list(S):  # iterate over a copy - S is modified in the loop
            if not are_independent(data[:, (A, T)]):
                S.remove(A)
        s_h_dict = dict([(s, 0) for s in S])
        while S:
            insufficient_data = False
            break_grow_phase = False

            #### GROW PHASE ####
            # Calculate mutual information for all variables
            mi_dict = dict([(s, mi_test(data[:, (s, T) + tuple(Mb[T])]))
                            for s in S])
            for x_i in sorted(mi_dict, key=mi_dict.get, reverse=True):
                # Add top MI-score variables until there isn't enough data
                # for the bins (i.e. fewer than k samples per cell on average)
                if N / (card[x_i] * card[T] * np.prod([card[b] for b in Mb[T]])) >= k:
                    Mb[T].append(x_i)
                else:
                    insufficient_data = True
                    break

            #### SHRINK PHASE ####
            removed_vars = False
            for A in list(Mb[T]):  # iterate over a copy - Mb[T] may shrink
                cols = (A, T) + tuple(set(Mb[T]) - {A})
                # if A is independent of T given Mb[T], remove A
                if are_independent(data[:, cols]):
                    Mb[T].remove(A)
                    removed_vars = True

            #### FINALIZE BLANKET FOR "T" OR MAKE ANOTHER PASS ####
            if insufficient_data and not removed_vars:
                if debug:
                    print 'Breaking..'
                break
            else:
                A = set(range(n_rv)) - {T} - set(Mb[T])
                #A = set(nodes) - {T} - set(Mb[T])
                S = set()
                for a in A:
                    cols = (a, T) + tuple(Mb[T])
                    if are_independent(data[:, cols]):
                        S.add(a)
        if debug:
            print 'Done with %s' % T

    if feature_selection is None:
        # RESOLVE GRAPH STRUCTURE
        edge_dict = resolve_markov_blanket(Mb, data)
        # ORIENT EDGES
        oriented_edge_dict = orient_edges_MB(edge_dict, Mb, data, alpha)
        # CREATE BAYESNET OBJECT
        bn = BayesNet(oriented_edge_dict, value_dict)
        return bn
    else:
        return Mb[feature_selection]
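# ---------------------------------------------------------------------
# Illustrative call sketch for fast_iamb (not part of the original
# module). Per the Notes in the docstring above, the routine may not
# terminate on all inputs, so this only documents the intended
# interface; the file name and column index are hypothetical.
# ---------------------------------------------------------------------
# data = np.loadtxt('discrete_dataset.csv', delimiter=',', dtype='int32')
# bn = fast_iamb(data, k=5, alpha=0.05)            # learn the full structure
# mb_0 = fast_iamb(data, feature_selection=0)      # Markov blanket of column 0 only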
def mi_test(data, test=True):
    """
    This function performs a mutual information (cross entropy)-based
    independence test. Given at LEAST three columns it performs the
    CONDITIONAL test of X and Y given Z, where every column after the
    second is treated as part of Z; given EXACTLY two columns it
    performs the MARGINAL test of X and Y.

    We use the maximum likelihood estimators as probabilities. The
    mutual information value is computed, then the chi square test is
    used, with degrees of freedom equal to
        (|X|-1) * (|Y|-1) * Prod_{z in Z} |z|.

    This function works on datasets that contain MORE than three
    columns by concatenating the extra columns into one. For that
    reason, it is a little slower in that case.

    The marginal case is the same as calculating the KL divergence
    between the joint and the product of the marginals, i.e.
        I(X;Y) = sum over x,y of p(x,y) * log( p(x,y) / (p(x)*p(y)) )

    NOTE: pval < 0.05 means DEPENDENCE, pval > 0.05 means INDEPENDENCE.
    In other words, the pval represents the probability this relationship
    could have happened at random or by chance. If the pval is very
    small, it means the two variables are likely dependent on one
    another.

    Steps:
        - Calculate the marginal/conditional probabilities
        - Compute the Mutual Information value
        - Calculate chi2 statistic = 2*N*MI
        - Compute the degrees of freedom
        - Compute the chi square p-value

    Arguments
    ----------
    *data* : a nested numpy array
        The data from which to learn - must have at least two
        variables. All conditioned variables (i.e. Z) are compressed
        into one variable.

    Returns
    -------
    *p_val* : a float
        The p-value from the chi2 statistic and ddof

    Effects
    -------
    None

    Notes
    -----
    - Doesn't currently work with strings...
    - Should generalize to let data be a Pandas DataFrame --> would
      encourage external use.
    """
    #bins = np.amax(data, axis=0)+1 # read levels for each variable
    bins = unique_bins(data)

    if len(bins) == 2:
        hist, _ = np.histogramdd(data, bins=bins[0:2])  # frequency counts
        #Pxy = hist / hist.sum()  # joint probability distribution over X,Y
        Pxy = hist / data.shape[0]
        Px = np.sum(Pxy, axis=1)  # P(X)
        Py = np.sum(Pxy, axis=0)  # P(Y)

        PxPy = np.outer(Px, Py)
        Pxy += 1e-7
        PxPy += 1e-7
        MI = np.sum(Pxy * np.log(Pxy / PxPy))
        if not test:
            return round(MI, 4)
        else:
            chi2_statistic = 2 * len(data) * MI
            ddof = (bins[0] - 1) * (bins[1] - 1)
            # p-value = P(chi2 >= statistic) under the null (upper tail)
            p_val = stats.chi2.sf(chi2_statistic, ddof)
            return round(p_val, 4)
    else:
        # CHECK FOR > 3 COLUMNS -> concatenate Z into one column
        if len(bins) > 3:
            data = data.astype('str')
            ncols = len(bins)
            for i in xrange(len(data)):
                data[i, 2] = ''.join(data[i, 2:ncols])
            data = data.astype('int')[:, 0:3]

        #bins = np.amax(data,axis=0)
        bins = unique_bins(data)
        hist, _ = np.histogramdd(data, bins=bins)  # frequency counts

        #Pxyz = hist / hist.sum()  # joint probability distribution over X,Y,Z
        Pxyz = hist / data.shape[0]
        Pz = np.sum(Pxyz, axis=(0, 1))  # P(Z)
        Pxz = np.sum(Pxyz, axis=1)      # P(X,Z)
        Pyz = np.sum(Pxyz, axis=0)      # P(Y,Z)

        Pxy_z = Pxyz / (Pz + 1e-7)      # P(X,Y | Z) = P(X,Y,Z) / P(Z)
        Px_z = Pxz / (Pz + 1e-7)        # P(X | Z) = P(X,Z) / P(Z)
        Py_z = Pyz / (Pz + 1e-7)        # P(Y | Z) = P(Y,Z) / P(Z)

        Px_y_z = np.empty(Pxy_z.shape)  # P(X|Z)P(Y|Z)
        for i in xrange(bins[0]):
            for j in xrange(bins[1]):
                for k in xrange(bins[2]):
                    Px_y_z[i][j][k] = Px_z[i][k] * Py_z[j][k]

        Pxyz += 1e-7
        Pxy_z += 1e-7
        Px_y_z += 1e-7
        MI = np.sum(Pxyz * np.log(Pxy_z / Px_y_z))
        if not test:
            return round(MI, 4)
        else:
            chi2_statistic = 2 * len(data) * MI
            ddof = (bins[0] - 1) * (bins[1] - 1) * bins[2]
            # p-value = P(chi2 >= statistic) under the null (upper tail)
            p_val = stats.chi2.sf(chi2_statistic, ddof)
            return round(p_val, 4)